Completed HW3: Intermediate SQL

2025-10-07 21:01:20 -04:00
parent a21adf25be
commit 062e21b33e
3 changed files with 1537 additions and 0 deletions
--- a/Intermediate-SQL/hw3-autograder.py
+++ b/Intermediate-SQL/hw3-autograder.py
@@ -0,0 +1,235 @@
 """Autograder for Postgres queries assignment.
 Author: CS374 Faculty
 Version: 09/30/2025
 """
 import difflib
 import psycopg
 import re
 import socket
 # How many "Query #N" sections the assignment must contain
 QUERIES = 8
 # Choose the database host: use the campus server when its name resolves
 # (per the original note, this distinguishes on-campus from off-campus),
 # and fall back to localhost when the lookup fails
 try:
    socket.gethostbyname("data.cs.jmu.edu")
 except socket.gaierror:
    HOST = "localhost"
 else:
    HOST = "data.cs.jmu.edu"
 def connect(dbname):
    """Connect to the database and create a cursor.
    The new connection and cursor are stored in the module-level globals
    `con` and `cur`. A connection left open by an earlier call is closed
    first, so that reconnecting (done before every query) does not leak
    one server connection per call.
    Args:
        dbname: The name of the database to use.
    """
    global con, cur
    # The global does not exist yet on the very first call
    previous = globals().get("con")
    if previous is not None and not previous.closed:
        previous.close()
    con = psycopg.connect(host=HOST, user="demo", password="demo", dbname=dbname)
    cur = con.cursor()
 def assert_eq(actual, expect, message=""):
    """Assert whether two values are equal (custom feedback).
    The check is an explicit raise rather than an `assert` statement, so it
    still runs when Python is started with -O (which strips asserts).
    Args:
        actual: The value produced by the code being tested.
        expect: The expected value to compare with `actual`.
        message: Text to display if AssertionError is raised.
    Raises:
        AssertionError: If `actual` is not equal to `expect`,
                        with a message displaying both values.
    """
    def _display(value):
        # Abbreviate long strings; convert anything else to a simple string.
        # Each value is formatted on its own, so a str/non-str pair still
        # produces an AssertionError instead of failing inside len().
        if isinstance(value, str):
            return value if len(value) <= 120 else value[:120] + "..."
        return str(value)

    if not actual == expect:
        raise AssertionError(
            f"{message}\n  Actual: {_display(actual)}\n  Expect: {_display(expect)}"
        )
 def res2str(res):
    """Render query results as tab-separated text, one row per line.
    Args:
        res (list of tuples): Results obtained from fetchall().
    Returns:
        str: Rows joined by newlines; within a row the values are joined
             by tabs, and None values become empty strings.
    """
    lines = []
    for row in res:
        cells = ["" if value is None else str(value) for value in row]
        lines.append("\t".join(cells))
    return "\n".join(lines)
 def run_query(sql, txt, qno):
    """Run the query and compare with expected output.
    A chunk is handled in one of three ways. A numbered query (truthy `qno`)
    runs on a fresh connection to the current database. A chunk containing
    the psql connect meta-command only switches databases and returns. Any
    other chunk is run against the "postgres" database.
    Args:
        sql (str): The sql chunk from the HW file.
        txt (str): The expected output of the sql.
        qno (int): The query number being tested.
    Raises:
        RuntimeError: If the echo label or the query results are missing,
                      if there are extra results, or if a chunk without
                      expected output produces results.
        AssertionError: If the query label, schema, row count, or any row
                        differs from the expected output (via assert_eq).
    """
    # Print status message for autograder feedback
    if qno:
        print(f"Running Query #{qno}...")
        # Reset connection in case of previous error or timeout
        con.cancel_safe()
        connect(con.info.dbname)
    elif "\\c" in sql:
        # The database name is the rest of the line after the meta-command
        beg = sql.find("\\c")
        end = sql.find("\n", beg)
        dbname = sql[beg + 3 : end]
        print("Connecting to", dbname)
        connect(dbname)
        return
    else:
        print("Running header comment")
        connect("postgres")
    # Execute the chunk, convert results to text
    results = []
    if "\\echo" in sql:
        # Echo meta-commands are not SQL: comment them out before executing,
        # and keep the text of the first one as the query label
        sql = sql.replace("\\echo", "--\\echo")
        beg = sql.find("\\echo")
        end = sql.find("\n", beg)
        name = sql[beg + 6 : end]
        results.append(name)
    res = cur.execute(sql)
    # Only statements that return a result set have a description; checking
    # rowcount alone would also accept statements without one and then fail
    # while reading the column names
    if res.description is not None:
        column_names = [desc[0] for desc in res.description]
        schema = "\t".join(column_names)
        # Add the trailing newline before the integer hack so that the last
        # row is normalized like every other row
        output = res2str(res.fetchall()) + "\n"
        output = output.replace(".0\n", "\n")  # integer hack
        footer = f"({res.rowcount} rows)\n"
        # NOTE(review): an empty result still yields one blank line between
        # schema and footer, which counts as a row below -- confirm against
        # the expected-output format
        results.append(schema + "\n" + output + footer)
    # Compare with expected output, if applicable
    if txt:
        if len(results) == 0:
            raise RuntimeError(f"Missing output of \\echo Query #{qno}")
        # 1st line blank, 2nd line "Query #"
        actual = results[0]
        expect = txt.splitlines()
        assert_eq(actual, expect[1], "Incorrect query number")
        # Check the number of queries run
        if len(results) == 1:
            raise RuntimeError("No results (code is blank)")
        if len(results) > 2:
            raise RuntimeError("Extra results (more than one query)")
        # Calculate similarity percentage
        actual = "\n" + results[0] + "\n" + results[1]
        seq = difflib.SequenceMatcher(None, actual, txt)
        sim = int(seq.ratio() * 100)
        print(f"Output matches {sim}%")
        # Compare schema and row count
        actual = results[1].rstrip().splitlines()
        expect = expect[2:]
        assert_eq(actual[0], expect[0], "Incorrect schema")
        # Both lists hold the schema first and the footer last
        a_rows = len(actual) - 2
        e_rows = len(expect) - 2
        assert_eq(a_rows, e_rows, "Incorrect row count")
        # Compare each row of the results; rows occupy indexes 1..a_rows,
        # so the upper bound must include a_rows itself
        for i in range(1, a_rows + 1):
            assert_eq(actual[i], expect[i], f"Row {i} does not match")
    # No output expected (not a SELECT)
    elif results:
        raise RuntimeError(f"Results should be empty: {results}")
 def split_file(path):
    """Cut a text file into consecutive chunks, one per query section.
    A new chunk begins at every match of the section pattern: either a
    dashed comment rule followed by another comment line, or a line
    containing "Query #<number>" (optionally preceded by "--" and/or a
    newline). The chunks concatenate back to the original text.
    Args:
        path (str): The path of the text file to split.
    Returns:
        list of str: The code or output for each query.
    """
    with open(path) as handle:
        contents = handle.read()
    boundary = re.compile(r"^-- -+\n-- |^(--)?\n?.*Query #\d+", re.MULTILINE)
    # Offsets where each section starts, framed by the start and end of the text
    starts = [found.start() for found in boundary.finditer(contents)]
    edges = [0, *starts, len(contents)]
    # Slice between neighboring offsets; the last slice is the final query
    return [contents[lo:hi] for lo, hi in zip(edges, edges[1:])]
 def main(sql_file, txt_file, g_scope=False):
    """Split both files into chunks, validate them, and run every chunk.
    Args:
        sql_file (str): Path to the sql script file.
        txt_file (str): Path to the expected output.
        g_scope (bool): True if running on Gradescope.
    Returns:
        tuple or None: queries and outputs (for Gradescope)
    Raises:
        RuntimeError: If a file doesn't split correctly.
    """
    marker = "Query #"
    # The script must contain exactly the expected number of queries
    queries = split_file(sql_file)
    q_count = len([chunk for chunk in queries if marker in chunk])
    if q_count != QUERIES:
        raise RuntimeError(f"Expected {QUERIES} queries, but {q_count} were found.")
    # Same check for the expected output, after dropping the text that
    # precedes the first query (a blank string)
    outputs = split_file(txt_file)
    outputs.pop(0)
    o_count = len([chunk for chunk in outputs if marker in chunk])
    if o_count != QUERIES:
        raise RuntimeError(f"Expected {QUERIES} outputs, but {o_count} were found.")
    # Gradescope only needs the validated chunks
    if g_scope:
        return queries, outputs
    # Run the chunks in file order, pairing each query with its output
    qno = 0
    for sql in queries:
        is_query = marker in sql
        try:
            if is_query:
                qno += 1
                run_query(sql, outputs[qno - 1], qno)
            else:
                # Ex: meta-command
                run_query(sql, None, None)
        except Exception as e:
            # Report assertion, psycopg, and runtime errors, then keep going
            print(f"{type(e).__name__}: {e}")
        print()
    # Sanity check on the number of queries attempted
    if qno != QUERIES:
        print(f"Error: Something went wrong. {qno} of {QUERIES} queries were run.")
 # When executed as a script, grade hw3.sql against hw3-sol.txt; both paths
 # are relative to the current working directory
 if __name__ == "__main__":
    main("hw3.sql", "hw3-sol.txt")
--- a/Intermediate-SQL/hw3-sol.txt
+++ b/Intermediate-SQL/hw3-sol.txt
--- a/Intermediate-SQL/hw3.sql
+++ b/Intermediate-SQL/hw3.sql
@@ -0,0 +1,163 @@
 --
 -- Name: Nicholas Tamassia
 --
 -- Write your queries below each comment. Please use good style (one clause
 -- per line, JOIN syntax, indentation) and make sure all queries end with a
 -- semicolon. When necessary, limit the output to the first 200 results.
 --
 -- DO NOT MODIFY OR DELETE ANY OTHER LINES!
 --
 -- -----------------------------------------------------------------------------
 -- Connect to tpch database
 \c tpch
 -- -----------------------------------------------------------------------------
 --
 \echo
 \echo Query #1
 --
 -- Show the customer name, order date, and line items for order number 3.
 --
 -- Schema: c_name, o_orderdate, l_partkey, l_suppkey, l_quantity, l_extendedPrice
 --  Order: l_linenumber
 SELECT
     c_name,
     o_orderdate,
     l_partkey,
     l_suppkey,
     l_quantity,
     l_extendedPrice
 FROM customer
     JOIN orders ON o_custkey = c_custkey
     JOIN lineitem ON l_orderkey = o_orderkey
 WHERE o_orderkey = 3
 ORDER BY l_linenumber
 LIMIT 200;
 --
 \echo
 \echo Query #2
 --
 -- Show the part name, supply cost, and retail price of each line item.
 --
 -- Schema: l_orderkey, l_partkey, l_suppkey, p_name, ps_supplycost, p_retailprice
 --  Order: l_orderkey, l_linenumber
 SELECT
     l_orderkey,
     l_partkey,
     l_suppkey,
     p_name,
     ps_supplycost,
     p_retailprice
 FROM lineitem
     -- partsupp is matched on both the part and the supplier of the line item
     JOIN partsupp
         ON ps_partkey = l_partkey
         AND ps_suppkey = l_suppkey
     JOIN part
         ON p_partkey = ps_partkey
 ORDER BY
     l_orderkey,
     l_linenumber
 LIMIT 200;
 --
 \echo
 \echo Query #3
 --
 -- Which customers with an account balance over 5000 have no orders?
 --
 -- Schema: c_name, c_acctbal
 --  Order: c_custkey
 SELECT
     c_name,
     c_acctbal
 FROM customer
     -- Outer join so customers without orders survive with NULL order columns
     LEFT JOIN orders
         ON o_custkey = c_custkey
 WHERE o_orderkey IS NULL
     AND c_acctbal > 5000
 ORDER BY c_custkey
 LIMIT 200;
 --
 \echo
 \echo Query #4
 --
 -- Which urgent priority orders have only one line item?
 --
 -- Schema: o_orderkey, o_custkey, o_orderstatus
 --  Order: o_orderkey
 SELECT
     o_orderkey,
     o_custkey,
     o_orderstatus
 FROM orders
     JOIN lineitem
         ON l_orderkey = o_orderkey
 WHERE o_orderpriority = '1-URGENT'
 GROUP BY o_orderkey
 -- One joined row per line item, so a count of 1 means a single line item
 HAVING COUNT(l_orderkey) = 1
 ORDER BY o_orderkey
 LIMIT 200;
 --
 \echo
 \echo Query #5
 --
 -- What parts containing the word "chocolate" were ordered on or after
 -- August 1, 1998 and shipped by mail?
 --
 -- Schema: p_name, ps_supplycost, l_quantity
 --  Order: p_partkey
 SELECT
     p_name,
     ps_supplycost,
     l_quantity
 FROM part
     JOIN partsupp
         ON ps_partkey = p_partkey
     JOIN lineitem
         ON l_partkey = ps_partkey
         AND l_suppkey = ps_suppkey
     JOIN orders
         ON o_orderkey = l_orderkey
 WHERE p_name LIKE '%chocolate%'
     AND o_orderdate >= '1998-08-01'
     AND l_shipmode = 'MAIL'
 ORDER BY p_partkey
 LIMIT 200;
 --
 \echo
 \echo Query #6
 --
 -- Get the min, max, and average supply cost of each part. The average supply cost
 -- must be rounded to 2 decimal places.
 --
 -- Schema: p_name, p_retailprice, min_supplycost, max_supplycost, avg_supplycost
 --  Order: p_partkey
 SELECT
     p_name,
     p_retailprice,
     MIN(ps_supplycost) AS min_supplycost,
     MAX(ps_supplycost) AS max_supplycost,
     ROUND(AVG(ps_supplycost), 2) AS avg_supplycost
 FROM part
     JOIN partsupp
         ON ps_partkey = p_partkey
 GROUP BY p_partkey
 ORDER BY p_partkey
 LIMIT 200;
 --
 \echo
 \echo Query #7
 --
 -- What suppliers have an inventory (total available quantity) of over 125,000
 -- parts with size over 40.
 --
 -- Schema: r_name, n_name, s_name, total_qty
 --  Order: r_name, n_name, s_name
 SELECT r_name, n_name, s_name, SUM(ps_availqty) AS "total_qty"
 FROM supplier
 JOIN partsupp ON s_suppkey = ps_suppkey
 JOIN part ON ps_partkey = p_partkey
 JOIN nation ON s_nationkey = n_nationkey
 JOIN region ON n_regionkey = r_regionkey
 WHERE p_size > 40
 -- Include the supplier key in the grouping so that two suppliers sharing
 -- a name within one nation are totaled separately instead of merged
 GROUP BY s_suppkey, s_name, n_name, r_name
 HAVING SUM(ps_availqty) > 125000
 ORDER BY r_name, n_name, s_name
 LIMIT 200;
 --
 \echo
 \echo Query #8
 --
 -- Find orders with a total profit of over $375,000. The profit of a single
 -- lineitem is defined as quantity * (retail price - supply cost). The profit
 -- of an order is the sum of its lineitem profits. For each order, show also
 -- the number of lineitems, the total quantity of items (rounded to 0 decimal
 -- places), and the total profit of the order (rounded to 2 decimal places).
 --
 -- Schema: l_orderkey, num_lineitems, total_quantity, total_profit
 --  Order: total_profit (descending), total_quantity
 SELECT
     l_orderkey,
     COUNT(l_quantity) AS num_lineitems,
     ROUND(SUM(l_quantity)) AS total_quantity,
     ROUND(SUM(l_quantity * (p_retailprice - ps_supplycost)), 2) AS total_profit
 FROM orders
     JOIN lineitem
         ON l_orderkey = o_orderkey
     JOIN partsupp
         ON ps_partkey = l_partkey
         AND ps_suppkey = l_suppkey
     JOIN part
         ON p_partkey = ps_partkey
 GROUP BY l_orderkey
 -- Filter on the unrounded profit; rounding applies only to the output
 HAVING SUM(l_quantity * (p_retailprice - ps_supplycost)) > 375000
 ORDER BY
     total_profit DESC,
     total_quantity
 LIMIT 200;