Completed HW3: Intermediate SQL

2025-10-07 21:01:20 -04:00
parent a21adf25be
commit 062e21b33e
3 changed files with 1537 additions and 0 deletions
@@ -0,0 +1,235 @@
+"""Autograder for Postgres queries assignment.
+
+Author: CS374 Faculty
+Version: 09/30/2025
+"""
+
+import difflib
+import psycopg
+import re
+import socket
+
+# The expected number of queries on this assignment
+QUERIES = 8
+
+# Determine whether connecting from on/off campus
+try:
+    socket.gethostbyname("data.cs.jmu.edu")
+    HOST = "data.cs.jmu.edu"
+except socket.gaierror:
+    HOST = "localhost"
+
+
+def connect(dbname):
+    """Connect to the database and create a cursor.
+
+    Args:
+        dbname: The name of the database to use.
+    """
+    global con, cur
+    con = psycopg.connect(host=HOST, user="demo", password="demo", dbname=dbname)
+    cur = con.cursor()
+
+
+def assert_eq(actual, expect, message=""):
+    """Assert whether two values are equal (custom feedback).
+
+    Args:
+        actual: The value produced by the code being tested.
+        expect: The expected value to compare with `actual`.
+        message: Text to display if AssertionError is raised.
+
+    Raises:
+        AssertionError: If `actual` is not equal to `expect`,
+                        with a message displaying both values.
+    """
+    if type(actual) is str:
+        # Abbreviate output if too long
+        a_str = actual if len(actual) <= 120 else actual[:120] + "..."
+        e_str = expect if len(expect) <= 120 else expect[:120] + "..."
+    else:
+        # Convert to simple strings
+        a_str = str(actual)
+        e_str = str(expect)
+
+    assert actual == expect, f"{message}\n  Actual: {a_str}\n  Expect: {e_str}"
+
+
+def res2str(res):
+    """Convert query results into a multiline string.
+
+    Args:
+        res (list of tuples): Results obtained from fetchall().
+
+    Returns:
+        str: Each line is one row with values separated by tabs.
+    """
+    return "\n".join(
+        ["\t".join(map(lambda x: "" if x is None else str(x), tup)) for tup in res]
+    )
+
+
+def run_query(sql, txt, qno):
+    """Run the query and compare with expected output.
+
+    Args:
+        sql (str): The sql chunk from the HW file.
+        txt (str): The expected output of the sql.
+        qno (int): The query number being tested.
+
+    Raises:
+        RuntimeError: If incorrect number of queries.
+    """
+
+    # Print status message for autograder feedback
+    if qno:
+        print(f"Running Query #{qno}...")
+        # Reset connection in case of previous error or timeout
+        con.cancel_safe()
+        connect(con.info.dbname)
+    elif "\\c" in sql:
+        beg = sql.find("\\c")
+        end = sql.find("\n", beg)
+        dbname = sql[beg + 3 : end]
+        print("Connecting to", dbname)
+        connect(dbname)
+        return
+    else:
+        print("Running header comment")
+        connect("postgres")
+
+    # Execute the chunk, convert results to text
+    results = []
+    if "\\echo" in sql:
+        sql = sql.replace("\\echo", "--\\echo")
+        beg = sql.find("\\echo")
+        end = sql.find("\n", beg)
+        name = sql[beg + 6 : end]
+        results.append(name)
+    res = cur.execute(sql)
+    if res.rowcount > -1:
+        column_names = [desc[0] for desc in res.description]
+        schema = "\t".join(column_names)
+        output = res2str(res.fetchall())
+        output = output.replace(".0\n", "\n")  # integer hack
+        footer = f"({res.rowcount} rows)\n"
+        results.append(schema + "\n" + output + "\n" + footer)
+
+    # Compare with expected output, if applicable
+    if txt:
+        if len(results) == 0:
+            raise RuntimeError(f"Missing output of \\echo Query #{qno}")
+
+        # 1st line blank, 2nd line "Query #"
+        actual = results[0]
+        expect = txt.splitlines()
+        assert_eq(actual, expect[1], "Incorrect query number")
+
+        # Check the number of queries run
+        if len(results) == 1:
+            raise RuntimeError("No results (code is blank)")
+        if len(results) > 2:
+            raise RuntimeError("Extra results (more than one query)")
+
+        # Calculate similarity percentage
+        actual = "\n" + results[0] + "\n" + results[1]
+        seq = difflib.SequenceMatcher(None, actual, txt)
+        sim = int(seq.ratio() * 100)
+        print(f"Output matches {sim}%")
+
+        # Compare schema and row count
+        actual = results[1].rstrip().splitlines()
+        expect = expect[2:]
+        assert_eq(actual[0], expect[0], "Incorrect schema")
+        a_rows = len(actual) - 2
+        e_rows = len(expect) - 2
+        assert_eq(a_rows, e_rows, "Incorrect row count")
+
+        # Compare each row of the results
+        for i in range(1, a_rows):
+            assert_eq(actual[i], expect[i], f"Row {i} does not match")
+
+    # No output expected (not a SELECT)
+    elif results:
+        raise RuntimeError(f"Results should be empty: {results}")
+
+
+def split_file(path):
+    """Split a text file into chunks by query number.
+
+    Args:
+        path (str): The path of the text file to split.
+
+    Returns:
+        list of str: The code or output for each query.
+    """
+
+    # Read the file contents
+    beg = 0
+    chunks = []
+    with open(path) as file:
+        text = file.read()
+
+    # Extract the text before each query
+    pattern = re.compile(r"^-- -+\n-- |^(--)?\n?.*Query #\d+", re.MULTILINE)
+    for match in re.finditer(pattern, text):
+        end = match.start()
+        chunks.append(text[beg:end])
+        beg = end
+
+    # Append the text of the final query
+    chunks.append(text[beg:])
+    return chunks
+
+
+def main(sql_file, txt_file, g_scope=False):
+    """Split the given files and execute each query.
+
+    Args:
+        sql_file (str): Path to the sql script file.
+        txt_file (str): Path to the expected output.
+        g_scope (bool): True if running on Gradescope.
+
+    Returns:
+        tuple or None: queries and outputs (for Gradescope)
+
+    Raises:
+        RuntimeError: If a file doesn't split correctly.
+    """
+
+    # Split and validate the given files
+    queries = split_file(sql_file)
+    q_count = sum(1 for s in queries if "Query #" in s)
+    if q_count != QUERIES:
+        raise RuntimeError(f"Expected {QUERIES} queries, but {q_count} were found.")
+    outputs = split_file(txt_file)
+    del outputs[0]  # Blank string
+    o_count = sum(1 for s in outputs if "Query #" in s)
+    if o_count != QUERIES:
+        raise RuntimeError(f"Expected {QUERIES} outputs, but {o_count} were found.")
+
+    # Gradescope skips the rest of main()
+    if g_scope:
+        return queries, outputs
+
+    # Execute each chunk of sql in order
+    qno = 0
+    for sql in queries:
+        try:
+            if "Query #" in sql:
+                qno += 1
+                run_query(sql, outputs[qno-1], qno)
+            else:
+                run_query(sql, None, None)  # Ex: meta-command
+        except Exception as e:
+            # Assertion or psycopg or Runtime error
+            print(type(e).__name__ + ":", e)
+        print()
+
+    # That's all folks!
+    if qno != QUERIES:
+        print(f"Error: Something went wrong. {qno} of {QUERIES} queries were run.")
+
+
+if __name__ == "__main__":
+    main("hw3.sql", "hw3-sol.txt")
@@ -0,0 +1,163 @@
+--
+-- Name: Nicholas Tamassia
+--
+-- Write your queries below each comment. Please use good style (one clause
+-- per line, JOIN syntax, indentation) and make sure all queries end with a
+-- semicolon. When necessary, limit the output to the first 200 results.
+--
+-- DO NOT MODIFY OR DELETE ANY OTHER LINES!
+--
+
+-- -----------------------------------------------------------------------------
+-- Connect to tpch database
+\c tpch
+-- -----------------------------------------------------------------------------
+
+--
+\echo
+\echo Query #1
+--
+-- Show the customer name, order date, and line items for order number 3.
+--
+-- Schema: c_name, o_orderdate, l_partkey, l_suppkey, l_quantity, l_extendedPrice
+--  Order: l_linenumber
+
+SELECT c_name, o_orderdate, l_partkey, l_suppkey, l_quantity, l_extendedPrice
+FROM customer
+JOIN orders ON c_custkey = o_custkey
+JOIN lineitem ON o_orderkey = l_orderkey
+WHERE o_orderkey = 3
+ORDER BY l_linenumber
+LIMIT 200;
+
+--
+\echo
+\echo Query #2
+--
+-- Show the part name, supply cost, and retail price of each line item.
+--
+-- Schema: l_orderkey, l_partkey, l_suppkey, p_name, ps_supplycost, p_retailprice
+--  Order: l_orderkey, l_linenumber
+
+SELECT l_orderkey, l_partkey, l_suppkey, p_name, ps_supplycost, p_retailprice
+FROM lineitem
+JOIN partsupp ON (l_partkey = ps_partkey AND l_suppkey = ps_suppkey)
+JOIN part ON ps_partkey = p_partkey
+ORDER BY l_orderkey, l_linenumber
+LIMIT 200;
+
+--
+\echo
+\echo Query #3
+--
+-- Which customers with an account balance over 5000 have no orders?
+--
+-- Schema: c_name, c_acctbal
+--  Order: c_custkey
+
+SELECT c_name, c_acctbal
+FROM customer
+LEFT JOIN orders ON c_custkey = o_custkey
+WHERE c_acctbal > 5000 AND o_orderkey is NULL
+ORDER BY c_custkey
+LIMIT 200;
+
+--
+\echo
+\echo Query #4
+--
+-- Which urgent priority orders have only one line item?
+--
+-- Schema: o_orderkey, o_custkey, o_orderstatus
+--  Order: o_orderkey
+
+SELECT o_orderkey, o_custkey, o_orderstatus
+FROM orders
+JOIN lineitem ON o_orderkey = l_orderkey
+WHERE o_orderpriority = '1-URGENT'
+GROUP BY o_orderkey
+HAVING COUNT(l_orderkey) = 1
+ORDER BY o_orderkey
+LIMIT 200;
+
+--
+\echo
+\echo Query #5
+--
+-- What parts containing the word "chocolate" were ordered on or after
+-- August 1, 1998 and shipped by mail?
+--
+-- Schema: p_name, ps_supplycost, l_quantity
+--  Order: p_partkey
+
+SELECT p_name, ps_supplycost, l_quantity
+FROM part
+JOIN partsupp ON p_partkey = ps_partkey
+JOIN lineitem ON (ps_partkey = l_partkey AND ps_suppkey = l_suppkey)
+JOIN orders ON l_orderkey = o_orderkey
+WHERE p_name LIKE '%chocolate%' AND o_orderdate >= '1998-08-01' AND l_shipmode = 'MAIL'
+ORDER BY p_partkey
+LIMIT 200;
+
+--
+\echo
+\echo Query #6
+--
+-- Get the min, max, and average supply cost of each part. The average supply cost
+-- must be rounded to 2 decimal places.
+--
+-- Schema: p_name, p_retailprice, min_supplycost, max_supplycost, avg_supplycost
+--  Order: p_partkey
+
+SELECT p_name, p_retailprice, MIN(ps_supplycost) AS "min_supplycost", MAX(ps_supplycost) AS "max_supplycost", ROUND(AVG(ps_supplycost), 2) as "avg_supplycost"
+FROM part
+JOIN partsupp on p_partkey = ps_partkey
+GROUP BY p_partkey
+ORDER BY p_partkey
+LIMIT 200;
+
+
+--
+\echo
+\echo Query #7
+--
+-- What suppliers have an inventory (total available quantity) of over 125,000
+-- parts with size over 40.
+--
+-- Schema: r_name, n_name, s_name, total_qty
+--  Order: r_name, n_name, s_name
+
+SELECT r_name, n_name, s_name, SUM(ps_availqty) AS "total_qty"
+FROM supplier
+JOIN partsupp ON s_suppkey = ps_suppkey
+JOIN part ON ps_partkey = p_partkey
+JOIN nation ON s_nationkey = n_nationkey
+JOIN region ON n_regionkey = r_regionkey
+WHERE p_size > 40
+GROUP BY s_name, n_name, r_name
+HAVING SUM(ps_availqty) > 125000
+ORDER BY r_name, n_name, s_name
+LIMIT 200;
+
+--
+\echo
+\echo Query #8
+--
+-- Find orders with a total profit of over $375,000. The profit of a single
+-- lineitem is defined as quantity * (retail price - supply cost). The profit
+-- of an order is the sum of its lineitem profits. For each order, show also
+-- the number of lineitems, the total quantity of items (rounded to 0 decimal
+-- places), and the total profit of the order (rounded to 2 decimal places).
+--
+-- Schema: l_orderkey, num_lineitems, total_quantity, total_profit
+--  Order: total_profit (descending), total_quantity
+
+SELECT l_orderkey, COUNT(l_quantity) AS "num_lineitems", ROUND(SUM(l_quantity)) AS "total_quantity", ROUND(SUM(l_quantity * (p_retailprice - ps_supplycost)), 2) AS "total_profit"
+FROM orders
+JOIN lineitem ON o_orderkey = l_orderkey
+JOIN partsupp ON (l_partkey = ps_partkey AND l_suppkey = ps_suppkey)
+JOIN part ON ps_partkey = p_partkey
+GROUP BY l_orderkey
+HAVING SUM(l_quantity * (p_retailprice - ps_supplycost)) > 375000
+ORDER BY "total_profit" DESC, "total_quantity"
+LIMIT 200;