Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,17 @@ Given this structure:
Lesson 1/
Exercise1/
solution/
L1 Exercise 1 Solutions DRAFT.ipynb
L1 Exercise 1 Solutions.ipynb
```

The script will create:
```
Lesson 1/
Exercise1/
solution/
L1 Exercise 1 Solutions DRAFT.ipynb
L1 Exercise 1 Solutions.ipynb
starter/
L1 Exercise 1 Starter DRAFT.ipynb ← Generated!
L1 Exercise 1 Starter.ipynb ← Generated!
```

## Requirements
Expand Down
70 changes: 70 additions & 0 deletions Exercise Utils/audit_log.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
Notebook Audit Report — 2026-03-16 11:05:13
======================================================================

Pair: Lesson 1\Exercise1-Identifying-Data-Information-and-Knowledge\solution\L1 Exercise 1 Solutions.ipynb
vs: Lesson 1\Exercise1-Identifying-Data-Information-and-Knowledge\starter\L1 Exercise 1 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 1\Exercise2-Data-Metadata-Query-Engines\solution\L1 Exercise 2 Solutions.ipynb
vs: Lesson 1\Exercise2-Data-Metadata-Query-Engines\starter\L1 Exercise 2 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 1\Exercise3-OLTP-OLAP-Use-Cases\solution\L1 Exercise 3 Solutions.ipynb
vs: Lesson 1\Exercise3-OLTP-OLAP-Use-Cases\starter\L1 Exercise 3 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 1\Exercise4-Structured-and-Unstructured-Data\solution\L1 Exercise 4 Solutions.ipynb
vs: Lesson 1\Exercise4-Structured-and-Unstructured-Data\starter\L1 Exercise 4 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 1\Exercise5-Visualizing-a-Graph-with-Python\solution\L1 Exercise 5 Solutions.ipynb
vs: Lesson 1\Exercise5-Visualizing-a-Graph-with-Python\starter\L1 Exercise 5 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 2\Exercise1-Testing-Transactions-in-PostgreSQL\solution\L2 Exercise 1 Solutions.ipynb
vs: Lesson 2\Exercise1-Testing-Transactions-in-PostgreSQL\starter\L2 Exercise 1 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 2\Exercise2-Creating-Anomalies\solution\L2 Exercise 2 Solutions.ipynb
vs: Lesson 2\Exercise2-Creating-Anomalies\starter\L2 Exercise 2 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 2\Exercise3-Modeling-Data-for-OLAP-Use-Cases\solution\L2 Exercise 3 Solutions.ipynb
vs: Lesson 2\Exercise3-Modeling-Data-for-OLAP-Use-Cases\starter\L2 Exercise 3 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 2\Exercise4-Dealing-with-Flexible-Data\solution\L2 Exercise 4 Solutions.ipynb
vs: Lesson 2\Exercise4-Dealing-with-Flexible-Data\starter\L2 Exercise 4 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 2\Exercise5-Launching-AWS-RDS-Postgres\solution\L2 Exercise 5 Solutions.ipynb
vs: Lesson 2\Exercise5-Launching-AWS-RDS-Postgres\starter\L2 Exercise 5 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 3\Exercise1-Querying-MongoDB\solution\L3 Exercise 1 Solutions.ipynb
vs: Lesson 3\Exercise1-Querying-MongoDB\starter\L3 Exercise 1 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 3\Exercise2-Creating-Collections-and-NoSQL-Data-Modeling\solution\L3 Exercise 2 Solutions.ipynb
vs: Lesson 3\Exercise2-Creating-Collections-and-NoSQL-Data-Modeling\starter\L3 Exercise 2 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 3\Exercise3-NoSQL-on-AWS\solution\L3 Exercise 3 Solutions.ipynb
vs: Lesson 3\Exercise3-NoSQL-on-AWS\starter\L3 Exercise 3 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 4\Exercise1-Getting-started-with-Neo4j\solution\L4 Exercise 1 Solutions.ipynb
vs: Lesson 4\Exercise1-Getting-started-with-Neo4j\starter\L4 Exercise 1 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 4\Exercise2-Creating-and-populating-graphs-in-Neo4j\solution\L4 Exercise 2 Solutions.ipynb
vs: Lesson 4\Exercise2-Creating-and-populating-graphs-in-Neo4j\starter\L4 Exercise 2 Starter.ipynb
✓ Clean — no unexpected differences

Pair: Lesson 4\Exercise3-Graph-Databases-on-AWS\solution\L4 Exercise 3 Solutions.ipynb
vs: Lesson 4\Exercise3-Graph-Databases-on-AWS\starter\L4 Exercise 3 Starter.ipynb
✓ Clean — no unexpected differences

======================================================================
SUMMARY: 16 pairs checked, 16 clean, 0 with issues (0 total issues)
======================================================================
240 changes: 240 additions & 0 deletions Exercise Utils/audit_notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
"""
Audit script to compare solution and starter notebooks.

Expected differences (allowed):
- Solutions have cell outputs and execution_count values
- Solutions have code between BEGIN/END SOLUTION markers;
starters replace those blocks with YOUR CODE HERE placeholders
- Notebook-level metadata (kernelspec, language_info) may differ

Any other difference is flagged as an unexpected discrepancy.
"""

import json
import re
import os
from pathlib import Path
from datetime import datetime


# ---------- helpers ----------

def load_notebook(path):
    """Read the file at *path* as UTF-8 text and return its parsed JSON content."""
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def join_source(source):
    """
    Return a cell's ``source`` field as one value.

    A list is concatenated into a single string; anything else is
    returned unchanged.
    """
    return "".join(source) if isinstance(source, list) else source


def strip_solution_blocks(text):
    """
    Swap every BEGIN/END SOLUTION block in *text* for its placeholder.

    Two marker families are handled, each in an inline and a multi-line form:
      - ``--- BEGIN SOLUTION ... --- END SOLUTION`` becomes ``-- YOUR CODE HERE``
      - ``### BEGIN SOLUTION ... ### END SOLUTION`` becomes ``### YOUR CODE HERE``

    The inline form is a block wrapped in single quotes; the quotes are kept
    around the placeholder. The multi-line form also swallows the rest of the
    END SOLUTION line.
    """
    # Order matters: for each marker family the quoted inline form must be
    # rewritten before the generic multi-line form gets a chance to match it.
    substitutions = (
        (r"'--- BEGIN SOLUTION.*?--- END SOLUTION[^']*'", "'-- YOUR CODE HERE'", 0),
        (r"--- BEGIN SOLUTION.*?--- END SOLUTION[^\n]*", "-- YOUR CODE HERE", re.DOTALL),
        (r"'### BEGIN SOLUTION.*?### END SOLUTION[^']*'", "'### YOUR CODE HERE'", 0),
        (r"### BEGIN SOLUTION.*?### END SOLUTION[^\n]*", "### YOUR CODE HERE", re.DOTALL),
    )
    for pattern, placeholder, flags in substitutions:
        text = re.sub(pattern, placeholder, text, flags=flags)
    return text


def normalise_whitespace(text):
    """Squeeze each whitespace run in *text* to one space and trim both ends."""
    squeezed = re.sub(r"\s+", " ", text)
    return squeezed.strip()


# ---------- pair discovery ----------

def find_pairs(root_dir):
    """
    Collect (solution_path, starter_path) tuples found under *root_dir*.

    Every directory named ``solution`` that has a sibling ``starter``
    directory is scanned for ``*.ipynb`` files. The expected starter file
    name is the solution file name with "Solutions"/"Solution" replaced by
    "Starter". When that file does not exist, the tuple's second item is
    ``None``.
    """
    matched = []
    solution_dirs = (
        entry for entry in sorted(Path(root_dir).rglob("solution")) if entry.is_dir()
    )
    for solution_dir in solution_dirs:
        starter_dir = solution_dir.parent / "starter"
        if not starter_dir.is_dir():
            # No sibling starter directory: nothing to pair against.
            continue
        for solution_nb in sorted(solution_dir.glob("*.ipynb")):
            expected_name = solution_nb.name.replace("Solutions", "Starter").replace("Solution", "Starter")
            candidate = starter_dir / expected_name
            matched.append((solution_nb, candidate if candidate.exists() else None))
    return matched


# ---------- cell-level diff ----------

def diff_cells(sol_cell, start_cell, cell_idx, issues):
    """
    Check one solution cell against the starter cell at the same position.

    Findings are appended to *issues* (a list of strings); nothing is
    returned. *cell_idx* is zero-based and is reported one-based.

    - Differing ``cell_type`` values are reported and end the comparison.
    - Markdown cells must match after whitespace normalisation.
    - Code cells must match after the solution's BEGIN/END SOLUTION blocks
      are replaced by placeholders; on mismatch the first differing line
      (or, failing that, a line-count difference) is reported.
    """
    label = f" Cell {cell_idx + 1}"
    sol_type = sol_cell["cell_type"]
    start_type = start_cell["cell_type"]

    if sol_type != start_type:
        issues.append(f"{label}: cell_type mismatch — solution={sol_type}, starter={start_type}")
        return

    sol_text = join_source(sol_cell.get("source", ""))
    start_text = join_source(start_cell.get("source", ""))

    if sol_type == "markdown":
        # Markdown is expected to be the same in both notebooks.
        if normalise_whitespace(sol_text) != normalise_whitespace(start_text):
            issues.extend([
                f"{label} (markdown): content differs",
                f" SOL : {sol_text[:200]!r}",
                f" START: {start_text[:200]!r}",
            ])
        return

    # Everything else: compare the starter against the solution with its
    # solution blocks swapped for placeholders.
    expected = strip_solution_blocks(sol_text)
    if normalise_whitespace(expected) == normalise_whitespace(start_text):
        return

    issues.append(f"{label} (code): source differs after stripping solution blocks")

    expected_lines = expected.strip().splitlines()
    actual_lines = start_text.strip().splitlines()

    # Locate the first line pair that still differs once whitespace is normalised.
    first_mismatch = next(
        (
            (line_no, expected_line, actual_line)
            for line_no, (expected_line, actual_line)
            in enumerate(zip(expected_lines, actual_lines), start=1)
            if normalise_whitespace(expected_line) != normalise_whitespace(actual_line)
        ),
        None,
    )

    if first_mismatch is not None:
        line_no, expected_line, actual_line = first_mismatch
        issues.append(f" First diff at source line {line_no}:")
        issues.append(f" SOL (stripped): {expected_line.rstrip()!r}")
        issues.append(f" STARTER : {actual_line.rstrip()!r}")
    elif len(expected_lines) != len(actual_lines):
        # All shared lines agree, so one side simply has more lines.
        issues.append(f" Line count differs: solution(stripped)={len(expected_lines)}, starter={len(actual_lines)}")


# ---------- notebook-level diff ----------

def audit_pair(sol_path, start_path):
    """
    Audit one solution notebook against its starter notebook.

    Returns a list of issue strings; an empty list means the pair is clean.
    A *start_path* of ``None`` yields a single "no starter" issue without
    reading any file.
    """
    if start_path is None:
        return [" No matching starter notebook found!"]

    sol_nb = load_notebook(sol_path)
    start_nb = load_notebook(start_path)
    sol_cells = sol_nb.get("cells", [])
    start_cells = start_nb.get("cells", [])

    def _describe(cell):
        """Return (cell_type, short description) for a cell with no counterpart."""
        text = join_source(cell.get("source", "")).strip()
        kind = cell["cell_type"]
        return kind, (f"({text[:80]!r})" if text else "(empty)")

    findings = []
    n_sol, n_start = len(sol_cells), len(start_cells)

    if n_sol != n_start:
        findings.append(f" Cell count mismatch: solution={n_sol}, starter={n_start}")
        # List the surplus cells of whichever notebook is longer.
        if n_sol > n_start:
            for number, cell in enumerate(sol_cells[n_start:], start=n_start + 1):
                kind, desc = _describe(cell)
                findings.append(f" Extra solution cell {number} [{kind}]: {desc}")
        else:
            for number, cell in enumerate(start_cells[n_sol:], start=n_sol + 1):
                kind, desc = _describe(cell)
                findings.append(f" Extra starter cell {number} [{kind}]: {desc}")

    # zip stops at the shorter notebook, so only cells present in both are compared.
    for idx, (sol_cell, start_cell) in enumerate(zip(sol_cells, start_cells)):
        diff_cells(sol_cell, start_cell, idx, findings)

    return findings


# ---------- main ----------

def main():
    """
    Audit every solution/starter pair under the repo root and report the result.

    The repo root is taken to be the parent of this script's directory. The
    report is written to ``audit_log.txt`` next to this script and also
    printed to the console.
    """
    script_dir = Path(__file__).parent
    root = script_dir.parent  # repo root (one level up from Exercise Utils)
    pairs = find_pairs(root)

    rule = "=" * 70
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [f"Notebook Audit Report — {stamp}", rule, ""]

    issue_total = 0
    clean_pairs = 0

    for sol_path, start_path in pairs:
        shown_start = start_path.relative_to(root) if start_path else "MISSING"
        lines.append(f"Pair: {sol_path.relative_to(root)}")
        lines.append(f" vs: {shown_start}")

        found = audit_pair(sol_path, start_path)
        if not found:
            lines.append(" ✓ Clean — no unexpected differences")
            clean_pairs += 1
        else:
            issue_total += len(found)
            lines.extend(found)

        # Blank line between pairs.
        lines.append("")

    # Summary footer
    lines.append(rule)
    lines.append(f"SUMMARY: {len(pairs)} pairs checked, {clean_pairs} clean, "
                 f"{len(pairs) - clean_pairs} with issues ({issue_total} total issues)")
    lines.append(rule)

    report = "\n".join(lines)

    # The log file lives next to this script (in Exercise Utils/).
    log_path = script_dir / "audit_log.txt"
    log_path.write_text(report, encoding="utf-8")

    # Echo the report to the console as well.
    print(report)
    print(f"\nLog written to: {log_path}")


# Run the audit only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -145,5 +145,7 @@ def main(root_directory='.'):
import sys

# Allow passing root directory as command line argument
root_dir = sys.argv[1] if len(sys.argv) > 1 else '.'
# Default: repo root (one level up from Exercise Utils)
default_root = str(Path(__file__).parent.parent)
root_dir = sys.argv[1] if len(sys.argv) > 1 else default_root
main(root_dir)
Loading