From ea2a8f2a82a4826e14d13f00ace023490c05cd3f Mon Sep 17 00:00:00 2001 From: William Hill Date: Sun, 3 May 2026 12:07:35 -0400 Subject: [PATCH 1/2] feat(ferpa): add repo-local FERPA audit skill and scanners (#129) Add .claude/skills/ferpa-audit with SKILL.md, regulatory citation index, static (TypeScript) and Postgres (Python) layers, root ferpa-config.yaml, runbook, /ferpa-audit command, and scripts/ferpa-audit.sh orchestrator. Layer A flags execute-sql vs analyze enforcement, syntex fetch path, console logging, RBAC gaps, and AI transparency drift; Layer B adds RLS, small-N, and schema snapshot when DB is available. Dated reports are gitignored; dashboard package.json gains yaml for config parsing. Co-authored-by: Cursor --- .claude/commands/ferpa-audit.md | 12 + .claude/skills/ferpa-audit/SKILL.md | 110 +++++ .../references/regulatory-citations.md | 81 ++++ .../skills/ferpa-audit/scripts/db-audit.py | 345 ++++++++++++++++ .../ferpa-audit/scripts/static-audit.ts | 377 ++++++++++++++++++ .gitignore | 3 + codebenders-dashboard/package.json | 1 + docs/ferpa-audit-runbook.md | 67 ++++ ferpa-config.yaml | 82 ++++ scripts/ferpa-audit.sh | 21 + 10 files changed, 1099 insertions(+) create mode 100644 .claude/commands/ferpa-audit.md create mode 100644 .claude/skills/ferpa-audit/SKILL.md create mode 100644 .claude/skills/ferpa-audit/references/regulatory-citations.md create mode 100644 .claude/skills/ferpa-audit/scripts/db-audit.py create mode 100644 .claude/skills/ferpa-audit/scripts/static-audit.ts create mode 100644 docs/ferpa-audit-runbook.md create mode 100644 ferpa-config.yaml create mode 100755 scripts/ferpa-audit.sh diff --git a/.claude/commands/ferpa-audit.md b/.claude/commands/ferpa-audit.md new file mode 100644 index 0000000..4f95815 --- /dev/null +++ b/.claude/commands/ferpa-audit.md @@ -0,0 +1,12 @@ +--- +description: Run FERPA read-time audit (Layer A static + Layer B DB) and produce docs/ferpa-audit-YYYY-MM-DD.md +--- + +Follow `.claude/skills/ferpa-audit/SKILL.md` 
exactly. + +1. Ensure `ferpa-config.yaml` at the repository root is current. +2. Run `./scripts/ferpa-audit.sh` from the repository root (or Layer A then Layer B per the skill if the shell script is unavailable). +3. Open the generated `docs/ferpa-audit-YYYY-MM-DD.md` and confirm Critical/Warning/Note counts match the executive summary. +4. Every finding in any narrative you add MUST cite `.claude/skills/ferpa-audit/references/regulatory-citations.md`. + +v1 scope: read-time detection only — do not expand into lineage, retention, breach response, or CI gating unless the user explicitly asks. diff --git a/.claude/skills/ferpa-audit/SKILL.md b/.claude/skills/ferpa-audit/SKILL.md new file mode 100644 index 0000000..58a9299 --- /dev/null +++ b/.claude/skills/ferpa-audit/SKILL.md @@ -0,0 +1,110 @@ +--- +name: ferpa-audit +description: Run Layer A (static) + Layer B (Postgres) read-time FERPA leak detection and write docs/ferpa-audit-YYYY-MM-DD.md. Invoke with /ferpa-audit or when auditing student-data flows for CIO/legal review. +--- + +# FERPA audit skill (v1 — read-time detection) + +## Purpose + +This skill produces a **single markdown report** institutions can share with a **CIO, compliance lead, or legal counsel** without an engineer in the room. It answers: *“Where could student education records or personally identifiable information leak at read time — in code, configuration, logs, vendor calls, or the database?”* + +v1 is **detection and documentation only**: no CI gating, no retention/breach playbooks, no data-lineage graphs (see issue #107 for lineage). + +## Invocation + +- **Slash command:** `/ferpa-audit` (repo command: `.claude/commands/ferpa-audit.md`). 
+- **CLI (authoritative for Layer A+B merge):** from repository root, after configuring `ferpa-config.yaml` and (for Layer B) database env vars per `operations/db_config.py`: + +```bash +./scripts/ferpa-audit.sh +``` + +- **Layer A only (fast):** + +```bash +cd codebenders-dashboard && npx tsx ../.claude/skills/ferpa-audit/scripts/static-audit.ts --repo-root .. --out /tmp/ferpa-static.json +``` + +- **Merge Layer B (expects static JSON from Layer A):** + +```bash +./venv/bin/python .claude/skills/ferpa-audit/scripts/db-audit.py --repo-root . --static-json /tmp/ferpa-static.json +``` + +Default report path: `docs/ferpa-audit-YYYY-MM-DD.md` (UTC date). + +## Regulatory knowledge (load-bearing) + +**Every finding MUST cite** a section from `references/regulatory-citations.md` in this skill folder. Do not invent citations. If nothing fits, lower severity to **Note** and still pick the closest hook, or omit the finding. + +**Why this matters:** Generic “PII scanners” flag strings; FERPA audits explain **whether education records or PII could be disclosed** without a valid basis (e.g. consent, school-official/legitimate educational interest, permitted vendor relationship, statistical de-identification). The citation anchors the narrative for procurement and legal review. + +### Severity rubric + +| Level | Meaning | Typical examples | +|-------|---------|------------------| +| **Critical** | Plausible **uncontrolled disclosure** of identifiers or row-level education records to the wrong party, or bypass of documented safeguards. | Arbitrary SQL execution without FERPA column controls; student-level payloads to non-allowlisted external hosts when hardening is off; logging full query results client-side. | +| **Warning** | **Material gap** in access control, vendor boundary, or consistency with documented policy — exploitable or high residual risk but may depend on deployment flags or network posture. 
| API routes returning cohort/student analytics without role checks; optional external data API path; LLM receives result rows without institutional review of caps/redaction. | +| **Note** | **Transparency / policy alignment** issues, small-N disclosure risk, or technical debt that should be tracked with FERPA framing. | AI transparency inventory drift; schema metadata sent to vendors; planned audit-log coverage not yet implemented. | + +Downgrade **Critical → Warning** when the issue is **fully mitigated by documented production configuration** (e.g. `FORCE_DIRECT_DB=true`) but the **unsafe path still exists in code** — frame as “residual risk if misconfigured.” + +## Layers + +### Layer A — Static codebase audit (`scripts/static-audit.ts`) + +Inputs: repository tree, `ferpa-config.yaml`. Output: JSON findings file consumed by Layer B. + +**Checks (v1):** + +1. **SELECT / response-shape exclusions** — Regex over SQL-like literals and template strings for columns in `select_exclusions` / `sensitive_demographics`; flag unless `// FERPA-OK:` appears on the same line (convention). +2. **Policy vs enforcement** — Compare `/api/analyze` FERPA column guard (`lib/sql-inspector.ts`) to other execution paths (notably `/api/execute-sql`). A documented exclusion that is **prompt-only** or **only on one route** is a **Critical** or **Warning** finding. +3. **Console / client log leakage** — Flag `console.log` / `console.debug` / `console.info` (client bundles) that log query plans, results, or other large objects that may contain student rows. +4. **External fetch / third-party hosts** — Detect `schools.syntex-ai.com` and other non-allowlisted hosts; tie to `FORCE_DIRECT_DB` / `buildExternalAnalysisReadyUrl` story (#126). +5. **RBAC coverage** — Routes listed in `ferpa-config.yaml` under `rbac.student_data_routes` must reference the configured role header (default `x-user-role`). +6. 
**LLM prompt / vendor disclosure** — Ensure `ferpaExcluded` in `app/api/analyze/route.ts` is the single source of truth for model-directed SQL exclusions; flag gross inconsistency if other prompts contradict. +7. **AI transparency cross-check** — Every OpenAI / Vercel AI SDK call site under `app/api/**` must be reflected in `content/ai-transparency.ts` (issue #108). + +**Microsoft Presidio:** Optional enhancement. v1 does not require Presidio to pass acceptance; regex + AST + project config carry FERPA semantics. To add Presidio, install `presidio-analyzer` in the project venv and extend Layer A with a subprocess helper (documented in runbook). + +### Layer B — Live Postgres audit (`scripts/db-audit.py`) + +Inputs: read-only DB connection (`DB_*` env vars or `operations/db_config.py` defaults), `ferpa-config.yaml`, static JSON. Output: final markdown report. + +**Checks (v1):** + +1. **Schema snapshot** — Tables/columns in the configured schema(s), timestamped in the appendix. +2. **RLS** — Flag tables holding PII-class columns (per config taxonomy) with **no** row-level security policy (informational **Warning** in v1 — many institutional dashboards rely on app-layer RBAC instead). +3. **Small-N / subgroup disclosure** — For configured dimensions on the predictions table, flag cells where `COUNT(*) < subpopulation_minimum_n` (§99.35 statistical-disclosure framing). +4. **Audit-log path** — If `audit_log.table_name` is unset or table missing, emit a **Note** referencing planned institutional audit coverage (#67), not a fake pass. + +### Layer C — Report + +`db-audit.py` merges executive summary, findings grouped by severity, and appendix (skill path, config hash, DB snapshot time). + +**Writing style:** Plain English “what we saw,” “why it matters under FERPA,” “what to do next.” Avoid library names without one-line explanations. + +## Known regression targets (main branch) + +When tuning scanners, these MUST appear **without manual hints**: + +1. 
**Student_GUID / execute-sql** — Runtime guard exists on `/api/analyze` (#127) but **not** on arbitrary SQL execution — policy vs enforcement gap. +2. **schools.syntex-ai.com** — External analysis-ready API path unless `FORCE_DIRECT_DB` / direct DB mode (#126). +3. **Client console logging** — e.g. query page logging plans/results. + +## Out of scope (v1) + +- Lineage (#107), retention schedules, breach playbooks, write-time prevention / CI enforcement. + +## Files + +| File | Role | +|------|------| +| `SKILL.md` | This document | +| `references/regulatory-citations.md` | §99.x citation anchors | +| `scripts/static-audit.ts` | Layer A | +| `scripts/db-audit.py` | Layer B + report merge | +| `ferpa-config.yaml` (repository root) | Project exclusions, allowlists, thresholds | +| `../../../docs/ferpa-audit-runbook.md` | Human runbook | +| `../../../scripts/ferpa-audit.sh` | One-shot runner | diff --git a/.claude/skills/ferpa-audit/references/regulatory-citations.md b/.claude/skills/ferpa-audit/references/regulatory-citations.md new file mode 100644 index 0000000..c5d65ed --- /dev/null +++ b/.claude/skills/ferpa-audit/references/regulatory-citations.md @@ -0,0 +1,81 @@ +# FERPA regulatory hooks (34 CFR Part 99) + +Use this file as the **only** authoritative list of section citations for FERPA-audit findings. Each finding in `docs/ferpa-audit-YYYY-MM-DD.md` must reference **at least one** anchor below. Prefer the narrowest hook that matches the risk. + +Citations refer to the **Family Educational Rights and Privacy Act** regulations at 34 CFR Part 99 (commonly cited as “FERPA” in higher-education practice). This is a plain-English index for audit narratives; it is not legal advice. 
+ +--- + +## §99.3 — Definitions + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.3 — “Personally identifiable information” (PII)** | Information that, alone or in combination, would let a reasonable person identify a student with reasonable certainty — including direct identifiers and many indirect linkages. | Flagging direct identifiers (e.g. institution-issued student IDs/GUIDs), linkable keys, or combinations that re-identify individuals in outputs, logs, or vendor payloads. | +| **§99.3 — “Education records”** | Records directly related to a student and maintained by an educational agency or institution (with stated exceptions). | Explaining why student academic/demographic datasets, predictions, and course rows are not “just analytics data” — they are protected education records unless an exception clearly applies. | +| **§99.3 — “Directory information”** | Limited categories an institution may disclose without consent if public notice and opt-out requirements are met. | Warning when “directory information” arguments are misapplied to non-directory fields (grades, risk flags, detailed demographics, etc.). | + +--- + +## §99.7 — Policy and rights awareness + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.7 — annual notification / rights awareness** | Institution must inform parents/eligible students of rights under FERPA. | **Note**-level reminders when new technical surfaces (AI, external APIs) change how records are processed, so policy notices and transparency artifacts stay aligned with reality. | + +--- + +## §99.30 — Basis for disclosure + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.30 — general rule on consent** | Disclosure of PII from education records generally requires prior written consent unless a specific exception applies. 
| Framing **why** a new outbound data path (vendor API, third-party host) is sensitive even if “no SSN is sent.” | + +--- + +## §99.31 — Conditions for disclosure + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.31(a)(6) — studies exception (statutory)** | Permits disclosure to researchers under defined conditions. | Rare in routine dashboard audits; use only when the system is actually operating under this exception. | +| **§99.31(a)(1)(i) — “school officials” / legitimate educational interest** | Institutions may disclose to school officials with legitimate educational interest in the information. | **Primary hook** for internal analytics: staff may access student data only when their role and task justify it — motivates **RBAC**, access logging, and least-privilege API design. | +| **§99.31(a)(1)(ii)(A)(B) — contractors / “school officials” vendors** | Vendors performing institutional services may receive disclosures only under direct control and consistent use/re-disclosure rules. | **Primary hook** for **cloud LLM APIs**, hosted analytics, or third-party data hosts: the institution remains responsible for whether the disclosure is permitted and properly constrained. | + +--- + +## §99.32 — Recordkeeping and transparency to parents/students + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.32 — record of requests and disclosures** | Institutions must maintain a record of certain disclosures (with defined exceptions). | **Note**/**Warning** when read paths touch sensitive tables but no auditable trail exists (future linkage to institutional audit-log requirements). | + +--- + +## §99.33 — Limits on redisclosure + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.33 — redisclosure rules** | Third parties receiving education records generally may not redisclose except under specific circumstances. 
| Explaining risk when data is sent to vendors, partner hosts, or embedded in client-side logs that leave the institution’s control. | + +--- + +## §99.35 — Disclosure for research / statistical purposes + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.35 — de-identified / statistical disclosures** | Additional conditions when disclosing for research or statistical purposes. | Supporting findings on **small-cell suppression**, aggregation, and k-anonymity-style thresholds so subgroup statistics cannot identify individuals. | + +--- + +## §99.37 — Directory information + +| Anchor | What it covers | Typical audit use | +|--------|----------------|-------------------| +| **§99.37 — directory information disclosures** | Conditions under which directory information may be released without consent. | Use when reviewing whether a field is truly directory information before treating it as low sensitivity. | + +--- + +## How to cite in a finding + +1. Pick **one** primary anchor (e.g. `§99.31(a)(1)(i) — legitimate educational interest`). +2. Add a **short** plain-English “why” tying the technical fact to that hook (vendor disclosure, missing access control, identifier in export, etc.). +3. Do **not** stack unrelated sections; add a second citation only when two distinct legal bases are genuinely implicated. diff --git a/.claude/skills/ferpa-audit/scripts/db-audit.py b/.claude/skills/ferpa-audit/scripts/db-audit.py new file mode 100644 index 0000000..886a04b --- /dev/null +++ b/.claude/skills/ferpa-audit/scripts/db-audit.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python3 +""" +Layer B — Postgres FERPA-oriented audit + Layer C markdown report. +Merge with Layer A JSON from static-audit.ts (--static-json). 
+""" +from __future__ import annotations + +import argparse +import json +import os +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + +try: + import psycopg2 + from psycopg2.extras import RealDictCursor +except ImportError: + psycopg2 = None # type: ignore + + +def load_db_config(repo_root: Path) -> dict: + sys.path.insert(0, str(repo_root)) + try: + from operations import db_config as dc + + return { + "host": dc.DB_CONFIG["host"], + "port": dc.DB_CONFIG["port"], + "user": dc.DB_CONFIG["user"], + "password": dc.DB_CONFIG["password"], + "database": dc.DB_CONFIG["database"], + } + except Exception: + return { + "host": os.environ.get("DB_HOST", "127.0.0.1"), + "port": int(os.environ.get("DB_PORT", "54332")), + "user": os.environ.get("DB_USER", "postgres"), + "password": os.environ.get("DB_PASSWORD", "postgres"), + "database": os.environ.get("DB_NAME", "postgres"), + } + + +def md_escape(s: str) -> str: + return s.replace("|", "\\|") + + +def finding_block(f: dict) -> list[str]: + line = f.get("line") + loc = f['file'] + if line: + loc = f"{loc}:{line}" + return [ + f"### {f['severity']}: {f['title']}", + "", + f"- **Location:** `{loc}`", + f"- **Regulatory hook:** {f['regulation']}", + f"- **What we saw:** {f['description']}", + f"- **Remediation:** {f['remediation']}", + "", + ] + + +def run_db_checks(conn, ferpa: dict, repo_root: Path) -> tuple[list[dict], str]: + findings: list[dict] = [] + schema = ferpa["database"]["schema"] + table = ferpa["database"]["predictions_table"] + n_min = int(ferpa["subpopulation_minimum_n"]) + dims = ferpa["database"]["small_n_dimensions"] + + snapshot_lines: list[str] = [] + snap_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute( + """ + SELECT table_name, column_name, data_type + FROM information_schema.columns + WHERE table_schema = %s + ORDER BY table_name, ordinal_position + """, + (schema,), + ) 
+ rows = cur.fetchall() + for r in rows[:500]: + snapshot_lines.append( + f"| {r['table_name']} | {r['column_name']} | {r['data_type']} |" + ) + + cur.execute( + """ + SELECT c.relname AS table_name + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = %s + AND c.relkind = 'r' + AND EXISTS ( + SELECT 1 FROM information_schema.columns col + WHERE col.table_schema = %s + AND col.table_name = c.relname + AND ( + col.column_name ILIKE '%%guid%%' + OR col.column_name ILIKE '%%ssn%%' + OR col.column_name ILIKE '%%student%%' + ) + ) + AND NOT EXISTS ( + SELECT 1 FROM pg_policies p + WHERE p.schemaname = %s AND p.tablename = c.relname + ) + """, + (schema, schema, schema), + ) + for r in cur.fetchall()[:15]: + findings.append( + { + "severity": "Warning", + "category": "rls_gap", + "file": f"postgres:{schema}.{r['table_name']}", + "regulation": "§99.31(a)(1)(i) — legitimate educational interest", + "title": "Table with likely student identifiers has no Postgres RLS policy", + "description": ( + "Row-level security is one technical control institutions use to ensure " + "database sessions cannot read beyond an authorized scope. This table appears " + "to hold student-linked columns but has no RLS policy in Postgres — reliance " + "may be entirely on the application tier." + ), + "remediation": ( + "Evaluate RLS or equivalent database-session scoping with your data steward; " + "document compensating controls if the app layer alone enforces access." 
+ ), + } + ) + + audit_table = ferpa.get("audit_log", {}).get("table_name") + if audit_table: + cur.execute( + """ + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_schema = %s AND table_name = %s + ) AS ok + """, + (schema, audit_table), + ) + if not cur.fetchone()["ok"]: + findings.append( + { + "severity": "Note", + "category": "audit_log_missing", + "file": "ferpa-config.yaml", + "regulation": "§99.32 — record of disclosures", + "title": "Configured audit log table not found in database", + "description": ( + "Institutional FERPA programs often require a retained record of certain " + "access or disclosures. The configured audit table is absent in this schema." + ), + "remediation": "Ship or attach the audit schema (#67) or clear the audit_log.table_name setting until available.", + } + ) + else: + findings.append( + { + "severity": "Note", + "category": "audit_log_not_configured", + "file": "ferpa-config.yaml", + "regulation": "§99.32 — record of disclosures", + "title": "Database audit-log cross-check skipped (not configured)", + "description": ( + "Layer B did not verify an institutional audit trail table. Read-time analytics " + "still benefit from logging who accessed which student-level exports." + ), + "remediation": "When #67 lands, set audit_log.table_name and re-run this audit.", + } + ) + + try: + dim_sql = ", ".join(dims) + sql = f""" + SELECT {dim_sql}, COUNT(*) AS n + FROM {schema}.{table} + GROUP BY {dim_sql} + HAVING COUNT(*) < %s + ORDER BY n ASC + LIMIT 50 + """ + cur.execute(sql, (n_min,)) + small = cur.fetchall() + for row in small: + findings.append( + { + "severity": "Note", + "category": "small_n_cell", + "file": f"postgres:{schema}.{table}", + "regulation": "§99.35 — disclosure for research / statistical purposes", + "title": "Small subgroup cell in predictions table", + "description": ( + f"A demographic × cohort cell contains fewer than {n_min} rows " + f"({dict(row)}). 
Publishing or exporting such cells can increase " + "re-identification risk for students in minority subgroups." + ), + "remediation": ( + "Apply suppression, rounding, or aggregation thresholds (#109) before " + "display or export." + ), + } + ) + except Exception as e: + findings.append( + { + "severity": "Note", + "category": "small_n_skipped", + "file": f"postgres:{schema}.{table}", + "regulation": "§99.35 — disclosure for research / statistical purposes", + "title": "Small-N cohort query could not run", + "description": f"The predictions table or columns may differ in this environment: {e}", + "remediation": "Align ferpa-config.yaml database.predictions_table and small_n_dimensions with the live schema.", + } + ) + + header = f"| Table | Column | Type |\n|------|--------|------|\n" + snapshot_md = header + "\n".join(snapshot_lines[:200]) + if len(snapshot_lines) > 200: + snapshot_md += f"\n\n_(truncated; {len(snapshot_lines)} total columns)_\n" + + appendix_schema = f"**Snapshot time (UTC):** {snap_ts}\n\n{snapshot_md}" + return findings, appendix_schema + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--repo-root", required=True) + ap.add_argument("--static-json", required=True) + ap.add_argument("--out", default="") + ap.add_argument("--skip-db", action="store_true") + args = ap.parse_args() + + repo_root = Path(args.repo_root).resolve() + static_path = Path(args.static_json).resolve() + with open(static_path, encoding="utf8") as f: + static = json.load(f) + + config_path = repo_root / "ferpa-config.yaml" + with open(config_path, encoding="utf8") as f: + ferpa = yaml.safe_load(f) + + all_findings: list[dict] = list(static.get("findings", [])) + appendix_db = "" + + if not args.skip_db and psycopg2: + cfg = load_db_config(repo_root) + try: + conn = psycopg2.connect(**cfg) + try: + db_findings, appendix_db = run_db_checks(conn, ferpa, repo_root) + all_findings.extend(db_findings) + finally: + conn.close() + except Exception as e: + 
all_findings.append( + { + "severity": "Note", + "category": "db_unreachable", + "file": "operations/db_config.py", + "regulation": "§99.3 — education records", + "title": "Layer B database checks skipped (connection failed)", + "description": f"Could not connect for live schema audit: {e}", + "remediation": "Run with DB_* env vars set, or use --skip-db for static-only reports.", + } + ) + elif not args.skip_db and not psycopg2: + all_findings.append( + { + "severity": "Note", + "category": "db_driver_missing", + "file": "requirements.txt", + "regulation": "§99.3 — education records", + "title": "psycopg2 not installed; Layer B skipped", + "description": "Install project Python requirements in venv to enable Postgres checks.", + "remediation": "./venv/bin/pip install -r requirements.txt", + } + ) + + date_s = datetime.now(timezone.utc).strftime("%Y-%m-%d") + out = Path(args.out) if args.out else repo_root / "docs" / f"ferpa-audit-{date_s}.md" + + by_sev = {"Critical": [], "Warning": [], "Note": []} + for f in all_findings: + by_sev.setdefault(f["severity"], []).append(f) + + lines: list[str] = [ + "# FERPA read-time audit report", + "", + f"**Report date (UTC):** {date_s}", + "", + "## Executive summary", + "", + "This report documents **read-time** risks: places student education records or personally identifiable information could be exposed through application code, logs, vendor calls, or database configuration. 
It is intended for CIO, compliance, and legal review alongside engineering.", + "", + f"- **Critical findings:** {len(by_sev['Critical'])}", + f"- **Warnings:** {len(by_sev['Warning'])}", + f"- **Notes:** {len(by_sev['Note'])}", + "", + f"**Configuration fingerprint:** `{static.get('configHash', 'n/a')}` (`{static.get('configPath', 'ferpa-config.yaml')}`)", + "", + "---", + "", + ] + + for sev in ("Critical", "Warning", "Note"): + lines.append(f"## {sev} findings") + lines.append("") + if not by_sev[sev]: + lines.append("_None._") + lines.append("") + continue + for f in by_sev[sev]: + lines.extend(finding_block(f)) + + lines.extend( + [ + "---", + "", + "## Appendix", + "", + f"- **Skill:** `.claude/skills/ferpa-audit/`", + f"- **Regulatory index:** `.claude/skills/ferpa-audit/references/regulatory-citations.md`", + f"- **Layer A generated:** {static.get('generatedAt', '')}", + "", + "### Database schema snapshot (Layer B)", + "", + appendix_db or "_Layer B not run or no snapshot._", + "", + ] + ) + + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text("\n".join(lines), encoding="utf8") + print(f"Wrote {out}") + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/ferpa-audit/scripts/static-audit.ts b/.claude/skills/ferpa-audit/scripts/static-audit.ts new file mode 100644 index 0000000..98a1740 --- /dev/null +++ b/.claude/skills/ferpa-audit/scripts/static-audit.ts @@ -0,0 +1,377 @@ +/** + * Layer A — static FERPA-oriented audit. Run from repo via: + * cd codebenders-dashboard && npx tsx ../.claude/skills/ferpa-audit/scripts/static-audit.ts --repo-root .. 
--out /tmp/ferpa-static.json + */ +import * as crypto from "crypto" +import { createRequire } from "module" +import * as fs from "fs" +import * as path from "path" + +type Severity = "Critical" | "Warning" | "Note" + +interface Finding { + severity: Severity + category: string + file: string + line?: number + regulation: string + title: string + description: string + remediation: string +} + +interface FerpaConfig { + version: number + project: { + dashboard_relative_path: string + analyze_route_glob: string + execute_sql_route_glob: string + ai_transparency_file: string + query_executor_file: string + config_file: string + } + select_exclusions: string[] + sensitive_demographics: string[] + subpopulation_minimum_n: number + external_data_hosts: { hostname: string; note?: string }[] + allowlisted_external_hosts: string[] + rbac: { header_name: string; student_data_routes: string[] } + llm: { sdk_markers: string[] } +} + +function parseArgs(argv: string[]): { repoRoot: string; outPath: string } { + let repoRoot = process.cwd() + let outPath = "" + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--repo-root" && argv[i + 1]) { + repoRoot = path.resolve(argv[++i]) + } else if (argv[i] === "--out" && argv[i + 1]) { + outPath = path.resolve(argv[++i]) + } + } + if (!outPath) { + console.error("Missing --out ") + process.exit(1) + } + return { repoRoot, outPath } +} + +function walkFiles(root: string, exts: Set): string[] { + const out: string[] = [] + const skip = new Set(["node_modules", ".next", "dist", ".git"]) + function walk(dir: string) { + let entries: fs.Dirent[] + try { + entries = fs.readdirSync(dir, { withFileTypes: true }) + } catch { + return + } + for (const e of entries) { + if (skip.has(e.name)) continue + const p = path.join(dir, e.name) + if (e.isDirectory()) walk(p) + else if (exts.has(path.extname(e.name))) out.push(p) + } + } + walk(root) + return out +} + +function hashFile(filePath: string): string { + const h = 
crypto.createHash("sha256") + h.update(fs.readFileSync(filePath)) + return h.digest("hex").slice(0, 16) +} +
+/** Repo-relative path with forward slashes, so paths in findings look the same on every OS. */
+function rel(repoRoot: string, abs: string): string {
+  return path.relative(repoRoot, abs).split(path.sep).join("/")
+}
+
+/** Append one finding to the shared accumulator. */
+function add(
+  findings: Finding[],
+  f: Finding
+): void {
+  findings.push(f)
+}
+
+/**
+ * Regex scan: flags a quoted string that contains both SELECT and FROM and
+ * names an excluded column. At most one Warning is emitted per matched string
+ * (for the first excluded column that hits).
+ *
+ * A match is suppressed when "FERPA-OK:" appears inside the string or anywhere
+ * on the source line(s) the string occupies, so the same-line `// FERPA-OK:`
+ * annotation named in the remediation text is honored.
+ *
+ * @param content    Full text of the file being scanned.
+ * @param relPath    Path recorded in the finding's `file` field.
+ * @param exclusions Column names to look for (matched as whole words, case-insensitive).
+ * @param findings   Accumulator; mutated in place.
+ */
+function scanSqlLiteralsForExclusions(
+  content: string,
+  relPath: string,
+  exclusions: string[],
+  findings: Finding[]
+): void {
+  // Build one escaped whole-word matcher per column up front rather than per matched literal.
+  const matchers = exclusions.map((col) => ({
+    col,
+    word: new RegExp(`\\b${col.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "i"),
+  }))
+  const re = /[`'"]([\s\S]*?\bSELECT\b[\s\S]*?)[`'"]/gi
+  let m: RegExpExecArray | null
+  while ((m = re.exec(content)) !== null) {
+    const chunk = m[1]
+    if (!/\bfrom\b/i.test(chunk)) continue
+    // 1-based line of the opening quote.
+    const line = content.slice(0, m.index).split("\n").length
+    // Widen the annotation search from the literal to the whole source line(s) it sits on;
+    // a `// FERPA-OK:` comment lives outside the quotes and was previously never seen.
+    const lineStart = content.lastIndexOf("\n", m.index) + 1
+    const lineEnd = content.indexOf("\n", m.index + m[0].length)
+    const annotated = content.slice(lineStart, lineEnd === -1 ? content.length : lineEnd)
+    if (/FERPA-OK:/i.test(annotated)) continue
+    const hit = matchers.find(({ word }) => word.test(chunk))
+    if (!hit) continue
+    add(findings, {
+      severity: "Warning",
+      category: "select_exclusion_literal",
+      file: relPath,
+      line,
+      regulation: "§99.31(a)(1)(i) — legitimate educational interest",
+      title: "SQL text may select a restricted student identifier or field",
+      description:
+        `A string in this file contains a SELECT-style fragment that references "${hit.col}". Under FERPA, releasing such fields in the wrong context can expose personally identifiable information from education records. Institutions should verify this string is never executed for end-user export without appropriate access control and minimization.`,
+      remediation:
+        "Remove the column from selectable output, aggregate or de-identify, or mark the line with // FERPA-OK: after legal review.",
+    })
+  }
+}
+
+/**
+ * AST scan: flags `console.log` / `console.debug` / `console.info` calls whose
+ * argument text mentions plan, result, data, rows, student(s) or response.
+ *
+ * Severity is "Warning" when `isClientish` is true and "Note" otherwise.
+ * A call is suppressed when "FERPA-OK:" appears in its leading comments or
+ * own text, or in the remainder of the line on which the call ends.
+ *
+ * @param ts          The TypeScript compiler module, supplied by the caller.
+ * @param sourceFile  Parsed file to walk.
+ * @param relPath     Path recorded in the finding's `file` field.
+ * @param findings    Accumulator; mutated in place.
+ * @param isClientish Whether the caller classified the file as non-API code.
+ */
+function scanConsoleLeak(
+  ts: typeof import("typescript"),
+  sourceFile: import("typescript").SourceFile,
+  relPath: string,
+  findings: Finding[],
+  isClientish: boolean
+): void {
+  function visit(node: import("typescript").Node): void {
+    if (
+      ts.isCallExpression(node) &&
+      ts.isPropertyAccessExpression(node.expression) &&
+      ["log", "debug", "info"].includes(node.expression.name.text) &&
+      ts.isIdentifier(node.expression.expression) &&
+      node.expression.expression.text === "console"
+    ) {
+      const args = node.arguments.map((a) => a.getText(sourceFile)).join(" ")
+      if (/\b(plan|result|data|rows|students?|response)\b/i.test(args)) {
+        // getFullText() covers leading trivia only; a trailing `// FERPA-OK:` on the
+        // same line sits after node.getEnd(), so inspect the rest of that line as well.
+        const text = sourceFile.text
+        const end = node.getEnd()
+        const eol = text.indexOf("\n", end)
+        const trailing = text.slice(end, eol === -1 ? text.length : eol)
+        const annotated =
+          /FERPA-OK:/i.test(node.getFullText(sourceFile)) || /FERPA-OK:/i.test(trailing)
+        if (!annotated) {
+          const pos = sourceFile.getLineAndCharacterOfPosition(node.getStart(sourceFile))
+          add(findings, {
+            severity: isClientish ? "Warning" : "Note",
+            category: "console_leak",
+            file: relPath,
+            line: pos.line + 1,
+            regulation: "§99.33 — limits on redisclosure",
+            title: "Console logging may capture student-level query or response objects",
+            description:
+              "Browser or server consoles are not a controlled disclosure channel. Logging plans, results, or ambiguous large objects can place education-record-derived data where institutional access rules no longer apply (screenshots, remote debugging, third-party tooling).",
+            remediation:
+              "Remove debug logs before release; log only non-identifying error codes server-side, or gate verbose logging behind a secure, audited diagnostics mode.",
+          })
+        }
+      }
+    }
+    ts.forEachChild(node, visit)
+  }
+  visit(sourceFile)
+}
+
+/**
+ * Map a route file under `<dashRoot>/app/api` to its URL, e.g.
+ * `app/api/students/[guid]/route.ts` -> `/api/students/:guid`.
+ * Returns "" when the file's directory is not under `app/api`.
+ */
+function apiUrlFromRouteFile(routeFile: string, dashRoot: string): string {
+  const apiRoot = path.join(dashRoot, "app", "api")
+  // Named relDir (not rel) so the module-level rel() helper is not shadowed.
+  const relDir = path.relative(apiRoot, path.dirname(routeFile))
+  if (relDir.startsWith("..")) return ""
+  const url = relDir
+    .split(path.sep)
+    .filter(Boolean)
+    .map((s) => (s.startsWith("[") && s.endsWith("]") ? `:${s.slice(1, -1)}` : s))
+    .join("/")
+  return `/api/${url}`
+}
+
+/**
+ * Layer A entry point: loads ferpa-config.yaml from the repo root, runs every
+ * static check, de-duplicates the findings and writes them as JSON to `outPath`.
+ * Exits with status 1 when the config file is missing. The one-line summary goes
+ * to stderr via console.error.
+ */
+function main(): void {
+  const { repoRoot, outPath } = parseArgs(process.argv)
+  // Resolve typescript/yaml from the dashboard package rather than from this script's location.
+  const req = createRequire(path.join(repoRoot, "codebenders-dashboard", "package.json"))
+  const ts: typeof import("typescript") = req("typescript")
+  const parseYaml = req("yaml").parse as (s: string) => unknown
+  const configPath = path.join(repoRoot, "ferpa-config.yaml")
+  if (!fs.existsSync(configPath)) {
+    console.error("ferpa-config.yaml not found at repo root")
+    process.exit(1)
+  }
+  // NOTE(review): the parsed YAML is asserted to FerpaConfig without validation;
+  // a malformed config fails later with a property-access error.
+  const config = parseYaml(fs.readFileSync(configPath, "utf8")) as FerpaConfig
+  const findings: Finding[] = []
+  const dashRoot = path.join(repoRoot, config.project.dashboard_relative_path)
+
+  // --- execute-sql route: column guard + role header ---
+  const executeSql = path.join(repoRoot, config.project.execute_sql_route_glob)
+  if (fs.existsSync(executeSql)) {
+    const ex = fs.readFileSync(executeSql, "utf8")
+    if (!ex.includes("inspectSelectForFerpaExclusions")) {
+      add(findings, {
+        severity: "Critical",
+        category: "ferpa_select_enforcement_gap",
+        file: rel(repoRoot, executeSql),
+        regulation: "§99.31(a)(1)(i) — legitimate educational interest",
+        title: "Arbitrary SQL execution path has no FERPA column guard",
+        description:
+          "The `/api/analyze` route applies a conservative SELECT-clause check (`inspectSelectForFerpaExclusions`) for columns such as Student_GUID, in addition to prompt instructions (#127). This endpoint executes whatever SQL the caller supplies with no equivalent guard — so the same identifier could still appear in results when queries bypass the analyzer (for example rule-based fallback → `/api/execute-sql`).",
+        remediation:
+          "Reuse the same SELECT-clause inspection used by /api/analyze, restrict to prepared institutional queries, or enforce column allowlists at the database role level.",
+      })
+    }
+    if (!ex.includes(config.rbac.header_name)) {
+      add(findings, {
+        severity: "Warning",
+        category: "rbac_gap",
+        file: rel(repoRoot, executeSql),
+        regulation: "§99.31(a)(1)(i) — legitimate educational interest",
+        title: "SQL execution API does not check institutional role header",
+        description:
+          "Without application-layer role checks, any caller who can reach this route may exercise the privileges of the database connection — a common mismatch with FERPA’s expectation that access to education records tracks legitimate educational interest.",
+        remediation:
+          "Require the configured role header (or stronger authn/authz) before executing SQL, aligned to institutional policy.",
+      })
+    }
+  }
+
+  // --- analyze route: role header + schema text sent to the vendor model ---
+  const analyzeRoute = path.join(repoRoot, config.project.analyze_route_glob)
+  if (fs.existsSync(analyzeRoute)) {
+    const an = fs.readFileSync(analyzeRoute, "utf8")
+    if (!an.includes(config.rbac.header_name)) {
+      add(findings, {
+        severity: "Warning",
+        category: "rbac_gap",
+        file: rel(repoRoot, analyzeRoute),
+        regulation: "§99.31(a)(1)(i) — legitimate educational interest",
+        title: "LLM query planner route has no role header check",
+        description:
+          "This route sends schema metadata to a vendor model and returns executable SQL. Institutional policy usually ties such capability to specific staff roles; missing header checks increase the risk of over-broad access if the network perimeter is ever misconfigured.",
+        remediation:
+          "Enforce the same RBAC primitive used elsewhere before invoking the model or returning a plan.",
+      })
+    }
+    if (an.includes("Student_GUID") && an.includes("FERPA COMPLIANCE")) {
+      add(findings, {
+        severity: "Note",
+        category: "vendor_schema_disclosure",
+        file: rel(repoRoot, analyzeRoute),
+        regulation: "§99.31(a)(1)(ii)(A)(B) — contractor disclosure rules",
+        title: "Cloud LLM receives schema text that names the student identifier column",
+        description:
+          "Even when result rows are not sent, the prompt embeds column names and descriptions that reveal how individuals are keyed in the database. Vendors may log prompts for abuse monitoring; institutions should treat this as a controlled disclosure bounded by contract and the school-official framework.",
+        remediation:
+          "Confirm vendor agreements, data processing addenda, and institutional notices align with this disclosure; minimize schema detail where possible.",
+      })
+    }
+  }
+
+  // --- external student-row host: config names the host and the executor fetches a url ---
+  const cfgFile = path.join(repoRoot, config.project.config_file)
+  const qeFile = path.join(repoRoot, config.project.query_executor_file)
+  if (fs.existsSync(cfgFile) && fs.existsSync(qeFile)) {
+    const cfg = fs.readFileSync(cfgFile, "utf8")
+    const qe = fs.readFileSync(qeFile, "utf8")
+    if (cfg.includes("schools.syntex-ai.com") && /fetch\s*\(\s*url\s*\)/.test(qe)) {
+      add(findings, {
+        severity: "Warning",
+        category: "external_student_data_host",
+        file: rel(repoRoot, qeFile),
+        regulation: "§99.31(a)(1)(ii)(A)(B) — contractor disclosure rules",
+        title: "Query executor can fetch student-level rows from a non-institutional host",
+        description:
+          "When direct-database mode and FORCE_DIRECT_DB hardening are not in effect, the dashboard retrieves analysis-ready rows from a project-hosted API domain rather than from the institution’s Postgres deployment. That shifts custody of student-level payloads and may affect contractual and FERPA oversight expectations.",
+        remediation:
+          "Set FORCE_DIRECT_DB=true (or equivalent) for procurement-hardened installs; document the residual code path in transparency materials until removed.",
+      })
+    }
+  }
+
+  // --- configured student-data routes must mention the role header; missing files are skipped ---
+  for (const routeRel of config.rbac.student_data_routes) {
+    const abs = path.join(dashRoot, routeRel)
+    if (!fs.existsSync(abs)) continue
+    const txt = fs.readFileSync(abs, "utf8")
+    if (!txt.includes(config.rbac.header_name)) {
+      add(findings, {
+        severity: "Warning",
+        category: "rbac_gap",
+        file: rel(repoRoot, abs),
+        regulation: "§99.31(a)(1)(i) — legitimate educational interest",
+        title: "Student-data API route omits configured role header check",
+        description:
+          "This route appears on the institutional student-data route list in ferpa-config.yaml but does not reference the configured RBAC header. Access to education records should follow role-based institutional policy.",
+        remediation:
+          "Add the same role verification pattern used on other student endpoints, or remove the route from the list after documenting why it is exempt.",
+      })
+    }
+  }
+
+  // --- LLM routes: any app/api route.ts containing one of the configured SDK marker substrings ---
+  const apiFiles = walkFiles(path.join(dashRoot, "app", "api"), new Set([".ts"]))
+  const llmRoutes: string[] = []
+  const markers = config.llm.sdk_markers
+  for (const f of apiFiles) {
+    if (!f.endsWith(`${path.sep}route.ts`)) continue
+    const t = fs.readFileSync(f, "utf8")
+    if (markers.some((m) => t.includes(m))) {
+      llmRoutes.push(f)
+    }
+  }
+
+  // A missing transparency file is read as empty text, so every LLM route is then reported.
+  const transparencyPath = path.join(repoRoot, config.project.ai_transparency_file)
+  const transparency = fs.existsSync(transparencyPath)
+    ? fs.readFileSync(transparencyPath, "utf8")
+    : ""
+
+  for (const f of llmRoutes) {
+    const apiPath = apiUrlFromRouteFile(f, dashRoot)
+    // Exact URL match, or a looser keyword match for the three named routes.
+    const mentioned =
+      Boolean(apiPath) &&
+      (transparency.includes(apiPath) ||
+        (apiPath.includes("analyze") && transparency.includes("analyze")) ||
+        (apiPath.includes("query-summary") && transparency.includes("query-summary")) ||
+        (apiPath.includes("explain-pairing") && transparency.includes("explain-pairing")))
+    if (apiPath && !mentioned) {
+      add(findings, {
+        severity: "Note",
+        category: "ai_transparency_drift",
+        file: rel(repoRoot, f),
+        regulation: "§99.7 — policy and rights awareness",
+        title: "LLM call site may be missing from the AI transparency inventory",
+        description:
+          "Institutions increasingly publish AI transparency pages for procurement. An undeployed or undocumented model route creates a gap between what legal teams believe is running and what code can execute.",
+        remediation:
+          "Add an entry to content/ai-transparency.ts describing inputs, vendor, and data flow for this route.",
+      })
+    }
+  }
+
+  // --- SQL string literals naming excluded columns (app/ and lib/) ---
+  const scanRoots = [path.join(dashRoot, "app"), path.join(dashRoot, "lib")]
+  for (const sr of scanRoots) {
+    if (!fs.existsSync(sr)) continue
+    for (const f of walkFiles(sr, new Set([".ts", ".tsx"]))) {
+      const r = rel(repoRoot, f)
+      // Prompt templates intentionally name excluded columns for the model; covered by vendor_schema_disclosure.
+      if (r.endsWith("app/api/analyze/route.ts")) continue
+      if (r.includes("__tests__") || r.endsWith(".test.ts")) continue
+      const content = fs.readFileSync(f, "utf8")
+      // Identifier-focused literals only; raw demographics are assessed via Layer B small-N.
+      scanSqlLiteralsForExclusions(content, r, config.select_exclusions, findings)
+    }
+  }
+
+  // --- console logging (app/ only); files outside app/api are treated as client-ish ---
+  for (const f of walkFiles(path.join(dashRoot, "app"), new Set([".tsx", ".ts"]))) {
+    const content = fs.readFileSync(f, "utf8")
+    const sf = ts.createSourceFile(f, content, ts.ScriptTarget.ES2022, true, f.endsWith(".tsx") ? ts.ScriptKind.TSX : ts.ScriptKind.TS)
+    const clientish = f.includes(`${path.sep}app${path.sep}`) && !f.includes(`${path.sep}app${path.sep}api${path.sep}`)
+    scanConsoleLeak(ts, sf, rel(repoRoot, f), findings, clientish)
+  }
+
+  // Drop exact repeats: same severity, category, file, line and title.
+  const seen = new Set<string>()
+  const deduped = findings.filter((f) => {
+    const k = `${f.severity}|${f.category}|${f.file}|${f.line ?? 0}|${f.title}`
+    if (seen.has(k)) return false
+    seen.add(k)
+    return true
+  })
+
+  const payload = {
+    layer: "A",
+    generatedAt: new Date().toISOString(),
+    configPath: rel(repoRoot, configPath),
+    configHash: hashFile(configPath),
+    repoRoot,
+    findings: deduped,
+  }
+  fs.writeFileSync(outPath, JSON.stringify(payload, null, 2), "utf8")
+  console.error(`Layer A: ${deduped.length} findings → ${outPath}`)
+}
+
+main()
diff --git a/.gitignore b/.gitignore index 14132fc..67e5c5a 100644 --- a/.gitignore +++ b/.gitignore @@ -165,6 +165,9 @@ docs/Copy-of-AI-Powered-Student-Success-Analytics.pdf docs/CodeBenders-PRD_Student_Success_Analytics.pdf DOCUMENTATION_ISSUES.md +# FERPA audit reports (generated locally; see scripts/ferpa-audit.sh) +docs/ferpa-audit-[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9].md + # Misc .cache/ *.seed diff --git a/codebenders-dashboard/package.json b/codebenders-dashboard/package.json index 3942ba5..37a8483 100644 --- a/codebenders-dashboard/package.json +++ b/codebenders-dashboard/package.json @@ -34,6 +34,7 @@ "zod": "^3.24.1" }, "devDependencies": { + "yaml": "^2.8.0", "@types/node": "24.9.2", "@types/papaparse": "^5.5.2", "@types/pg": "^8.16.0", diff --git a/docs/ferpa-audit-runbook.md
b/docs/ferpa-audit-runbook.md new file mode 100644 index 0000000..366eb1a --- /dev/null +++ b/docs/ferpa-audit-runbook.md @@ -0,0 +1,67 @@ +# FERPA audit runbook + +## When to run + +- Before procurement demos, accreditation packets, or **AASCU / Gates** convening materials that reference FERPA posture. +- After changes to **query execution**, **LLM routes**, **external data APIs**, or **student list/detail** endpoints. +- Quarterly read-time review even if features are stable — config drift (`FORCE_DIRECT_DB`, allowlists) is common. + +## Prerequisites + +- Repository clone with `ferpa-config.yaml` at the **repository root** (hash is recorded in each report appendix). +- **Layer A:** Node.js 20+, `codebenders-dashboard` dependencies installed (`npm install` in that directory). +- **Layer B:** Python 3 from project `venv/`, `psycopg2-binary` available (see root `requirements.txt`), database reachable read-only. + +Connection defaults match `operations/db_config.py` (`DB_HOST`, `DB_PORT`, `DB_NAME`, `DB_USER`, `DB_PASSWORD`). For Docker Compose local installs, `DB_NAME` is often `bishop_state` — set env vars before running if your instance differs from defaults. + +## Run + +From repository root: + +```bash +./scripts/ferpa-audit.sh +``` + +Output: `docs/ferpa-audit-<YYYY-MM-DD>.md` (UTC date). Dated report files are gitignored by default (see repository `.gitignore`). + +Skip database checks (CI or laptop without Postgres): + +```bash +./venv/bin/python .claude/skills/ferpa-audit/scripts/db-audit.py \ + --repo-root . \ + --static-json /tmp/ferpa-static.json \ + --skip-db +``` + +(Generate `/tmp/ferpa-static.json` first with the Layer A-only command in `.claude/skills/ferpa-audit/SKILL.md`.) + +## Interpreting severity + +| Severity | Institutional response | +|----------|-------------------------| +| **Critical** | Treat as release / procurement blocker until remediated or explicitly accepted with signed risk decision.
| +| **Warning** | Remediate in sprint; document compensating controls if deferred. | +| **Note** | Track in transparency / policy backlog; often pairs with documentation updates. | + +## Escalation matrix + +| Finding domain | First line | Escalate to | +|----------------|------------|-------------| +| Vendor / cloud LLM disclosure | Product owner + engineering lead | Legal / compliance | +| External student-row API (`syntex`) | DevOps / SRE | CIO + compliance | +| Missing RBAC on student endpoints | Engineering | Security + registrar / data steward | +| Small-N cohort cells | Data science | IR / research compliance | + +## Optional: Presidio + +Microsoft Presidio adds NLP/regex breadth for **generic** PII in prose. It does **not** replace project-specific FERPA rules in `ferpa-config.yaml`. Install in venv if desired: + +```bash +./venv/bin/pip install presidio-analyzer +``` + +Extend Layer A with a subprocess if your institution requires Presidio-class scanning. + +## Slash command + +In Claude Code, `/ferpa-audit` points agents at `.claude/skills/ferpa-audit/SKILL.md` and this runbook. diff --git a/ferpa-config.yaml b/ferpa-config.yaml new file mode 100644 index 0000000..82defad --- /dev/null +++ b/ferpa-config.yaml @@ -0,0 +1,82 @@ +# FERPA audit configuration (repository root) +# Referenced by .claude/skills/ferpa-audit/scripts/* — keep stable keys for hash comparisons across runs. 
+ +version: 1 + +project: + dashboard_relative_path: codebenders-dashboard + analyze_route_glob: codebenders-dashboard/app/api/analyze/route.ts + execute_sql_route_glob: codebenders-dashboard/app/api/execute-sql/route.ts + ai_transparency_file: codebenders-dashboard/content/ai-transparency.ts + query_executor_file: codebenders-dashboard/lib/query-executor.ts + config_file: codebenders-dashboard/lib/config.ts + +# Columns that must not appear in SELECT lists, API JSON shapes, exports, or logs +# without an explicit same-line annotation: // FERPA-OK: +select_exclusions: + - Student_GUID + - student_guid + +# Demographic fields — raw values in small cells heighten re-identification risk (Layer B) +sensitive_demographics: + - Race + - Gender + +subpopulation_minimum_n: 10 + +# Postgres (Layer B) — override with DB_* env vars; see operations/db_config.py +database: + schema: public + predictions_table: student_level_with_predictions + small_n_dimensions: + - '"Race"' + - '"Gender"' + - '"Cohort"' + +# Third-party / non-institutional hosts (Layer A fetch classification) +external_data_hosts: + - hostname: schools.syntex-ai.com + note: "Analysis-ready student-row API; must be blocked when FORCE_DIRECT_DB=true" + +allowlisted_external_hosts: + - api.openai.com + - api.anthropic.com + +rbac: + header_name: x-user-role + # API routes (relative to dashboard) that return student-level or row-level education data + # and must enforce the role header per institutional policy. + # execute-sql and analyze are covered by dedicated Layer A checks (FERPA guard + vendor prompt). 
+ student_data_routes: + - app/api/students/route.ts + - app/api/students/[guid]/route.ts + - app/api/dashboard/kpis/route.ts + - app/api/dashboard/readiness/route.ts + - app/api/dashboard/retention-risk/route.ts + - app/api/dashboard/risk-alerts/route.ts + - app/api/query-history/route.ts + +taxonomy: + pii_column_substrings: + - GUID + - SSN + - social + - birth + - email + educational_record_tables: + - student_level_with_predictions + - course_enrollments + - student_predictions + - course_predictions + +audit_log: + # When set, Layer B will check for this table; if missing, emit informational Note only. + table_name: null + +llm: + # Substrings that mark an app/api route.ts as an LLM call site; Layer A checks each such route against ai_transparency_file + sdk_markers: + - "@ai-sdk/openai" + - "createOpenAI" + - "streamObject" + - "generateText" diff --git a/scripts/ferpa-audit.sh b/scripts/ferpa-audit.sh new file mode 100755 index 0000000..902cea3 --- /dev/null +++ b/scripts/ferpa-audit.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# Run full FERPA audit (Layer A + Layer B). Repository root = parent of scripts/. +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +TMP_JSON="$(mktemp)" +cleanup() { rm -f "$TMP_JSON"; } +trap cleanup EXIT + +cd "$ROOT/codebenders-dashboard" +npx tsx "$ROOT/.claude/skills/ferpa-audit/scripts/static-audit.ts" \ + --repo-root "$ROOT" \ + --out "$TMP_JSON" + +PYTHON="$ROOT/venv/bin/python" +if [[ ! -x "$PYTHON" ]]; then + PYTHON="python3" +fi + +"$PYTHON" "$ROOT/.claude/skills/ferpa-audit/scripts/db-audit.py" \ + --repo-root "$ROOT" \ + --static-json "$TMP_JSON" From e17ddd8ecda049d9393565ee5813e32f39fe586f Mon Sep 17 00:00:00 2001 From: William Hill Date: Sun, 3 May 2026 12:13:58 -0400 Subject: [PATCH 2/2] refactor(ferpa-audit): simplify static and db audit scripts Extract shared regulation strings and helpers, dedupe findings logic, split db-audit render/merge paths; behavior and 19 Layer A findings unchanged.
Co-authored-by: Cursor --- .../skills/ferpa-audit/scripts/db-audit.py | 381 ++++++++++-------- .../ferpa-audit/scripts/static-audit.ts | 114 +++--- 2 files changed, 266 insertions(+), 229 deletions(-) diff --git a/.claude/skills/ferpa-audit/scripts/db-audit.py b/.claude/skills/ferpa-audit/scripts/db-audit.py index 886a04b..36e3073 100644 --- a/.claude/skills/ferpa-audit/scripts/db-audit.py +++ b/.claude/skills/ferpa-audit/scripts/db-audit.py @@ -20,6 +20,48 @@ except ImportError: psycopg2 = None # type: ignore +# Regulatory hooks — keep literals identical to historical reports. +REG_LEI = "§99.31(a)(1)(i) — legitimate educational interest" +REG_DISCLOSURES = "§99.32 — record of disclosures" +REG_SMALL_N = "§99.35 — disclosure for research / statistical purposes" +REG_ED_RECORDS = "§99.3 — education records" + +SCHEMA_COLUMNS_SQL = """ + SELECT table_name, column_name, data_type + FROM information_schema.columns + WHERE table_schema = %s + ORDER BY table_name, ordinal_position +""" + +RLS_GAP_SQL = """ + SELECT c.relname AS table_name + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE n.nspname = %s + AND c.relkind = 'r' + AND EXISTS ( + SELECT 1 FROM information_schema.columns col + WHERE col.table_schema = %s + AND col.table_name = c.relname + AND ( + col.column_name ILIKE '%%guid%%' + OR col.column_name ILIKE '%%ssn%%' + OR col.column_name ILIKE '%%student%%' + ) + ) + AND NOT EXISTS ( + SELECT 1 FROM pg_policies p + WHERE p.schemaname = %s AND p.tablename = c.relname + ) +""" + +AUDIT_TABLE_EXISTS_SQL = """ + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_schema = %s AND table_name = %s + ) AS ok +""" + def load_db_config(repo_root: Path) -> dict: sys.path.insert(0, str(repo_root)) @@ -43,13 +85,33 @@ def load_db_config(repo_root: Path) -> dict: } -def md_escape(s: str) -> str: - return s.replace("|", "\\|") +def append_finding( + findings: list[dict], + *, + severity: str, + category: str, + file: str, + 
regulation: str, + title: str, + description: str, + remediation: str, +) -> None: + findings.append( + { + "severity": severity, + "category": category, + "file": file, + "regulation": regulation, + "title": title, + "description": description, + "remediation": remediation, + } + ) def finding_block(f: dict) -> list[str]: line = f.get("line") - loc = f['file'] + loc = f["file"] if line: loc = f"{loc}:{line}" return [ @@ -63,7 +125,7 @@ def finding_block(f: dict) -> list[str]: ] -def run_db_checks(conn, ferpa: dict, repo_root: Path) -> tuple[list[dict], str]: +def run_db_checks(conn, ferpa: dict) -> tuple[list[dict], str]: findings: list[dict] = [] schema = ferpa["database"]["schema"] table = ferpa["database"]["predictions_table"] @@ -74,111 +136,69 @@ def run_db_checks(conn, ferpa: dict, repo_root: Path) -> tuple[list[dict], str]: snap_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") with conn.cursor(cursor_factory=RealDictCursor) as cur: - cur.execute( - """ - SELECT table_name, column_name, data_type - FROM information_schema.columns - WHERE table_schema = %s - ORDER BY table_name, ordinal_position - """, - (schema,), - ) + cur.execute(SCHEMA_COLUMNS_SQL, (schema,)) rows = cur.fetchall() for r in rows[:500]: snapshot_lines.append( f"| {r['table_name']} | {r['column_name']} | {r['data_type']} |" ) - cur.execute( - """ - SELECT c.relname AS table_name - FROM pg_class c - JOIN pg_namespace n ON n.oid = c.relnamespace - WHERE n.nspname = %s - AND c.relkind = 'r' - AND EXISTS ( - SELECT 1 FROM information_schema.columns col - WHERE col.table_schema = %s - AND col.table_name = c.relname - AND ( - col.column_name ILIKE '%%guid%%' - OR col.column_name ILIKE '%%ssn%%' - OR col.column_name ILIKE '%%student%%' - ) - ) - AND NOT EXISTS ( - SELECT 1 FROM pg_policies p - WHERE p.schemaname = %s AND p.tablename = c.relname - ) - """, - (schema, schema, schema), - ) + cur.execute(RLS_GAP_SQL, (schema, schema, schema)) for r in cur.fetchall()[:15]: - 
findings.append( - { - "severity": "Warning", - "category": "rls_gap", - "file": f"postgres:{schema}.{r['table_name']}", - "regulation": "§99.31(a)(1)(i) — legitimate educational interest", - "title": "Table with likely student identifiers has no Postgres RLS policy", - "description": ( - "Row-level security is one technical control institutions use to ensure " - "database sessions cannot read beyond an authorized scope. This table appears " - "to hold student-linked columns but has no RLS policy in Postgres — reliance " - "may be entirely on the application tier." - ), - "remediation": ( - "Evaluate RLS or equivalent database-session scoping with your data steward; " - "document compensating controls if the app layer alone enforces access." - ), - } + append_finding( + findings, + severity="Warning", + category="rls_gap", + file=f"postgres:{schema}.{r['table_name']}", + regulation=REG_LEI, + title="Table with likely student identifiers has no Postgres RLS policy", + description=( + "Row-level security is one technical control institutions use to ensure " + "database sessions cannot read beyond an authorized scope. This table appears " + "to hold student-linked columns but has no RLS policy in Postgres — reliance " + "may be entirely on the application tier." + ), + remediation=( + "Evaluate RLS or equivalent database-session scoping with your data steward; " + "document compensating controls if the app layer alone enforces access." 
+ ), ) audit_table = ferpa.get("audit_log", {}).get("table_name") if audit_table: - cur.execute( - """ - SELECT EXISTS ( - SELECT 1 FROM information_schema.tables - WHERE table_schema = %s AND table_name = %s - ) AS ok - """, - (schema, audit_table), - ) + cur.execute(AUDIT_TABLE_EXISTS_SQL, (schema, audit_table)) if not cur.fetchone()["ok"]: - findings.append( - { - "severity": "Note", - "category": "audit_log_missing", - "file": "ferpa-config.yaml", - "regulation": "§99.32 — record of disclosures", - "title": "Configured audit log table not found in database", - "description": ( - "Institutional FERPA programs often require a retained record of certain " - "access or disclosures. The configured audit table is absent in this schema." - ), - "remediation": "Ship or attach the audit schema (#67) or clear the audit_log.table_name setting until available.", - } + append_finding( + findings, + severity="Note", + category="audit_log_missing", + file="ferpa-config.yaml", + regulation=REG_DISCLOSURES, + title="Configured audit log table not found in database", + description=( + "Institutional FERPA programs often require a retained record of certain " + "access or disclosures. The configured audit table is absent in this schema." + ), + remediation="Ship or attach the audit schema (#67) or clear the audit_log.table_name setting until available.", ) else: - findings.append( - { - "severity": "Note", - "category": "audit_log_not_configured", - "file": "ferpa-config.yaml", - "regulation": "§99.32 — record of disclosures", - "title": "Database audit-log cross-check skipped (not configured)", - "description": ( - "Layer B did not verify an institutional audit trail table. Read-time analytics " - "still benefit from logging who accessed which student-level exports." 
- ), - "remediation": "When #67 lands, set audit_log.table_name and re-run this audit.", - } + append_finding( + findings, + severity="Note", + category="audit_log_not_configured", + file="ferpa-config.yaml", + regulation=REG_DISCLOSURES, + title="Database audit-log cross-check skipped (not configured)", + description=( + "Layer B did not verify an institutional audit trail table. Read-time analytics " + "still benefit from logging who accessed which student-level exports." + ), + remediation="When #67 lands, set audit_log.table_name and re-run this audit.", ) try: dim_sql = ", ".join(dims) - sql = f""" + small_n_sql = f""" SELECT {dim_sql}, COUNT(*) AS n FROM {schema}.{table} GROUP BY {dim_sql} @@ -186,41 +206,38 @@ def run_db_checks(conn, ferpa: dict, repo_root: Path) -> tuple[list[dict], str]: ORDER BY n ASC LIMIT 50 """ - cur.execute(sql, (n_min,)) - small = cur.fetchall() - for row in small: - findings.append( - { - "severity": "Note", - "category": "small_n_cell", - "file": f"postgres:{schema}.{table}", - "regulation": "§99.35 — disclosure for research / statistical purposes", - "title": "Small subgroup cell in predictions table", - "description": ( - f"A demographic × cohort cell contains fewer than {n_min} rows " - f"({dict(row)}). Publishing or exporting such cells can increase " - "re-identification risk for students in minority subgroups." - ), - "remediation": ( - "Apply suppression, rounding, or aggregation thresholds (#109) before " - "display or export." - ), - } + cur.execute(small_n_sql, (n_min,)) + for row in cur.fetchall(): + append_finding( + findings, + severity="Note", + category="small_n_cell", + file=f"postgres:{schema}.{table}", + regulation=REG_SMALL_N, + title="Small subgroup cell in predictions table", + description=( + f"A demographic × cohort cell contains fewer than {n_min} rows " + f"({dict(row)}). Publishing or exporting such cells can increase " + "re-identification risk for students in minority subgroups." 
+ ), + remediation=( + "Apply suppression, rounding, or aggregation thresholds (#109) before " + "display or export." + ), ) except Exception as e: - findings.append( - { - "severity": "Note", - "category": "small_n_skipped", - "file": f"postgres:{schema}.{table}", - "regulation": "§99.35 — disclosure for research / statistical purposes", - "title": "Small-N cohort query could not run", - "description": f"The predictions table or columns may differ in this environment: {e}", - "remediation": "Align ferpa-config.yaml database.predictions_table and small_n_dimensions with the live schema.", - } + append_finding( + findings, + severity="Note", + category="small_n_skipped", + file=f"postgres:{schema}.{table}", + regulation=REG_SMALL_N, + title="Small-N cohort query could not run", + description=f"The predictions table or columns may differ in this environment: {e}", + remediation="Align ferpa-config.yaml database.predictions_table and small_n_dimensions with the live schema.", ) - header = f"| Table | Column | Type |\n|------|--------|------|\n" + header = "| Table | Column | Type |\n|------|--------|------|\n" snapshot_md = header + "\n".join(snapshot_lines[:200]) if len(snapshot_lines) > 200: snapshot_md += f"\n\n_(truncated; {len(snapshot_lines)} total columns)_\n" @@ -229,64 +246,56 @@ def run_db_checks(conn, ferpa: dict, repo_root: Path) -> tuple[list[dict], str]: return findings, appendix_schema -def main() -> None: - ap = argparse.ArgumentParser() - ap.add_argument("--repo-root", required=True) - ap.add_argument("--static-json", required=True) - ap.add_argument("--out", default="") - ap.add_argument("--skip-db", action="store_true") - args = ap.parse_args() - - repo_root = Path(args.repo_root).resolve() - static_path = Path(args.static_json).resolve() - with open(static_path, encoding="utf8") as f: - static = json.load(f) - - config_path = repo_root / "ferpa-config.yaml" - with open(config_path, encoding="utf8") as f: - ferpa = yaml.safe_load(f) - +def 
merge_static_and_db( + repo_root: Path, + ferpa: dict, + static: dict, + skip_db: bool, +) -> tuple[list[dict], str]: all_findings: list[dict] = list(static.get("findings", [])) appendix_db = "" - if not args.skip_db and psycopg2: - cfg = load_db_config(repo_root) + if skip_db: + return all_findings, appendix_db + + if not psycopg2: + append_finding( + all_findings, + severity="Note", + category="db_driver_missing", + file="requirements.txt", + regulation=REG_ED_RECORDS, + title="psycopg2 not installed; Layer B skipped", + description="Install project Python requirements in venv to enable Postgres checks.", + remediation="./venv/bin/pip install -r requirements.txt", + ) + return all_findings, appendix_db + + cfg = load_db_config(repo_root) + try: + conn = psycopg2.connect(**cfg) try: - conn = psycopg2.connect(**cfg) - try: - db_findings, appendix_db = run_db_checks(conn, ferpa, repo_root) - all_findings.extend(db_findings) - finally: - conn.close() - except Exception as e: - all_findings.append( - { - "severity": "Note", - "category": "db_unreachable", - "file": "operations/db_config.py", - "regulation": "§99.3 — education records", - "title": "Layer B database checks skipped (connection failed)", - "description": f"Could not connect for live schema audit: {e}", - "remediation": "Run with DB_* env vars set, or use --skip-db for static-only reports.", - } - ) - elif not args.skip_db and not psycopg2: - all_findings.append( - { - "severity": "Note", - "category": "db_driver_missing", - "file": "requirements.txt", - "regulation": "§99.3 — education records", - "title": "psycopg2 not installed; Layer B skipped", - "description": "Install project Python requirements in venv to enable Postgres checks.", - "remediation": "./venv/bin/pip install -r requirements.txt", - } + db_findings, appendix_db = run_db_checks(conn, ferpa) + all_findings.extend(db_findings) + finally: + conn.close() + except Exception as e: + append_finding( + all_findings, + severity="Note", + 
category="db_unreachable", + file="operations/db_config.py", + regulation=REG_ED_RECORDS, + title="Layer B database checks skipped (connection failed)", + description=f"Could not connect for live schema audit: {e}", + remediation="Run with DB_* env vars set, or use --skip-db for static-only reports.", ) - date_s = datetime.now(timezone.utc).strftime("%Y-%m-%d") - out = Path(args.out) if args.out else repo_root / "docs" / f"ferpa-audit-{date_s}.md" + return all_findings, appendix_db + - by_sev = {"Critical": [], "Warning": [], "Note": []} +def render_report(static: dict, all_findings: list[dict], appendix_db: str, date_s: str) -> str: + by_sev: dict[str, list[dict]] = {"Critical": [], "Warning": [], "Note": []} for f in all_findings: by_sev.setdefault(f["severity"], []).append(f) @@ -336,8 +345,32 @@ def main() -> None: ] ) + return "\n".join(lines) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--repo-root", required=True) + ap.add_argument("--static-json", required=True) + ap.add_argument("--out", default="") + ap.add_argument("--skip-db", action="store_true") + args = ap.parse_args() + + repo_root = Path(args.repo_root).resolve() + static_path = Path(args.static_json).resolve() + with open(static_path, encoding="utf8") as f: + static = json.load(f) + + with open(repo_root / "ferpa-config.yaml", encoding="utf8") as f: + ferpa = yaml.safe_load(f) + + all_findings, appendix_db = merge_static_and_db(repo_root, ferpa, static, args.skip_db) + + date_s = datetime.now(timezone.utc).strftime("%Y-%m-%d") + out = Path(args.out) if args.out else repo_root / "docs" / f"ferpa-audit-{date_s}.md" + out.parent.mkdir(parents=True, exist_ok=True) - out.write_text("\n".join(lines), encoding="utf8") + out.write_text(render_report(static, all_findings, appendix_db, date_s), encoding="utf8") print(f"Wrote {out}") diff --git a/.claude/skills/ferpa-audit/scripts/static-audit.ts b/.claude/skills/ferpa-audit/scripts/static-audit.ts index 98a1740..f93c249 
100644 --- a/.claude/skills/ferpa-audit/scripts/static-audit.ts +++ b/.claude/skills/ferpa-audit/scripts/static-audit.ts @@ -39,6 +39,15 @@ interface FerpaConfig { llm: { sdk_markers: string[] } } +/** Repeated regulatory hooks — keep literals identical to historical reports. */ +const REG_LEI = "§99.31(a)(1)(i) — legitimate educational interest" +const REG_CONTRACTOR = "§99.31(a)(1)(ii)(A)(B) — contractor disclosure rules" +const REG_REDISCLOSURE = "§99.33 — limits on redisclosure" +const REG_RIGHTS = "§99.7 — policy and rights awareness" + +const SKIP_DIRS = new Set(["node_modules", ".next", "dist", ".git"]) +const CONSOLE_LEVELS = ["log", "debug", "info"] + function parseArgs(argv: string[]): { repoRoot: string; outPath: string } { let repoRoot = process.cwd() let outPath = "" @@ -58,8 +67,7 @@ function parseArgs(argv: string[]): { repoRoot: string; outPath: string } { function walkFiles(root: string, exts: Set): string[] { const out: string[] = [] - const skip = new Set(["node_modules", ".next", "dist", ".git"]) - function walk(dir: string) { + function walk(dir: string): void { let entries: fs.Dirent[] try { entries = fs.readdirSync(dir, { withFileTypes: true }) @@ -67,7 +75,7 @@ function walkFiles(root: string, exts: Set): string[] { return } for (const e of entries) { - if (skip.has(e.name)) continue + if (SKIP_DIRS.has(e.name)) continue const p = path.join(dir, e.name) if (e.isDirectory()) walk(p) else if (exts.has(path.extname(e.name))) out.push(p) @@ -78,20 +86,33 @@ function walkFiles(root: string, exts: Set): string[] { } function hashFile(filePath: string): string { - const h = crypto.createHash("sha256") - h.update(fs.readFileSync(filePath)) - return h.digest("hex").slice(0, 16) + return crypto.createHash("sha256").update(fs.readFileSync(filePath)).digest("hex").slice(0, 16) } function rel(repoRoot: string, abs: string): string { return path.relative(repoRoot, abs).split(path.sep).join("/") } -function add( - findings: Finding[], - f: Finding -): void 
{ - findings.push(f) +function escapeRegexChars(s: string): string { + return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") +} + +function routeMentionedInTransparency(apiPath: string, transparency: string): boolean { + if (transparency.includes(apiPath)) return true + if (apiPath.includes("analyze") && transparency.includes("analyze")) return true + if (apiPath.includes("query-summary") && transparency.includes("query-summary")) return true + if (apiPath.includes("explain-pairing") && transparency.includes("explain-pairing")) return true + return false +} + +function dedupeFindings(findings: Finding[]): Finding[] { + const seen = new Set() + return findings.filter((f) => { + const k = `${f.severity}|${f.category}|${f.file}|${f.line ?? 0}|${f.title}` + if (seen.has(k)) return false + seen.add(k) + return true + }) } /** Regex: excluded column appears in SELECT ... context in a TS string literal */ @@ -108,14 +129,14 @@ function scanSqlLiteralsForExclusions( const line = content.slice(0, m.index).split("\n").length if (!/\bfrom\b/i.test(chunk)) continue for (const col of exclusions) { - const word = new RegExp(`\\b${col.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}\\b`, "i") + const word = new RegExp(`\\b${escapeRegexChars(col)}\\b`, "i") if (word.test(chunk) && !/FERPA-OK:/i.test(chunk)) { - add(findings, { + findings.push({ severity: "Warning", category: "select_exclusion_literal", file: relPath, line, - regulation: "§99.31(a)(1)(i) — legitimate educational interest", + regulation: REG_LEI, title: "SQL text may select a restricted student identifier or field", description: `A string in this file contains a SELECT-style fragment that references "${col}". Under FERPA, releasing such fields in the wrong context can expose personally identifiable information from education records. 
Institutions should verify this string is never executed for end-user export without appropriate access control and minimization.`, @@ -139,7 +160,7 @@ function scanConsoleLeak( if ( ts.isCallExpression(node) && ts.isPropertyAccessExpression(node.expression) && - ["log", "debug", "info"].includes(node.expression.name.text) && + CONSOLE_LEVELS.includes(node.expression.name.text) && ts.isIdentifier(node.expression.expression) && node.expression.expression.text === "console" ) { @@ -149,12 +170,12 @@ function scanConsoleLeak( !/FERPA-OK:/i.test(node.getFullText(sourceFile)) ) { const pos = sourceFile.getLineAndCharacterOfPosition(node.getStart()) - add(findings, { + findings.push({ severity: isClientish ? "Warning" : "Note", category: "console_leak", file: relPath, line: pos.line + 1, - regulation: "§99.33 — limits on redisclosure", + regulation: REG_REDISCLOSURE, title: "Console logging may capture student-level query or response objects", description: "Browser or server consoles are not a controlled disclosure channel. Logging plans, results, or ambiguous large objects can place education-record-derived data where institutional access rules no longer apply (screenshots, remote debugging, third-party tooling).", @@ -171,9 +192,9 @@ function scanConsoleLeak( function apiUrlFromRouteFile(routeFile: string, dashRoot: string): string { const apiRoot = path.join(dashRoot, "app", "api") const dir = path.dirname(routeFile) - let rel = path.relative(apiRoot, dir) - if (rel.startsWith("..")) return "" - const segments = rel.split(path.sep).filter(Boolean) + let relToApi = path.relative(apiRoot, dir) + if (relToApi.startsWith("..")) return "" + const segments = relToApi.split(path.sep).filter(Boolean) const url = segments.map((s) => (s.startsWith("[") && s.endsWith("]") ? 
`:${s.slice(1, -1)}` : s)).join("/") return `/api/${url}` } @@ -196,11 +217,11 @@ function main(): void { if (fs.existsSync(executeSql)) { const ex = fs.readFileSync(executeSql, "utf8") if (!ex.includes("inspectSelectForFerpaExclusions")) { - add(findings, { + findings.push({ severity: "Critical", category: "ferpa_select_enforcement_gap", file: rel(repoRoot, executeSql), - regulation: "§99.31(a)(1)(i) — legitimate educational interest", + regulation: REG_LEI, title: "Arbitrary SQL execution path has no FERPA column guard", description: "The `/api/analyze` route applies a conservative SELECT-clause check (`inspectSelectForFerpaExclusions`) for columns such as Student_GUID, in addition to prompt instructions (#127). This endpoint executes whatever SQL the caller supplies with no equivalent guard — so the same identifier could still appear in results when queries bypass the analyzer (for example rule-based fallback → `/api/execute-sql`).", @@ -209,11 +230,11 @@ function main(): void { }) } if (!ex.includes(config.rbac.header_name)) { - add(findings, { + findings.push({ severity: "Warning", category: "rbac_gap", file: rel(repoRoot, executeSql), - regulation: "§99.31(a)(1)(i) — legitimate educational interest", + regulation: REG_LEI, title: "SQL execution API does not check institutional role header", description: "Without application-layer role checks, any caller who can reach this route may exercise the privileges of the database connection — a common mismatch with FERPA’s expectation that access to education records tracks legitimate educational interest.", @@ -227,11 +248,11 @@ function main(): void { if (fs.existsSync(analyzeRoute)) { const an = fs.readFileSync(analyzeRoute, "utf8") if (!an.includes(config.rbac.header_name)) { - add(findings, { + findings.push({ severity: "Warning", category: "rbac_gap", file: rel(repoRoot, analyzeRoute), - regulation: "§99.31(a)(1)(i) — legitimate educational interest", + regulation: REG_LEI, title: "LLM query planner route has no 
role header check", description: "This route sends schema metadata to a vendor model and returns executable SQL. Institutional policy usually ties such capability to specific staff roles; missing header checks increase the risk of over-broad access if the network perimeter is ever misconfigured.", @@ -240,11 +261,11 @@ function main(): void { }) } if (an.includes("Student_GUID") && an.includes("FERPA COMPLIANCE")) { - add(findings, { + findings.push({ severity: "Note", category: "vendor_schema_disclosure", file: rel(repoRoot, analyzeRoute), - regulation: "§99.31(a)(1)(ii)(A)(B) — contractor disclosure rules", + regulation: REG_CONTRACTOR, title: "Cloud LLM receives schema text that names the student identifier column", description: "Even when result rows are not sent, the prompt embeds column names and descriptions that reveal how individuals are keyed in the database. Vendors may log prompts for abuse monitoring; institutions should treat this as a controlled disclosure bounded by contract and the school-official framework.", @@ -260,11 +281,11 @@ function main(): void { const cfg = fs.readFileSync(cfgFile, "utf8") const qe = fs.readFileSync(qeFile, "utf8") if (cfg.includes("schools.syntex-ai.com") && /fetch\s*\(\s*url\s*\)/.test(qe)) { - add(findings, { + findings.push({ severity: "Warning", category: "external_student_data_host", file: rel(repoRoot, qeFile), - regulation: "§99.31(a)(1)(ii)(A)(B) — contractor disclosure rules", + regulation: REG_CONTRACTOR, title: "Query executor can fetch student-level rows from a non-institutional host", description: "When direct-database mode and FORCE_DIRECT_DB hardening are not in effect, the dashboard retrieves analysis-ready rows from a project-hosted API domain rather than from the institution’s Postgres deployment. 
That shifts custody of student-level payloads and may affect contractual and FERPA oversight expectations.", @@ -279,11 +300,11 @@ function main(): void { if (!fs.existsSync(abs)) continue const txt = fs.readFileSync(abs, "utf8") if (!txt.includes(config.rbac.header_name)) { - add(findings, { + findings.push({ severity: "Warning", category: "rbac_gap", file: rel(repoRoot, abs), - regulation: "§99.31(a)(1)(i) — legitimate educational interest", + regulation: REG_LEI, title: "Student-data API route omits configured role header check", description: "This route appears on the institutional student-data route list in ferpa-config.yaml but does not reference the configured RBAC header. Access to education records should follow role-based institutional policy.", @@ -299,30 +320,20 @@ function main(): void { for (const f of apiFiles) { if (!f.endsWith(`${path.sep}route.ts`)) continue const t = fs.readFileSync(f, "utf8") - if (markers.some((m) => t.includes(m))) { - llmRoutes.push(f) - } + if (markers.some((m) => t.includes(m))) llmRoutes.push(f) } const transparencyPath = path.join(repoRoot, config.project.ai_transparency_file) - const transparency = fs.existsSync(transparencyPath) - ? fs.readFileSync(transparencyPath, "utf8") - : "" + const transparency = fs.existsSync(transparencyPath) ? 
fs.readFileSync(transparencyPath, "utf8") : "" for (const f of llmRoutes) { const apiPath = apiUrlFromRouteFile(f, dashRoot) - const mentioned = - Boolean(apiPath) && - (transparency.includes(apiPath) || - (apiPath.includes("analyze") && transparency.includes("analyze")) || - (apiPath.includes("query-summary") && transparency.includes("query-summary")) || - (apiPath.includes("explain-pairing") && transparency.includes("explain-pairing"))) - if (apiPath && !mentioned) { - add(findings, { + if (apiPath && !routeMentionedInTransparency(apiPath, transparency)) { + findings.push({ severity: "Note", category: "ai_transparency_drift", file: rel(repoRoot, f), - regulation: "§99.7 — policy and rights awareness", + regulation: REG_RIGHTS, title: "LLM call site may be missing from the AI transparency inventory", description: "Institutions increasingly publish AI transparency pages for procurement. An undeployed or undocumented model route creates a gap between what legal teams believe is running and what code can execute.", @@ -346,7 +357,6 @@ function main(): void { } } - const compilerOptions: ts.CompilerOptions = { target: ts.ScriptTarget.ES2022, allowJs: true } for (const f of walkFiles(path.join(dashRoot, "app"), new Set([".tsx", ".ts"]))) { const content = fs.readFileSync(f, "utf8") const sf = ts.createSourceFile(f, content, ts.ScriptTarget.ES2022, true, f.endsWith(".tsx") ? ts.ScriptKind.TSX : ts.ScriptKind.TS) @@ -354,13 +364,7 @@ function main(): void { scanConsoleLeak(ts, sf, rel(repoRoot, f), findings, clientish) } - const seen = new Set() - const deduped = findings.filter((f) => { - const k = `${f.severity}|${f.category}|${f.file}|${f.line ?? 0}|${f.title}` - if (seen.has(k)) return false - seen.add(k) - return true - }) + const deduped = dedupeFindings(findings) const payload = { layer: "A",