OpenAgriNet · Keerti707 · May 13, 2026
diff --git a/README.md b/README.md
@@ -90,3 +90,36 @@ The pipeline should treat **export formats** as first-class requirements so the
 6. Define **student model** constraints (context length, tool set) and a **filter + eval** plan for teacher-to-student parity before production swap.
 
 
+---
+
+## PII residual-risk audit
+
+This repository now includes a lightweight audit utility for checking whether redacted JSONL training data still contains common residual PII patterns.
+
+### Currently detected patterns
+
+- email addresses
+- 10-digit phone numbers
+- API-key-like strings (`sk-` followed by 20 or more alphanumeric characters)
+- token-like strings (`token:` or `token=` followed by a value)
+
+### Run the sample audit
+
+```bash
+python training_setup_logs/pii_audit.py
+```
+
+### Run tests
+
+```bash
+pytest -q
+```
+
+### Audit output
+
+The generated audit report includes:
+
+- number of rows scanned
+- number of rows suspected of containing residual PII
+- line-level issue summaries
+- overall residual-risk classification (`low`, `medium`, or `high`)
diff --git a/examples/redacted_sample.jsonl b/examples/redacted_sample.jsonl
@@ -0,0 +1,4 @@
+{"text": "User email has been replaced with [EMAIL]"}
+{"text": "Call me maybe at [PHONE]"}
+{"text": "No sensitive information here"}
+{"text": "Oops leaked email john@example.com"}
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,3 @@
+[tool.pytest.ini_options]
+pythonpath = ["."]
+testpaths = ["tests"]
diff --git a/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc b/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc
diff --git a/tests/test_pii_audit.py b/tests/test_pii_audit.py
@@ -0,0 +1,17 @@
+from training_setup_logs.pii_audit import detect_pii
+
+
+def test_email_detection():
+    """A raw email address must be reported as exactly one ``email`` finding."""
+    findings = detect_pii("contact me at test@example.com")
+    # Pin type and count: a bare non-empty check would also pass if some
+    # other pattern fired or the address were mis-split into several hits.
+    assert findings == [{"type": "email", "matches_found": 1}]
+
+
+def test_clean_text():
+    """Text with no PII-like substrings yields an empty findings list."""
+    clean_sentence = "all pii has been removed"
+
+    result = detect_pii(clean_sentence)
+    assert result == []
diff --git a/training_setup_logs/__init__.py b/training_setup_logs/__init__.py
diff --git a/training_setup_logs/__pycache__/__init__.cpython-314.pyc b/training_setup_logs/__pycache__/__init__.cpython-314.pyc
diff --git a/training_setup_logs/__pycache__/pii_audit.cpython-314.pyc b/training_setup_logs/__pycache__/pii_audit.cpython-314.pyc
diff --git a/training_setup_logs/pii_audit.py b/training_setup_logs/pii_audit.py
@@ -0,0 +1,73 @@
+import json
+import re
+from pathlib import Path
+
+
+PII_PATTERNS = {  # PII label -> regex applied by detect_pii via re.findall
+    "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
+    "phone": r"\b\d{10}\b",  # a standalone run of exactly ten digits
+    "api_key": r"sk-[A-Za-z0-9]{20,}",  # "sk-" plus 20+ alphanumerics
+    "token": r"(?i)token\s*[:=]\s*[A-Za-z0-9-_]+",  # any case: TOKEN=, Token:
+}
+
+
+def detect_pii(text):
+    """Scan ``text`` against every regex in ``PII_PATTERNS``.
+
+    Returns one ``{"type": ..., "matches_found": ...}`` dict per pattern that
+    matched at least once, in ``PII_PATTERNS`` order; ``[]`` if none matched.
+    """
+    hit_counts = (
+        (label, len(re.findall(regex, text)))
+        for label, regex in PII_PATTERNS.items()
+    )
+    return [
+        {"type": label, "matches_found": count}
+        for label, count in hit_counts
+        if count
+    ]
+
+
+def audit_jsonl(file_path):
+    """Audit a JSONL file for residual PII and summarise what was found.
+
+    Each non-blank line is parsed as JSON, re-serialised (so keys and values
+    are both covered) and scanned with ``detect_pii``. Blank lines are
+    skipped and not counted; malformed JSON raises ``json.JSONDecodeError``.
+
+    Returns a dict with ``rows_scanned``, ``suspected_pii`` (rows with at
+    least one finding), ``findings`` (``{"line", "issues"}`` per flagged row,
+    1-based file line numbers) and ``risk_level``: ``"low"`` for 0 flagged
+    rows, ``"medium"`` for 1-2, ``"high"`` for 3 or more.
+    """
+    report = {"rows_scanned": 0, "suspected_pii": 0, "findings": []}
+
+    with Path(file_path).open("r", encoding="utf-8") as file:
+        for line_number, line in enumerate(file, start=1):
+            if not line.strip():
+                continue  # blank lines are not rows; json.loads would raise
+            report["rows_scanned"] += 1
+            # Keep non-ASCII literal: default \uXXXX escapes fuse hex digits
+            # onto adjacent numbers and hide non-ASCII digits from the regexes.
+            text = json.dumps(json.loads(line), ensure_ascii=False)
+            findings = detect_pii(text)
+            if findings:
+                report["suspected_pii"] += 1
+                report["findings"].append(
+                    {"line": line_number, "issues": findings}
+                )
+
+    if report["suspected_pii"] == 0:
+        report["risk_level"] = "low"
+    elif report["suspected_pii"] < 3:
+        report["risk_level"] = "medium"
+    else:
+        report["risk_level"] = "high"
+
+    return report
+
+
+if __name__ == "__main__":
+    # Audit the bundled sample file and pretty-print the report as JSON.
+    sample_report = audit_jsonl("examples/redacted_sample.jsonl")
+    print(json.dumps(sample_report, indent=2))