diff --git a/README.md b/README.md index 5a82102..aafb673 100644 --- a/README.md +++ b/README.md @@ -90,3 +90,36 @@ The pipeline should treat **export formats** as first-class requirements so the 6. Define **student model** constraints (context length, tool set) and a **filter + eval** plan for teacher-to-student parity before production swap. +--- + +## PII residual-risk audit + +This repository now includes a lightweight audit utility for checking whether redacted JSONL training data still contains common residual PII patterns. + +### Currently detected patterns + +- email addresses +- 10-digit phone numbers +- API-key-like strings +- token-like strings + +### Run the sample audit + +```bash +python training_setup_logs/pii_audit.py +``` + +### Run tests + +```bash +pytest -q +``` + +### Audit output + +The generated audit report includes: + +- number of rows scanned +- suspected rows containing residual PII +- line-level issue summaries +- overall residual-risk classification \ No newline at end of file diff --git a/examples/redacted_sample.jsonl b/examples/redacted_sample.jsonl new file mode 100644 index 0000000..25b5f47 --- /dev/null +++ b/examples/redacted_sample.jsonl @@ -0,0 +1,4 @@ +{"text": "User email has been replaced with [EMAIL]"} +{"text": "Call me maybe at [PHONE]"} +{"text": "No sensitive information here"} +{"text": "Oops leaked email john@example.com"} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bb6e0fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.pytest.ini_options] +pythonpath = ["."] +testpaths = ["tests"] \ No newline at end of file diff --git a/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc b/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc new file mode 100644 index 0000000..d071988 Binary files /dev/null and b/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc differ diff --git a/tests/test_pii_audit.py 
"""Residual-PII audit for redacted JSONL training data.

Scans every row of a JSON Lines file for patterns that commonly survive a
redaction pass (e-mail addresses, 10-digit phone numbers, API-key-like and
token-like strings) and summarises the result as a small report dict.
"""

import json
import re
from pathlib import Path


# Regexes keyed by the label reported in findings.
# NOTE: in "[a-zA-Z0-9-.]" and "[A-Za-z0-9-_]" the "-" that follows the
# "0-9" range cannot start another range, so it is matched literally.
PII_PATTERNS = {
    "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
    "phone": r"\b\d{10}\b",
    "api_key": r"sk-[A-Za-z0-9]{20,}",
    "token": r"token\s*[:=]\s*[A-Za-z0-9-_]+",
}


def detect_pii(text):
    """Return one finding per PII pattern that occurs in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list of ``{"type": <pattern label>, "matches_found": <count>}``
        dicts, in ``PII_PATTERNS`` order; empty when nothing matches.
    """
    findings = []

    for pii_type, pattern in PII_PATTERNS.items():
        match_count = len(re.findall(pattern, text))

        if match_count:
            findings.append(
                {
                    "type": pii_type,
                    "matches_found": match_count,
                }
            )

    return findings


def _iter_text_fragments(value):
    """Yield every scannable piece of text inside a decoded JSON value.

    Strings are yielded as-is (already unescaped), dict keys and values and
    list items are walked recursively, and numbers are yielded in their
    decimal form so that e.g. a phone number stored as an integer is still
    seen. Booleans and ``None`` carry no text and are skipped.
    """
    if isinstance(value, str):
        yield value
    elif isinstance(value, dict):
        for key, item in value.items():
            yield key  # keys of a decoded JSON object are always strings
            yield from _iter_text_fragments(item)
    elif isinstance(value, list):
        for item in value:
            yield from _iter_text_fragments(item)
    elif isinstance(value, (int, float)) and not isinstance(value, bool):
        yield str(value)


def _risk_level(suspected_rows):
    """Map the number of suspected rows to "low", "medium" or "high"."""
    if suspected_rows == 0:
        return "low"
    if suspected_rows < 3:
        return "medium"
    return "high"


def audit_jsonl(file_path):
    """Audit a JSONL file for residual PII and return a summary report.

    Each non-blank line is decoded as JSON and the *decoded* text is
    scanned. Scanning the re-serialised JSON instead would turn escapes such
    as a newline into the two characters ``\\`` and ``n``; the ``n`` then
    sits directly against the following digits and hides a phone number
    from the ``\\b`` word-boundary check.

    Args:
        file_path: Path (``str`` or ``Path``) of the UTF-8 JSONL file.

    Returns:
        A dict with the keys ``rows_scanned`` (non-blank rows read),
        ``suspected_pii`` (rows with at least one finding), ``findings``
        (``{"line": <1-based physical line>, "issues": [...]}`` per
        suspected row) and ``risk_level`` ("low" for 0 suspected rows,
        "medium" for 1-2, "high" for 3 or more).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON; the
            message names the file and the offending line.
        OSError: If the file cannot be opened.
    """
    report = {
        "rows_scanned": 0,
        "suspected_pii": 0,
        "findings": [],
    }

    path = Path(file_path)

    with path.open("r", encoding="utf-8") as file:
        for line_number, raw_line in enumerate(file, start=1):
            # Blank lines are not rows: skip them instead of crashing in
            # json.loads or inflating the row count.
            if not raw_line.strip():
                continue

            report["rows_scanned"] += 1

            try:
                row = json.loads(raw_line)
            except json.JSONDecodeError as exc:
                # Same exception type as before, but say where it happened.
                raise json.JSONDecodeError(
                    f"{path} line {line_number}: {exc.msg}",
                    exc.doc,
                    exc.pos,
                ) from exc

            # Newline-separated so fragments cannot merge into one match.
            text = "\n".join(_iter_text_fragments(row))
            findings = detect_pii(text)

            if findings:
                report["suspected_pii"] += 1
                report["findings"].append(
                    {
                        "line": line_number,
                        "issues": findings,
                    }
                )

    report["risk_level"] = _risk_level(report["suspected_pii"])

    return report


if __name__ == "__main__":
    # Sample run; the path is relative to the current working directory.
    report = audit_jsonl("examples/redacted_sample.jsonl")

    print(json.dumps(report, indent=2))


# ---------------------------------------------------------------------------
# Tests carried by the same patch (tests/test_pii_audit.py). detect_pii is
# defined above, so no package import is needed here.
# ---------------------------------------------------------------------------


def test_email_detection():
    text = "contact me at test@example.com"

    findings = detect_pii(text)

    assert len(findings) > 0


def test_clean_text():
    text = "all pii has been removed"

    findings = detect_pii(text)

    assert findings == []