From 0bba7fb59ed385e5ed863b5ad3a410a8ff6024d4 Mon Sep 17 00:00:00 2001 From: Keerti Gupta <24bsm032@iiitdmj.ac.in> Date: Wed, 13 May 2026 17:54:14 +0530 Subject: [PATCH] feat: add residual PII audit workflow --- README.md | 33 ++++++++ examples/redacted_sample.jsonl | 4 + pyproject.toml | 3 + ...est_pii_audit.cpython-314-pytest-9.0.3.pyc | Bin 0 -> 2449 bytes tests/test_pii_audit.py | 17 ++++ training_setup_logs/__init__.py | 0 .../__pycache__/__init__.cpython-314.pyc | Bin 0 -> 184 bytes .../__pycache__/pii_audit.cpython-314.pyc | Bin 0 -> 2337 bytes training_setup_logs/pii_audit.py | 73 ++++++++++++++++++ 9 files changed, 130 insertions(+) create mode 100644 examples/redacted_sample.jsonl create mode 100644 pyproject.toml create mode 100644 tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc create mode 100644 tests/test_pii_audit.py create mode 100644 training_setup_logs/__init__.py create mode 100644 training_setup_logs/__pycache__/__init__.cpython-314.pyc create mode 100644 training_setup_logs/__pycache__/pii_audit.cpython-314.pyc create mode 100644 training_setup_logs/pii_audit.py diff --git a/README.md b/README.md index 5a82102..aafb673 100644 --- a/README.md +++ b/README.md @@ -90,3 +90,36 @@ The pipeline should treat **export formats** as first-class requirements so the 6. Define **student model** constraints (context length, tool set) and a **filter + eval** plan for teacher-to-student parity before production swap. +--- + +## PII residual-risk audit + +This repository now includes a lightweight audit utility for checking whether redacted JSONL training data still contains common residual PII patterns. + +### Currently detected patterns + +- email addresses +- 10-digit phone numbers +- API-key-like strings +- token-like strings + +### Run the sample audit + +```bash +python training_setup_logs/pii_audit.py +``` + +### Run tests + +```bash +pytest -q +``` + +### Audit output + +The generated audit report includes: + +- number of rows scanned +- suspected rows containing residual PII +- line-level issue summaries +- overall residual-risk classification \ No newline at end of file diff --git a/examples/redacted_sample.jsonl b/examples/redacted_sample.jsonl new file mode 100644 index 0000000..25b5f47 --- /dev/null +++ b/examples/redacted_sample.jsonl @@ -0,0 +1,4 @@ +{"text": "User email has been replaced with [EMAIL]"} +{"text": "Call me maybe at [PHONE]"} +{"text": "No sensitive information here"} +{"text": "Oops leaked email john@example.com"} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..bb6e0fe --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.pytest.ini_options] +pythonpath = ["."] +testpaths = ["tests"] \ No newline at end of file diff --git a/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc b/tests/__pycache__/test_pii_audit.cpython-314-pytest-9.0.3.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d07198841b26b8348d034b3bf1fd7050c78462a2 GIT binary patch literal 2449 zcmb7`&2Jk;6u@WK>+dva)1uVHA!~(-E=6_|C&5*5)QVI}4=vUn+>6!K#-2C}>)p-l z7?L2Q9C`?X0|yQ{a)cxQNok=)dx1kQ+z{c!iT8Fr>va^-GID0#d-G=I&D(iD<@)rw z0)b{5%Et|fkS;2s$s^&U0m319M0DvM;Ziuto2j9^KsFVgUcedoUS>1PvkOE|%@Ll{ z73lN2v`Dn{AGpl}O$rJ<%4wGyo@Gs(Q-t{x{GEIYCms@=c%%cdM6FF$MzSY$&?{_h zDYnXw$nJ7G)p$(E?#4T#`MXkoo{%!>C?f-DWrQ>yD?~4}mAa%CA1>DAm65M5>2fD0 zyjo#?gcgkzvNAHab<)(NH)<8q5!sCIBVz5nd20Ri>#e^Jaq1~u`CqY~&dQmw zjgacDg1_STmgqB8!Qg zXN7G%R!BP|b{6A~q~~u*o_tGUAL@B{!&3)ubzO&>U9JwOYI2oRpWmi?=D@RQwd)Ro zl(&B)NZHguX(`BUTTah%`aY_4?A8vZP2Z=Ct5>gi`wcCeX<%7M>zZ^>Lu+003!je! zwHZ3z(EMv?S<-y1@P)b_8rM)^_@G!Cx}b2|+sB9ulLZ%yu4&r_qaK5}Jd@F23Z{*% zp=EQ+@q>)*f`=bmHvAoTX!k-_qi?%grfoRpfChQPH@7M75KP|*W42u9E@LhWE*je| z8<^and!B7NCWjd<8z`LaahAncig;Lb)8(YHnp9RICEDx#q;k{8UDT)r_iH=ufYv^t zlnrYeZg)7K4)<#xdDOYnXVwGCYn+*u1DQ5_%7>m|yM4cgxrlmXjbRRZ7O#5y!KE-s zbYNPx5wQruXHb3wOW*$W7}j^psMd&uc#3F1%WOLZw9Gx6Jw?m79$PUt#VTaq1T7;1 zjZZq0uVa#k2zwEM#_L`|u859f>bB1Ij|o(hC-|WtzuL+vs-!zJ;$DfJ&kNgl1gfM3 zO~eRP$pBH7^$ZXte{dPfRE6iM?wG#1MX94QI&eRwJ%+gti-}=2C%Ca1abp#@@sHeW z0yt)o4nPUD836+rqU9qt;EmZi?2x_;LuVK(Lo_mEA-f2jmSLAr#s|dSfG%07OUdG# zDBCq3zj{=t>{U(i5fh4f}Hag;I)s&y>

pQHdztPDqLg&-%m7v>-+84)R~5uu>`tBiieEDqF*O+j{KV;Foe ze+FWzF;}8gBSRQo_XiT593>?8!NBbeZMx3hg(>_u#)nr-{!B@d^cx|M$uDH(uS{B+ M|NDGex)Lt@7iF-|cK`qY literal 0 HcmV?d00001 diff --git a/tests/test_pii_audit.py b/tests/test_pii_audit.py new file mode 100644 index 0000000..28771cb --- /dev/null +++ b/tests/test_pii_audit.py @@ -0,0 +1,17 @@ +from training_setup_logs.pii_audit import detect_pii + + +def test_email_detection(): + text = "contact me at test@example.com" + + findings = detect_pii(text) + + assert len(findings) > 0 + + +def test_clean_text(): + text = "all pii has been removed" + + findings = detect_pii(text) + + assert findings == [] \ No newline at end of file diff --git a/training_setup_logs/__init__.py b/training_setup_logs/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/training_setup_logs/__pycache__/__init__.cpython-314.pyc b/training_setup_logs/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d74623bc2934c7c5936e4233513543b3d82b2da3 GIT binary patch literal 184 zcmdPq zjJ14+y(G+m*@<_OblKC3$jQO^+8OQWMVMK5U;GSH0SzOaNdQ$HF}ibTjMaEQI{yJe zHz5Z`PlBr}#o)dUj}W>xGFHo#&c(U@n!+MjZ%3CP`*9{l<2W=V6n98W!c!N=xq7Cf zjylzeE}cV2M908J4+=X7XWiB@@nsZiehHiMLN=$R!cO9iUA!pACa+#SCrw`=ylUZ$ zNgM_yt_4*~Dd{wED!Puf6k*d?3-e$H-?E56cU5iCB!aH0vTfU?E0%=~jksz~OdcZa z>w-bz(WPt#N0%`+a?z>md@h4E%Z$!|<;g`uoyJzwG89z{}hz}@k(xB1@8-I?9yqpKIn%{|{=SqpxbsPJr4;2)OvH9UhPo~*PWZ*yVh z1z1L}FO!pSkp(2#fTLg=Mo|_@X0N2;sD3rNBkp)z0aamH2hSNsW^#@kE2Da0>1=`~ z)E;s6IwLvjx=?c8)JpDLoRQEO#~H`a(Yh`|*O?*TD=x?89DTqh_DV8VZyv9dPzc5O zy@;LrbnO#8EG=(|`G2v3=7AH%8V^Q5Ims30B=B

P>q3KQ z@5E3W?MQWWq7Kw)KOat>VLOn@gd2$0$lf$%b6(LjoFYCmXX;SVIAzx~xI%<6@_f!( zh@679DPV0rOOXdOZ(53B5iYH2m^f9_%wcmHaGJeI+=gl{%W3=>1^{PpO3h^mx1=sE zg$08qfpA%X9uctyNHi1+6YhqY)d-)?Dk+ojsa!@kZA2L~(_xW_3u+q60P!W_G_anf zj%|*k8^l9{kips}Eh4g3&@%JT{JH>N#zUfP{W%wgP4f zXjf=iWy(TuwW%xwR^G3;xCUq8{NG;W^{}w!QtE-tH$yf)6I{PpqakkMtC$4spRysdqEj zU%c?NF?jb@+26AA<&M99ZFJi|wCV5P1wOy+@80o;H~it<)?+14xwWfw0vg2|Ezl^M z0?*n|_ryyUxd$pIkkC-fZo7_^LR+q$a%XSRS^RuMXs?2M zEqC_U_l4>{kGlG5qFql@;quFIunK4V*QBZZak6*H$^X*&7R7`KP znv%h?Yz$JC*9}#(j4;qYAfuJ)+P5jZoyn$hX?)fg2N`XA<^oI=mSLDj$nyw^zoXWQ zfOzk^?`)a#6q9Anx88Uh$aP<&BJzGm!Ta3bz%)Pa?qvd%X$EoLVt$Kl`_0$7-gbJ^ hH&HnM2PYPL?+tu4uo5p#Zn^s&a{bR-h&yV>@gL;W1z!LF literal 0 HcmV?d00001 diff --git a/training_setup_logs/pii_audit.py b/training_setup_logs/pii_audit.py new file mode 100644 index 0000000..ab6d5bf --- /dev/null +++ b/training_setup_logs/pii_audit.py @@ -0,0 +1,73 @@ +import json +import re +from pathlib import Path + + +PII_PATTERNS = { + "email": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", + "phone": r"\b\d{10}\b", + "api_key": r"sk-[A-Za-z0-9]{20,}", + "token": r"token\s*[:=]\s*[A-Za-z0-9-_]+", +} + + +def detect_pii(text): + findings = [] + + for pii_type, pattern in PII_PATTERNS.items(): + matches = re.findall(pattern, text) + + if matches: + findings.append( + { + "type": pii_type, + "matches_found": len(matches), + } + ) + + return findings + + +def audit_jsonl(file_path): + report = { + "rows_scanned": 0, + "suspected_pii": 0, + "findings": [], + } + + path = Path(file_path) + + with path.open("r", encoding="utf-8") as file: + for line_number, line in enumerate(file, start=1): + report["rows_scanned"] += 1 + + data = json.loads(line) + + text = json.dumps(data) + + findings = detect_pii(text) + + if findings: + report["suspected_pii"] += 1 + + report["findings"].append( + { + "line": line_number, + "issues": findings, + } + ) + + if report["suspected_pii"] == 0: + report["risk_level"] = "low" + elif report["suspected_pii"] < 3: + report["risk_level"] = "medium" + else: + report["risk_level"] = "high" + + return report + + +if __name__ == "__main__": + report = audit_jsonl("examples/redacted_sample.jsonl") + + print(json.dumps(report, indent=2)) \ No newline at end of file