From ee6f541f586ed371a6a75ba637be418d9608f523 Mon Sep 17 00:00:00 2001 From: NomzzNJS Date: Sat, 16 May 2026 21:37:07 +0530 Subject: [PATCH] added PII redaction prototype, canonical schema, and sample log export --- README.md | 17 +++ pipeline/README.md | 24 ++++ pipeline/__init__.py | 1 + pipeline/__pycache__/__init__.cpython-314.pyc | Bin 0 -> 291 bytes pipeline/__pycache__/cli.cpython-314.pyc | Bin 0 -> 5215 bytes .../__pycache__/normalize.cpython-314.pyc | Bin 0 -> 6442 bytes .../__pycache__/pii_redactor.cpython-314.pyc | Bin 0 -> 6580 bytes pipeline/cli.py | 79 ++++++++++++ pipeline/normalize.py | 113 ++++++++++++++++ pipeline/pii_redactor.py | 122 ++++++++++++++++++ pipeline/pii_taxonomy.yaml | 27 ++++ pipeline/schema.yaml | 19 +++ sample_logs/example.jsonl | 9 ++ 13 files changed, 411 insertions(+) create mode 100644 pipeline/README.md create mode 100644 pipeline/__init__.py create mode 100644 pipeline/__pycache__/__init__.cpython-314.pyc create mode 100644 pipeline/__pycache__/cli.cpython-314.pyc create mode 100644 pipeline/__pycache__/normalize.cpython-314.pyc create mode 100644 pipeline/__pycache__/pii_redactor.cpython-314.pyc create mode 100644 pipeline/cli.py create mode 100644 pipeline/normalize.py create mode 100644 pipeline/pii_redactor.py create mode 100644 pipeline/pii_taxonomy.yaml create mode 100644 pipeline/schema.yaml create mode 100644 sample_logs/example.jsonl diff --git a/README.md b/README.md index 5a82102..c487e99 100644 --- a/README.md +++ b/README.md @@ -89,4 +89,21 @@ The pipeline should treat **export formats** as first-class requirements so the 5. Specify **one SFT JSONL schema** and **one DPO JSONL schema** (and chat template) validated end-to-end with a **LoRA** dry run and a **small DPO** dry run on toy data. 6. Define **student model** constraints (context length, tool set) and a **filter + eval** plan for teacher-to-student parity before production swap. +--- + +## Prototype contribution + +A minimal privacy-preserving prototype has been added in `pipeline/`: + +- `pipeline/pii_redactor.py`: rule-based detection and placeholder replacement for email, phone, URL token, UUID, credit card, SSN, and IP address patterns. +- `pipeline/normalize.py`: canonical event normalization for user, assistant, tool, and tool_result log entries. +- `pipeline/cli.py`: sample runner that audits PII, applies redaction, and exports `sft_ready.jsonl` and `dpo_ready.jsonl`. +- `sample_logs/example.jsonl`: representative log sample containing both Q&A and agentic tool workflow traces. + +Run the prototype from the repo root: + +```bash +python pipeline/cli.py sample_logs/example.jsonl output +``` + diff --git a/pipeline/README.md b/pipeline/README.md new file mode 100644 index 0000000..e89a9d6 --- /dev/null +++ b/pipeline/README.md @@ -0,0 +1,24 @@ +# PII Redaction Pipeline + +This pipeline demonstrates a privacy-preserving preprocessing flow for sample production logs. + +## What it does + +- Normalizes heterogeneous log entries into a canonical event schema. +- Detects and redacts PII categories such as emails, phone numbers, URLs with tokens, credit card numbers, and UUIDs. +- Produces SFT-ready JSONL and DPO-ready JSONL exports from redacted events. + +## Usage + +From the repository root: + +```bash +cd training_setup_logs +python pipeline/cli.py sample_logs/example.jsonl output +``` + +This writes: + +- `output/sft_ready.jsonl` +- `output/dpo_ready.jsonl` +- `output/redacted_events.jsonl` diff --git a/pipeline/__init__.py b/pipeline/__init__.py new file mode 100644 index 0000000..d51d6c9 --- /dev/null +++ b/pipeline/__init__.py @@ -0,0 +1 @@ +"""Pipeline package for log normalization and PII sanitization.""" diff --git a/pipeline/__pycache__/__init__.cpython-314.pyc b/pipeline/__pycache__/__init__.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c64e450838caa721162ad931e1c0896303ebc1d0 GIT binary patch literal 291 zcmXv}OG*SW5KUCH5%dfNt_E=qAcNp%Gcf2dx(pPdySmAkNrfc6a8};IQ+TCz=>>$j zfC+ThqN?7bs@|*X%Zqc7gTAWmQ?9=SOd-eSP)0t*MkKM3S7LUwU#yKMYaD^s^}Fr} zS`J{j1IJ;ft=a2j*nxHptd~oO+L<(x^XHiy&JY`}5x>Quhz$MXt-4>Wl&28cX{*+O zw^XO7UOYK?WYC{r?6Q?$WfWa6KWNRgBZzekfjI-=c&di3e zs@km-wfMJPFLTiEb0a9PUvXZ146T2Ymr`Mijfa1RMh;y-A~S>x>&X}lyt6~> zAZKtbgvJf=gEqr9XgBPG4#Pod{E&0dWw-|2hI`Oscqq*_fsVVL}gsvAJ6ej}qM<+!Y+F5 ztP)SAr{YS=z;j{NbW0goHR2cbbjtK#MV1oQOM-!o$&VBV{5beMoQ5igqUZwK^DeLs zv}SacnPx;L20d00*;whiQH(E9WA<{(S!#8$hnh-lgeu#eY#X}HeM8nUc@0i(+Oc9} zaO%1Z6Gkvu4ckaJryCfPZQzpZljMwsPWWy$wZ7iiiII_*reHXol#Gqx^hG6M=&>`J zk`l(#8JtjJ6IfDHYHCV=ThWCML6;^K!6*^alfq0|R+4(GM2hRch(_*Ek7=r=B-NA> zOC;5ZHfMU{aVeEb8=n$(CL1jetx#Ha)W8zGP&2aIuT4`*Q ztwIAQ5z@FQH%8XnR!Ry;`U%gZOi>vR??x!fjFAi9R<`6iR+j75ki96%br*RpJzLOl zKSD0F2Y^QYx{I8&zz%IlKnGEIZ#YXcQrQ-?8@15C2^evV+l^Ev%$p2;Y|9vv9X;0> z)2XBqX&J;g8n&5y8sux*KrvL!v`ZSi$tF)#+?GsBvL5Df0GKo#SV^R@tef^}DJ27L zcp^=mXwyDH5O6ISlQ8P%z$VCy@8sH0*oOVE zrFD}$PS5JN3f@O?HPm4S*TBOE1F%8KPgXl=JKzNnQ^E^qAKFR3MV6em zG-^jwF@UPW6jkgPv6S0Ntc;5>qB`3OL5OG*?WG+=$DxW56_|kwqnL4u@VA8-(fL!D zAtQ>IaZP~5cW4p@>#qZ5-T?;^Prqq0v$zp<#l#v9fF1QJDLsRgxTGgk)nweGVY_L! zrkS>xH)R#$W*BPPX0d80xP`Xm%#2n-7T!meYbA7!Xa+?TK*~athfXU(>Ie*?8j*lT z4i$LkABKN7e47ug@}Wg}c_Pn;*7+klgRG{0zOUd7ULLtLa<%s3rjMGI`fe!ejnCgS zHX8eGdHWx5sJ4MZ3vdblU%A4Mbr0(xm@It%JdO_gfTN#84Y}|N zPZA{VB6zi%c#`0?{*~I5#AjX0)1U6yXz!t27E&tx1Q7y7$_g}cC|2%^qLQ9z4EPr| zN;LYHM5ALx+YHvfY{xY|%8O5UYmuRW3|Px%2&ABwtMuhZVK*84b+Hc-)rhh?VJ*s+ z_y8XPev0UO6e0nz^={>D(Al^#r-6|+(&?n>A$1%pdM0TQot`6>JfbMc3_KPw>^a3=f+%Z;Zk&D#5 zmVGTAK>I=ax)niCOIy^qAGF7|it%=^%$4Jb*teB(OvA(o@M(cahPc#irKEcr8zU-1 zOO!2BD`FyfglIvG_&HicM~pZ-TDEZFMWXXa88vVo4p0TgUE?cex&STiO0>W%574~Z zXg;9%pQIhJmv{rTL@czN_EfAL*y@iF3v2PWbs}4=zD%{1#zq*;9wW|xmZ+@?z2eBQ z5tLyE0N-!TidB_dh|L7u#TYRNxOR-P!=O7!+&7+D@jEjWFFsJ%L@E zv0cSYNMuR0dRq}ghsqLzmF=auE$Am_);%%P`G3r;e`2PsV&;~#8!Dt-mI27Ev4ZX_ z-6znSc0QjMRZ39Y)dn}#LT^n);oaG@VkRX>f(|)bQYa-Ff{_-=_KIdKhXk6yby}&L zFg7#@!3Jg_>V}X-MmhBLzozAkog6T{^{xk(EW_h9GxBwA{WF)4u?W1YB zhFn@xO$o-dQev4NCW4;40&|hZM)ze}&VAB4lgHl?%lz{AoQ*MUkx$ zh!85MAjQ$+#BiB>Mpv+D*XMM&89WH2aKLm*cnYE|T`~P9@Kk07Jl`03#im!1!66lC zG!r)6bR;2K?g@kAM3`7=)23lHWtd!2NnrvdOd?TCLR3tmF8mTyVHb_;$l^R*qZf## z-})*rsisIyrJIgo%M5HgKcwH2hRG-<2REe?`%By>Emo|@&p>*OUxg0pZLWYj0Ke%A z-S)MA)7pdNr?)+~(SuC^_v)|xu< zjgfrhRNghuE}T1a*VDWhU-NY2-H}}1eP`p> z4g1%;7d{Lu9A1un`eOe0`PGBxFQ1)1`(a?!djXCa+PieHP`i7{o`3pOzGZmR##IM5 zIh)V_(B^cxH#xV*p6mOr0oke+_zwc_1@f(38&zF5#@G3yclp4N(T;9Z9liPDIzN1$ zcg+vH-?gA$=~+0CZ~nz+H8;lp(s09_?>+sc{Ll11()rUD^6vO8{`LQUyBqo93{5fr z_H1Av?EJgi4z=lw$BE?P@o+5$cZaMXAL1-yI;p;4sh%B71E>f6`a(a((9bevuw<PL{n7&82Rj!##C!9O>HMzQY9rJ7u6raQc9X3UshfuG!nR2I_7*H zzR_fo%5f|KjRYJzLDDA2Fw7kkxPv_ZLJjv&>ndvf4?4Gs&fP;tAKBpx%%eJzem?R; g2>qyzgg%?24C1}>*>$e*Yu@?$0Z1VDCc4xA0IIYqj{pDw literal 0 HcmV?d00001 diff --git a/pipeline/__pycache__/normalize.cpython-314.pyc b/pipeline/__pycache__/normalize.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88310f1fc8336c83278a8b3ef607a0255086669e GIT binary patch literal 6442 zcmcIoO>7&-6`tiTmy}3~q^Mtu7AY!~#n_fyOR=2Vm8wXV75{ss(00t2*^n!mE=97t zE89wcA_P9jZ2{9L3Ms9P(2WDjK!ECy06yg40x6I~(2}5j7V83W+f#0G)YM2%eKXvp zC|Zi078!uU**9PkBR4ejVe1VM~^V2HbpO{pc z!B|*UnIkbt4pUHpe>k!5qv5wILpFs1=$v8i6@(%nbEA{gd5WWgP*X@j-BA=|bXk{S z#}>Vez~OJB-Dvu|(7xPevv0POrp|30_$j zCda(v$%qh_g1RUqLHDT;)()m(DIp$92*CvIHy(Ra=txbf)=-E~B$6^O3!#u`17D2r zlb(lc3Oz>KsJcfuyD@sZ^|q90zjhR=+;(Lit0ou znyW1Ir1^um%F?wJpgBt$_`w;3`y>j&dy3?3q0u1c=#y4L&rf)0nc|jJ$WK>89$q5N>Rj-hc#h4&@V+lFw<-K7( zkxazGeB3KsgetETJ}->(9cm@{49SxzL1n~b9P$ZC5Y-A^lHi=<6SB%mlaefqs}*uG z84o4+anQj<$Z;#vMB>MZxT&dfBNdf%(|D01l3)R1-e$a4Tah5fLt{ zrW8LJPx6soyfMTYoVg&oOR02Jko`u{j70z-NOa;1F9Wr*ct%Aed>*xVA&FL8J#^O0)L*Zkb?2RJQ-?mc)GGGsYnQHGN?-idThokUv8VU^ zx>srRTs@@NoY~qP3$`6f`;K|bLi@gVn(jAXal`Dug3XiZy3=#3XKv4(zCZL4{fCY2 zckAD&&(zH|<{P`F4?S%5P90QCj`YBSsb1N#bvAahW7?Evui6!pE$zDUb1WOXxm}l4 zr^5@TM#baH>|6A7g4B{TyLDBN?z#EXqHq&kd%Ek&TT07wnbYrY1zlUt+^m?K>5&Cf zgVNlRdH!Y+q?I{ylVYk$w_f>$($qYA>E?bAnsVlb6&6+0ro%JQ>(SZx+`u0X-aVM@ zK9D_lI=|ygzV21U>`IT!oVtE$_H4e^ub7?bu9=?eJ+m)n!gr##qI1!F+n&6uM={r= zFW}mq6$@Lebfdxgt@J|GwqljGhZMnnn- zD|HkQ1|ttam;%;dp06SiqJ~7+W=Lf)po+D)Xw@(-gke%_!YW&F=Ea#0G8HgWlK7|~ ziOu-a38Wh{_M|u&iVKOTd|s_O&r76zXaXy)W5{^Jbir?RFJwLl+!Pu`Lx6)nadPXX ziZkG?ei=;}VL?`F6ATxk7w8xT_X>=h#|q$zhd`XXi~I)F(6LLU1PPyuZ5nn~?8l_! za!Tw~pL+wxF(i%2AuVzp`{PNVBvNljQR^7AmGIEN0PnIbujK;4>`5+j}cA+j6dL zbL_lvKEAkXAnzKS9(Z!Z@QPryN@b0TW2RSX8`P% zJzY6Z*Zkr8)p^h0iivSn|C?rQRe+pTu7b&9E|$1_K=y9XBP2D77rVy;dPL#9|gw_TaQdrgYf{cia8;T)Ob ze9LR^hHr#djC5_~ipgYUSCGlbep!h;zB1VgmK7W2n_nxdUxPXAo`*2m!>eTVFOYqQ zYyhkm(zoFSr3WNOf18&0AcK{T83kF@s#z59Ss+9Ru@HAcg`e7-M~pzd1mFiDd5oGd zkAUn69s#nCBcF7CMt?T)srk>ni@n?m)>}Arb|DyA;3L^!Wbu?h+7=>T%9YZ{P3H+4j6UK6ve^ofDYM_jp+m4!)1xHayEUQ zfj-!Pgcvw`fZ-f)SH?5~?+%gzMM4NPVol1Nz$CmZ=`=$3j;kqYK|M>0PPnDR?yP;Y zU#AX(CywuwV6<%TST&;n0W&aaC2(Dt=&*Bz#M4D|x5TC`(DWL^#mU2>hgY zfwI&!OdWqvN?9^(3zl}p+O%x-<*dHU(A?1c%l9VkpIPiXzT7vG>l?|AzM2i5%lCz{ zt*whH+gznty`F=jz`o@wq{HOvG2P>+ozINv7nMd06tr}q(R}oXiP7ML#gV)C%I2+f9 zR%ZK>Wv61@vTSY1Sz9vSo2yv#?Oyin$@%uoU%V&geJ?Fp2Ob$w^%i)1aMWCzygE7C zoVT}64U@-+O*x7q-Km%z>9#et(sW}aIYl7vfjcigw!?=|PQ?ozB@cV&QH4t5FIr{EvqzfM#OgziL_G ACjbBd literal 0 HcmV?d00001 diff --git a/pipeline/__pycache__/pii_redactor.cpython-314.pyc b/pipeline/__pycache__/pii_redactor.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e4ed5ad7d93f5e12aa12dd971da7b53d59f87d4 GIT binary patch literal 6580 zcmcIoYit|GwVvfJA0lay7WJ?s+FJP$o0de`qMW$0ELjnyP}b8cnQkN6EJdwl#xzOn zE*+b(Q3fqgIW20Z5n#7)dle+NwSWLuKN7%31KhUv;@+aaWXrLbZ2$*t{{-j{xk8%c zU(cB(mz4ZSTc87Qb{=PD&YYR^op0DwzR5};-G9sS%J0hw`4d(O$(b*#{gojkOHL7i z86XON4FgQTpcn$I!Ui~n3m6q+z@(T0X2sk<1ok8uD~v771>Z&@aP2S?CZ9ntK2^$8 zB4vK6l%+(f?5R@L5-HnLrR*hAo1Q9F4m}l`{-t@lL94-@98VqV{MxabpE|apsGZlM zSOr#a31*>EFbY*dnNTg*gc@O!utjjdZ?jO_YEf)LT@4ZHYe?Lw*abI~x6<+^VH=bi zXt`YAq1;Hz4q>OT9cnwU7H1Tv!0aHNU7z8`Mm#2!?MYlzIe#pwsFndmlp<4cQRRkW zvZ5L%(kZAIk0)Zuglg;??im=$Jin(q<_S$r_O!elY02zs`Gz;VtEuH=(~*)EsD`}x z`pMVYnlFc@R9mmm=O35|_x1?>OigHNPj^RXdXjHBxwqT%+J0a2{^wE3I2ITk>B|i5 z>Av7WiKgz*^lST@eU~Rga?8nXPd7I1fws2h_RAs9rF_YwbwkYu?dvKqDLXzslBo(! zVfW=0971!LU`pkVA0O~%Mqxv+l}O9!o|gWTuN}PHQd({QXR7<#aN4rt!ccf(bg*wE zGZdOSqbMo4yQ8&r@>EEM4Z|jsMS{;u8gr0%+{4Lb)to^3M+SI?48ABx}xavdABDVlv((#?Sm{!g5;Fp;P(ibhBh}j z#++dUCImfN5)8$1t6>LOU_V1wQ#nPvpr|G=N?J;I7?qV32|lLNN$Fffk#Vi<>c!rU zP|wg%C?!gAG7*V~#-!vcVpNesqbV`Lk0;YoR18f>kys*@nBn8e8JTb4<;ZD~R|<{E z)BL&Qv>2B|g{H6!i>AUT9ZJPgVmy`*p%4p8;&ddcBqeX^qG}I^BZ)*(i6~+?EY*TK zo$!@v$gxOql6+btlw6+GlgxU%Vn2hiPyKrk$P{@)Ak}9!{lk{&VDMDo=HKQ=JHry>ft zdqU<rjdV2c;qeK2a zAw1kO=HaADSYNehb`qymOBBpAlawy1=E5SV@N+!ROIuLXriq4eNeT9<8PN-cEcw*o z$_5^8ZM|!$owej#b=MEQdFbYMZe94O;o$8(%dXxxERR^SZQrLhS9Vmwy|253CTKu| ztz{wmx=R=W6Kknh@~tyy=`}6N6iXYV+2X;pvS4;ja>4wnNlrx);Yj{FB~?MAhn39O z$Ho$h%1tGcaStQaK#O+Ash#NkqM08%_Cto3jAfpGgxSqq@3QZX&s%=hx{B5J4}9qQ zFnW8-2i>&#zX}!7Or&oVrob{a5#C%hQU0be^xJo${or0g4l%o-7t;*fRX1rMCj@LS zwC*L~v&_f>@tCBYASLaBOf}0&L{ea%CGA1(Nl+3b z1qK4L-zOwx3>wD}o;IZRd_8Fj;r3`c#2n*}lIaHRf&@)IfOudTmY~5&3iAE-;-#4Z zZQW$hMgW!s!*C_SlfVu;8_AQ$7BoXEVbVg$;Bi8NhOsiOtu!Ck*<}O3-jgIw92kSu zH>Oj%4Sa^enhj*|iUy#A@00*3mw7>;v5AlHao|Wi#F1%Hf#4%fQ&cTL9}nRwdPbD_ zSb`#}_c8qUff{S*gPqKF@Gp!4{hQ$cm{1|2l5s;3(Jzp!5tITzp=JaBJHW;m$paHC zFw}JxSk>U&r*g6=CLl-G0)qqWW;N6;>zol2;)RsdrPiO0fkQ%Y^LECQ(MVkG^6H(( zV2oT3*_WSzQcD&7J9UedbBE_E=am&#^OCFiS9M>lanS$WS~oM%4*=kXifX|rV-QiK z7qH4kfL59!DG12Zs&Ph2rc)lX7AVYl)2LR|G)xthco-x#;z%RyrnAWC%M>s8BibJ5 zDyX>hB6Lw7dIJhyLfqK38?a)}-t195cvQJps&?J2+BHA@psH!MFX!5Fz2nV}w>oG2 z4;>XhkniKxmMi>M<*V`=dTqtwzvu9O?A(&GSFhN&-nDPNF@4{@BWHJA9l0`cBk;?! zZ=YRkd?&F~f8c@r`86Ap*^#1~7ETdOh3D`u@gjf}8ba9Q)GJ!tU?iOE zS;+ED=+_yEg>gJ0kj;&+;^*c#trfKdS(NQoMsfQVe$mcA+~YIJcXGbhs#j z$&U3i=#cT)41oD=h(S+s)&hh$$#LemA?ytooPnNw`Z8f*a?Mht`S1V%4R8&VMFgW!v4Lj%!PjA>=) zi6QT57Q|>;l4Iw^_(fi$JrpMZ7LJaOjKTY2{A zS8HbCe2Mw$kB&AO59K$unip$Lza^C~HgM|*9l|{I9))WAk4JN%gp3tKR`D)}U_LE9 zBeEAsoMvDS`0N?Y4>{_GB>^8z{VcbPLcgqW+%HHCJ`^CE>s`gqO__e)^S@vDe)L1% z;+YS&ebjt}N@%XClN6W)?FA;`*8T#m5gU#BhJVmuKfr4a9HGQG5chwPcd9;!{Rp`m zQqiqJ_J`mWrl9Fr4x6-vtc58ya0Hvq5!ip4BRp|z!6|h}0?Po$K#stUZ7OI{{EpiS zBY-0qo+Ke~K}IkJIRSU+D2^M8M20eXw2t=UY zkctr44>yMb--QXo7(O62q!7LxX!_Rcx-9_I0n`{f5zIleowTvT%!IkHe$ftt@M5P= z#f>`SV~wW57=31l@c{hD5@=d?L*OYuHT@~r_nDfWcpEgc1j`^VYvT%Ql;{P^74!;% zZXRpK-$l#z-dw&r1#hyM=vNE)~a;P3gE`0Brtib+1OhlIzr zj(bT5p+((FwFo0_KY@3h#X&{hRUNqV^5VqY<@t$Q?JG6=mTLC>>gBJb4iK8?mqy{E zQKlZ8nDMI2q{^K1luP~CLti1t%A`Y>At94GF*}S|7iKSE7Jy7O!6O_=nzg8%gv3fM z6BFrk@Q0uxdh8Vb=(2gDmza^IBRB_-84@=MRgnfU8-fg?RbpDaAPr*!YCv&bd;Q@H zFwf|u6WEWLhL6BLzSPKEKno}N$dqgL&RQQf`hbtk26DChO0DN^t>@N`2esa7#$1j2 zdgjf{TbHhJ4_(!7`R=3fma7BTeb;?I)@v)Sz&%&s<4SkVS-0YByz6Y7-+AA;JLlYT z-SMUao|!i$<~wd(xLxz>UH{+oo2K7I{>QsidF*ejVRHr0yt-DPdE47pD~NN;tYclM z%hhfD56@3LKX1BnbhbBVue>^XW%TCJ`Skx)E>h*cfMu!=mYy0OjG~d zr;eHplYCOYbAJ5iT~`KysOnQFx6UsfTTCy`+&;F{(!K0H^1%KgjCb#)<8%6dAn+F% z@oZKNlBk-Liz)aEM~hAvlO&{P#W$&hOiwjOljl+~_;c4f@Y2Yr(AV2D-bZ=7v<;<^ zbkH{irP&&9pa-=VJMdqZJRR?ZKhe^0u}k_kw87aC8Qpi4WfOOty)Q0wcgyaZ1L*N1?il9a*Z{VSyzolkS`}?<+^;+vZYy< zcjTLM_EYQSEA@`aoW1<&@Ri{;Ba8EyU_NK*e4blv|8;ob@UrC_`m9y@;C&^t4lsMG z4NIhA^<{<_?X%y?aW>30U+d0sWwS5jI4j&w&T*|KUv*ydu5K=K8M2Pgx4>V5pF82N Vn$PX<$p738eEf^*0u!KC{W~QX0|5X4 literal 0 HcmV?d00001 diff --git a/pipeline/cli.py b/pipeline/cli.py new file mode 100644 index 0000000..2607cc1 --- /dev/null +++ b/pipeline/cli.py @@ -0,0 +1,79 @@ +import argparse +import json +from pathlib import Path +from typing import Any, Dict, List + +from pipeline.normalize import build_dpo_example, build_sft_example, normalize_log_entry +from pipeline.pii_redactor import audit_json, redact_json + + +def read_jsonl(path: Path) -> List[Dict[str, Any]]: + records: List[Dict[str, Any]] = [] + with path.open("r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + + +def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + for row in rows: + handle.write(json.dumps(row, ensure_ascii=False) + "\n") + + +def load_and_process(path: Path) -> List[Dict[str, Any]]: + raw_entries = read_jsonl(path) + events: List[Dict[str, Any]] = [] + for raw in raw_entries: + normalized = normalize_log_entry(raw) + if normalized["event_type"] == "tool" and "tool_result" in normalized["payload"]: + normalized["event_type"] = "tool_result" + events.append(normalized) + return events + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Run a small pipeline to normalize logs, audit/redact PII, and emit SFT/DPO JSONL." + ) + parser.add_argument("input", type=Path, help="Input JSONL sample log path") + parser.add_argument("output_dir", type=Path, help="Output directory for redacted data") + args = parser.parse_args() + + events = load_and_process(args.input) + + audit_findings = audit_json(events) + if audit_findings: + print(f"Detected {len(audit_findings)} PII items before redaction.") + for finding in audit_findings: + print(f"- {finding['path']}: {finding['category']} -> {finding['match']}") + else: + print("No PII detected in the sample input.") + + redacted_events = redact_json(events) + sft = [build_sft_example(redacted_events)] + dpo = [ + build_dpo_example( + [e for e in redacted_events if e["event_type"] in {"user", "assistant", "system"}], + [ + { + "event_type": "assistant", + "payload": {"text": "I cannot help with that request."}, + } + ], + ) + ] + + write_jsonl(args.output_dir / "sft_ready.jsonl", sft) + write_jsonl(args.output_dir / "dpo_ready.jsonl", dpo) + write_jsonl(args.output_dir / "redacted_events.jsonl", redacted_events) + + print(f"Wrote {len(sft)} SFT example(s) and {len(dpo)} DPO example(s) to {args.output_dir}") + + +if __name__ == "__main__": + main() diff --git a/pipeline/normalize.py b/pipeline/normalize.py new file mode 100644 index 0000000..410a498 --- /dev/null +++ b/pipeline/normalize.py @@ -0,0 +1,113 @@ +import json +from typing import Any, Dict, List + + +def normalize_log_entry(raw: Dict[str, Any]) -> Dict[str, Any]: + """Normalize heterogeneous log entries into a canonical event schema.""" + event_type = raw.get("event_type") + if not event_type: + role = raw.get("role") + if role == "user": + event_type = "user" + elif role in {"assistant", "system"}: + event_type = "assistant" if role == "assistant" else "system" + elif "tool_name" in raw or raw.get("tool"): + event_type = "tool" + else: + event_type = raw.get("type", "unknown") + + payload: Dict[str, Any] = {} + if "content" in raw: + payload["text"] = raw["content"] + if "role" in raw: + payload["role"] = raw["role"] + if "tool_name" in raw: + payload["tool_name"] = raw["tool_name"] + if "tool" in raw: + payload["tool_name"] = raw["tool"] + if "tool_args" in raw: + payload["tool_args"] = raw["tool_args"] + if "result" in raw: + payload["tool_result"] = raw["result"] + if "metadata" in raw: + payload["metadata"] = raw["metadata"] + if "error" in raw: + payload["error"] = raw["error"] + + normalized = { + "session_id": raw.get("session_id", raw.get("conversation_id", "unknown-session")), + "timestamp": raw.get("timestamp"), + "turn_id": raw.get("turn_id", raw.get("index")), + "event_type": event_type, + "payload": payload, + } + return normalized + + +def build_sft_example(events: List[Dict[str, Any]]) -> Dict[str, Any]: + """Build a simple SFT chat example from normalized events.""" + messages = [] + for event in events: + if event["event_type"] in {"user", "assistant", "system"}: + role = event["payload"].get("role", event["event_type"]) + text = event["payload"].get("text", "") + if text: + messages.append({"role": role, "content": text}) + elif event["event_type"] == "tool": + messages.append( + { + "role": "tool", + "name": event["payload"].get("tool_name", "unknown_tool"), + "content": event["payload"].get("tool_args", ""), + } + ) + elif event["event_type"] == "tool_result": + messages.append( + { + "role": "assistant", + "name": event["payload"].get("tool_name", "unknown_tool"), + "content": event["payload"].get("tool_result", ""), + } + ) + + return { + "session_id": events[0].get("session_id", "unknown-session") if events else "unknown-session", + "messages": messages, + "metadata": { + "trajectory_length": len(events), + "has_tool_usage": any(e["event_type"] in {"tool", "tool_result"} for e in events), + }, + } + + +def build_dpo_example(events: List[Dict[str, Any]], rejected: List[Dict[str, Any]]) -> Dict[str, Any]: + """Build a DPO-style prompt/completion pair from event trajectories.""" + prompt_messages = [] + for event in events: + if event["event_type"] == "user": + prompt_messages.append({"role": "user", "content": event["payload"].get("text", "")}) + elif event["event_type"] == "assistant" and event["payload"].get("role") == "system": + prompt_messages.append({"role": "system", "content": event["payload"].get("text", "")}) + + chosen = "\n".join( + event["payload"].get("text", "") + for event in events + if event["event_type"] == "assistant" + ) + rejected_texts = [] + for event in rejected: + if event["event_type"] == "assistant": + rejected_texts.append(event["payload"].get("text", "")) + rejected = "\n".join(rejected_texts) + + return { + "session_id": events[0].get("session_id", "unknown-session") if events else "unknown-session", + "prompt": prompt_messages, + "chosen": chosen, + "rejected": rejected, + "metadata": { + "prefix_length": len(prompt_messages), + "chosen_length": len(chosen.split()), + "rejected_length": len(rejected.split()), + }, + } diff --git a/pipeline/pii_redactor.py b/pipeline/pii_redactor.py new file mode 100644 index 0000000..db3d7ab --- /dev/null +++ b/pipeline/pii_redactor.py @@ -0,0 +1,122 @@ +import re +from typing import Any, Dict, Iterable, List, Tuple, Union + +REDACTION_PATTERNS = [ + ( + "EMAIL", + re.compile(r"(?i)\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"), + ), + ( + "CREDIT_CARD", + re.compile(r"\b(?:\d[ -]*?){13,16}\b"), + ), + ( + "PHONE", + re.compile( + r"(?x)(?:\+?\d{1,3}[\s-]?)?(?:\(\d{2,4}\)|\d{2,4})[\s-]?\d{3,4}[\s-]?\d{3,4}\b" + ), + ), + ( + "SSN", + re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), + ), + ( + "UUID", + re.compile( + r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b" + ), + ), + ( + "URL_TOKEN", + re.compile( + r"\bhttps?://[^\s]*?(?:token|secret|api[_-]?key|access[_-]?token|auth)[^\s]*\b", + re.IGNORECASE, + ), + ), + ( + "URL", + re.compile(r"\bhttps?://[^\s]+\b", re.IGNORECASE), + ), + ( + "IP_ADDRESS", + re.compile( + r"\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b" + ), + ), +] + +PLACEHOLDER_MAP = { + "EMAIL": "", + "PHONE": "", + "CREDIT_CARD": "", + "SSN": "", + "UUID": "", + "URL_TOKEN": "", + "URL": "", + "IP_ADDRESS": "", +} + + +def redact_text(text: str) -> str: + """Redact known PII patterns from a text string.""" + redacted = text + for category, pattern in REDACTION_PATTERNS: + redacted = pattern.sub(PLACEHOLDER_MAP[category], redacted) + return redacted + + +def spans_overlap(span_a: Tuple[int, int], span_b: Tuple[int, int]) -> bool: + start_a, end_a = span_a + start_b, end_b = span_b + return not (end_a <= start_b or end_b <= start_a) + + +def find_pii(text: str) -> List[Tuple[str, str]]: + """Return a list of detected PII category matches in text.""" + matches: List[Tuple[str, str]] = [] + seen_spans: List[Tuple[int, int]] = [] + for category, pattern in REDACTION_PATTERNS: + for match in pattern.finditer(text): + span = match.span() + if any(spans_overlap(span, seen) for seen in seen_spans): + continue + seen_spans.append(span) + matches.append((category, match.group(0))) + return matches + + +def redact_json(obj: Any) -> Any: + """Recursively redact PII in JSON-like objects.""" + if isinstance(obj, str): + return redact_text(obj) + if isinstance(obj, dict): + return {key: redact_json(value) for key, value in obj.items()} + if isinstance(obj, list): + return [redact_json(item) for item in obj] + return obj + + +def audit_json(obj: Any, path: str = "") -> List[Dict[str, Any]]: + """Collect detected PII items in a nested JSON object for audit review.""" + findings: List[Dict[str, Any]] = [] + if isinstance(obj, str): + seen_spans: List[Tuple[int, int]] = [] + for category, pattern in REDACTION_PATTERNS: + for match in pattern.finditer(obj): + span = match.span() + if any(spans_overlap(span, seen) for seen in seen_spans): + continue + seen_spans.append(span) + findings.append( + {"path": path, "category": category, "match": match.group(0)} + ) + return findings + if isinstance(obj, dict): + for key, value in obj.items(): + findings.extend(audit_json(value, f"{path}.{key}" if path else key)) + return findings + if isinstance(obj, list): + for index, item in enumerate(obj): + findings.extend(audit_json(item, f"{path}[{index}]") ) + return findings + return findings diff --git a/pipeline/pii_taxonomy.yaml b/pipeline/pii_taxonomy.yaml new file mode 100644 index 0000000..3d5a919 --- /dev/null +++ b/pipeline/pii_taxonomy.yaml @@ -0,0 +1,27 @@ +pii_taxonomy: + description: "List of PII categories detected and replaced by the pipeline." + categories: + - name: EMAIL + description: "Email addresses or email-like identifiers." + example: "user@example.com" + - name: PHONE + description: "Phone numbers or voice contact strings." + example: "+1-555-123-4567" + - name: CREDIT_CARD + description: "Credit/debit card numbers or card-like digit sequences." + example: "4111 1111 1111 1111" + - name: SSN + description: "Social Security numbers or national ID numbers in common numeric format." + example: "123-45-6789" + - name: UUID + description: "Standard UUID/GUID values used for tokens, request IDs, or session IDs." + example: "3f8f9b8e-4d49-4e00-8c1d-1a4e1dbfe9e4" + - name: URL_TOKEN + description: "URLs containing auth tokens, API keys, or secrets in the query string or path." + example: "https://api.example.com/data?api_key=abcd1234" + - name: URL + description: "Generic HTTP(S) URLs that may contain sensitive endpoints or links." + example: "https://example.com/resource" + - name: IP_ADDRESS + description: "IPv4 addresses found in logs or runtime traces." + example: "192.168.1.100" diff --git a/pipeline/schema.yaml b/pipeline/schema.yaml new file mode 100644 index 0000000..0b1c080 --- /dev/null +++ b/pipeline/schema.yaml @@ -0,0 +1,19 @@ +canonical_event_schema: + description: "A normalized event schema for training data extraction." + fields: + session_id: + type: string + description: "Identifier for the user session or conversation." + timestamp: + type: string + description: "ISO-8601 or event timestamp from the source log." + turn_id: + type: string + description: "Optional ordinal or unique turn identifier within the session." + event_type: + type: string + description: "Normalized event type, e.g. user, assistant, tool, tool_result, system, error." + enum: [user, assistant, system, tool, tool_result, error, unknown] + payload: + type: object + description: "Event-specific data such as text content, tool name, tool args, results, metadata, or error details." diff --git a/sample_logs/example.jsonl b/sample_logs/example.jsonl new file mode 100644 index 0000000..14bc6f1 --- /dev/null +++ b/sample_logs/example.jsonl @@ -0,0 +1,9 @@ +{"timestamp": "2026-05-16T10:00:00Z", "session_id": "sess-123", "turn_id": 1, "role": "user", "content": "Hi, my name is Alex Rivera. Please find the nearest urgent care at 123 Main St and call me at +1-555-123-4567. Also email me at alex.rivera@example.com."} +{"timestamp": "2026-05-16T10:00:02Z", "session_id": "sess-123", "turn_id": 2, "role": "assistant", "content": "Checking the nearest urgent care locations now."} +{"timestamp": "2026-05-16T10:00:03Z", "session_id": "sess-123", "turn_id": 3, "tool": "geo_search", "tool_args": "query=urgent care near 123 Main St"} +{"timestamp": "2026-05-16T10:00:04Z", "session_id": "sess-123", "turn_id": 4, "tool_name": "geo_search", "result": "Found urgent care clinic at 125 Main St, 0.3 miles away."} +{"timestamp": "2026-05-16T10:00:05Z", "session_id": "sess-123", "turn_id": 5, "role": "assistant", "content": "I found an urgent care clinic at 125 Main St, 0.3 miles from your location."} +{"timestamp": "2026-05-16T10:05:00Z", "session_id": "sess-234", "turn_id": 1, "role": "user", "content": "Please transfer the invoice to finance@company.com and verify the transaction reference 4111 1111 1111 1111."} +{"timestamp": "2026-05-16T10:05:05Z", "session_id": "sess-234", "turn_id": 2, "tool": "payment_api", "tool_args": "POST https://api.payments.example.com/charge?api_key=secret_token_ABC123"} +{"timestamp": "2026-05-16T10:05:06Z", "session_id": "sess-234", "turn_id": 3, "tool_name": "payment_api", "result": "Payment accepted, confirmation id 3f8f9b8e-4d49-4e00-8c1d-1a4e1dbfe9e4."} +{"timestamp": "2026-05-16T10:05:07Z", "session_id": "sess-234", "turn_id": 4, "role": "assistant", "content": "The payment has been processed and the confirmation id is available."}