From 9fed3597420a6cc4d23a0cb224d7753c430b5fce Mon Sep 17 00:00:00 2001 From: Aayush Pratap Singh Date: Wed, 13 May 2026 00:13:16 +0530 Subject: [PATCH] Add safe log-to-training demo pipeline --- DEMO.md | 25 +++++ out/dpo.jsonl | 2 + out/sft.jsonl | 2 + out/validation_report.json | 10 ++ samples/agent_logs.jsonl | 2 + schemas/training_records.schema.json | 11 +++ scripts/log_to_training_data.js | 139 +++++++++++++++++++++++++++ 7 files changed, 191 insertions(+) create mode 100644 DEMO.md create mode 100644 out/dpo.jsonl create mode 100644 out/sft.jsonl create mode 100644 out/validation_report.json create mode 100644 samples/agent_logs.jsonl create mode 100644 schemas/training_records.schema.json create mode 100644 scripts/log_to_training_data.js diff --git a/DEMO.md b/DEMO.md new file mode 100644 index 0000000..a48f753 --- /dev/null +++ b/DEMO.md @@ -0,0 +1,25 @@ +# Demo: safe logs-to-training pipeline + +This demo turns toy agent logs into privacy-stripped training artifacts. + +## Run + +```bash +node scripts/log_to_training_data.js samples/agent_logs.jsonl out +``` + +## Outputs + +- `out/sft.jsonl` — LoRA/SFT-style chat records with tool events preserved. +- `out/dpo.jsonl` — DPO-style prompt/chosen/rejected records for format validation. +- `out/validation_report.json` — PII counts and schema validation status. + +## What it demonstrates + +1. Normalizes mixed Q&A and agentic traces into message records. +2. Redacts emails, phone numbers, API-like secrets, and simple name patterns. +3. Preserves tool calls and tool observations as trajectory data. +4. Tags complexity using turns, tool count, tool names, and recovery signals. +5. Validates required SFT and DPO fields before writing artifacts. + +This is intentionally small and dependency-free so mentors can review the data flow quickly. Production implementation can replace the regex redactor with policy-specific PII detectors and extend schema checks into JSON Schema or trainer dry-runs. 
diff --git a/out/dpo.jsonl b/out/dpo.jsonl new file mode 100644 index 0000000..c16178d --- /dev/null +++ b/out/dpo.jsonl @@ -0,0 +1,2 @@ +{"id":"dpo-1","prompt":[{"role":"user","content":"Hi, I am [NAME]. Email me at [EMAIL] or call [PHONE]. My tomato leaves are yellowing."}],"chosen":[{"role":"assistant","content":"Check pH first, then EC, then root oxygenation. If pH is outside 5.8-6.5, correct that before adding nutrients."}],"rejected":[{"role":"assistant","content":"I cannot help with that."}],"metadata":{"source_session_id":"sess-001","domain":"agriculture","pii_redacted":{"email":1,"phone":1,"name":1},"complexity":{"turns":3,"tool_calls":1,"tools":["lookup_crop_guidance"],"has_error_recovery":false,"level":"agentic_simple"},"pair_source":"toy_negative_for_validation_only"}} +{"id":"dpo-2","prompt":[{"role":"user","content":"My API key is [SECRET] and the pump setup failed."}],"chosen":[{"role":"assistant","content":"Pump relay is not switching. Verify GPIO pin mapping and relay power, then retry setup."}],"rejected":[{"role":"assistant","content":"I cannot help with that."}],"metadata":{"source_session_id":"sess-002","domain":"setup","pii_redacted":{"api_key":1},"complexity":{"turns":3,"tool_calls":1,"tools":["read_device_state"],"has_error_recovery":true,"level":"agentic_recovery"},"pair_source":"toy_negative_for_validation_only"}} diff --git a/out/sft.jsonl b/out/sft.jsonl new file mode 100644 index 0000000..2306743 --- /dev/null +++ b/out/sft.jsonl @@ -0,0 +1,2 @@ +{"id":"sft-1","messages":[{"role":"system","content":"You are a safe agriculture support assistant. Use tools when needed and never expose private data."},{"role":"user","content":"Hi, I am [NAME]. Email me at [EMAIL] or call [PHONE]. My tomato leaves are yellowing."},{"role":"assistant","content":"Yellowing may indicate nitrogen deficiency, pH drift, or overwatering. 
I will check nutrient guidance."},{"role":"assistant","content":"{\"tool_name\":\"lookup_crop_guidance\",\"arguments\":{\"crop\":\"tomato\",\"symptom\":\"yellow leaves\"}}","name":"lookup_crop_guidance"},{"role":"tool","content":"Tomato yellow leaves: check pH 5.8-6.5, EC, nitrogen, root oxygenation.","name":"lookup_crop_guidance"},{"role":"assistant","content":"Check pH first, then EC, then root oxygenation. If pH is outside 5.8-6.5, correct that before adding nutrients."}],"metadata":{"source_session_id":"sess-001","domain":"agriculture","pii_redacted":{"email":1,"phone":1,"name":1},"complexity":{"turns":3,"tool_calls":1,"tools":["lookup_crop_guidance"],"has_error_recovery":false,"level":"agentic_simple"}}} +{"id":"sft-2","messages":[{"role":"system","content":"You are a safe agriculture support assistant. Use tools when needed and never expose private data."},{"role":"user","content":"My API key is [SECRET] and the pump setup failed."},{"role":"assistant","content":"I will inspect the setup state."},{"role":"assistant","content":"{\"tool_name\":\"read_device_state\",\"arguments\":{\"device_id\":\"hydro-controller-7\"}}","name":"read_device_state"},{"role":"tool","content":"relay_pump=false, tank_level=ok, last_error=GPIO timeout","name":"read_device_state"},{"role":"assistant","content":"Pump relay is not switching. 
Verify GPIO pin mapping and relay power, then retry setup."}],"metadata":{"source_session_id":"sess-002","domain":"setup","pii_redacted":{"api_key":1},"complexity":{"turns":3,"tool_calls":1,"tools":["read_device_state"],"has_error_recovery":true,"level":"agentic_recovery"}}} diff --git a/out/validation_report.json b/out/validation_report.json new file mode 100644 index 0000000..8dc8ab5 --- /dev/null +++ b/out/validation_report.json @@ -0,0 +1,10 @@ +{ + "input_sessions": 2, + "pii_counts": { + "email": 1, + "phone": 1, + "name": 1, + "api_key": 1 + }, + "validation_errors": [] +} \ No newline at end of file diff --git a/samples/agent_logs.jsonl b/samples/agent_logs.jsonl new file mode 100644 index 0000000..faa391f --- /dev/null +++ b/samples/agent_logs.jsonl @@ -0,0 +1,2 @@ +{"session_id":"sess-001","domain":"agriculture","events":[{"type":"user","timestamp":"2026-05-12T10:00:00Z","content":"Hi, I am Aayush. Email me at aayush@example.com or call +91 80098 54588. My tomato leaves are yellowing."},{"type":"assistant","timestamp":"2026-05-12T10:00:04Z","content":"Yellowing may indicate nitrogen deficiency, pH drift, or overwatering. I will check nutrient guidance."},{"type":"tool_call","timestamp":"2026-05-12T10:00:06Z","tool_name":"lookup_crop_guidance","arguments":{"crop":"tomato","symptom":"yellow leaves"}},{"type":"tool_result","timestamp":"2026-05-12T10:00:07Z","tool_name":"lookup_crop_guidance","content":"Tomato yellow leaves: check pH 5.8-6.5, EC, nitrogen, root oxygenation."},{"type":"assistant","timestamp":"2026-05-12T10:00:12Z","content":"Check pH first, then EC, then root oxygenation. 
If pH is outside 5.8-6.5, correct that before adding nutrients."}]} +{"session_id":"sess-002","domain":"setup","events":[{"type":"user","timestamp":"2026-05-12T11:00:00Z","content":"My API key is sk-test-1234567890abcdef and the pump setup failed."},{"type":"assistant","timestamp":"2026-05-12T11:00:02Z","content":"I will inspect the setup state."},{"type":"tool_call","timestamp":"2026-05-12T11:00:03Z","tool_name":"read_device_state","arguments":{"device_id":"hydro-controller-7"}},{"type":"tool_result","timestamp":"2026-05-12T11:00:04Z","tool_name":"read_device_state","content":"relay_pump=false, tank_level=ok, last_error=GPIO timeout"},{"type":"assistant","timestamp":"2026-05-12T11:00:08Z","content":"Pump relay is not switching. Verify GPIO pin mapping and relay power, then retry setup."}]} diff --git a/schemas/training_records.schema.json b/schemas/training_records.schema.json new file mode 100644 index 0000000..a0577d2 --- /dev/null +++ b/schemas/training_records.schema.json @@ -0,0 +1,11 @@ +{ + "sft_record": { + "required": ["id", "messages", "metadata"], + "message_required": ["role", "content"], + "roles": ["system", "user", "assistant", "tool"] + }, + "dpo_record": { + "required": ["id", "prompt", "chosen", "rejected", "metadata"] + }, + "metadata_required": ["source_session_id", "domain", "pii_redacted", "complexity"] +} diff --git a/scripts/log_to_training_data.js b/scripts/log_to_training_data.js new file mode 100644 index 0000000..4ecfc7c --- /dev/null +++ b/scripts/log_to_training_data.js @@ -0,0 +1,139 @@ +const fs = require("fs"); +const path = require("path"); + +const input = process.argv[2] || "samples/agent_logs.jsonl"; +const outDir = process.argv[3] || "out"; +fs.mkdirSync(outDir, { recursive: true }); + +const PII_PATTERNS = [ + { name: "email", re: /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/gi, token: "[EMAIL]" }, + { name: "api_key", re: /\b(?:sk|pk|gho|ghp|api)[-_][A-Za-z0-9_-]{8,}\b/g, token: "[SECRET]" }, + { name: "phone", re: 
/**
 * Ordered regex-based PII detectors applied by `redact`.
 *
 * Each entry carries the audit-counter `name`, a global regex `re`, and the
 * replacement `token`. Order matters: earlier patterns rewrite the string
 * before later ones see it.
 */
const PII_PATTERNS = [
  { name: "email", re: /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/gi, token: "[EMAIL]" },
  { name: "api_key", re: /\b(?:sk|pk|gho|ghp|api)[-_][A-Za-z0-9_-]{8,}\b/g, token: "[SECRET]" },
  // The last digit is matched without a trailing separator; otherwise the
  // regex swallows the space before a following word
  // ("8009854588 tomorrow" -> "[PHONE]tomorrow").
  { name: "phone", re: /(?:\+\d{1,3}[\s.-]?)?(?:\d[\s.-]?){9}\d\b/g, token: "[PHONE]" },
  { name: "name", re: /\bI am [A-Z][a-z]+\b/g, token: "I am [NAME]" },
];

/** Metadata keys (besides `pii_redacted`) that every training record must carry. */
const REQUIRED_METADATA_KEYS = ["source_session_id", "domain", "complexity"];

/** Chat roles accepted in emitted message records. */
const VALID_ROLES = ["system", "user", "assistant", "tool"];

/**
 * Recursively replaces PII in every string found inside `value`.
 *
 * @param {*} value - String, array, plain object, or any other JSON value.
 * @param {Object<string, number>} [audit] - Counter object, mutated in place:
 *   each replacement increments `audit[pattern.name]`.
 * @returns {*} A redacted copy; non-string leaves are returned unchanged.
 */
function redact(value, audit = {}) {
  if (typeof value === "string") {
    let out = value;
    for (const pattern of PII_PATTERNS) {
      out = out.replace(pattern.re, () => {
        audit[pattern.name] = (audit[pattern.name] || 0) + 1;
        return pattern.token;
      });
    }
    return out;
  }
  if (Array.isArray(value)) return value.map((item) => redact(item, audit));
  if (value && typeof value === "object") {
    return Object.fromEntries(Object.entries(value).map(([k, v]) => [k, redact(v, audit)]));
  }
  return value;
}

/**
 * Maps a log event type to a chat role.
 *
 * @param {{type: string}} event - Log event.
 * @returns {string} "assistant" for tool calls, "tool" for tool results,
 *   otherwise the event type itself (validated later by `validateSft`).
 */
function roleFor(event) {
  if (event.type === "tool_call") return "assistant";
  if (event.type === "tool_result") return "tool";
  return event.type;
}

/**
 * Produces the message text for an event. Always returns a string.
 *
 * @param {object} event - Log event.
 * @returns {string} For tool calls, a JSON string of `{tool_name, arguments}`;
 *   otherwise the event content. Missing content becomes "", and non-string
 *   content (e.g. a structured tool result) is JSON-serialized so that it
 *   does not fail the "content must be a string" validation.
 */
function contentFor(event) {
  if (event.type === "tool_call") {
    return JSON.stringify({ tool_name: event.tool_name, arguments: event.arguments || {} });
  }
  const { content } = event;
  if (typeof content === "string") return content;
  if (content == null) return "";
  return JSON.stringify(content);
}

/**
 * Normalizes an `events` value to an array of event objects.
 *
 * @param {*} events - Value read from a session; may be missing or malformed.
 * @returns {object[]} The object entries of `events`, or [] when it is not an array.
 */
function asEventList(events) {
  if (!Array.isArray(events)) return [];
  return events.filter((event) => event !== null && typeof event === "object");
}

/**
 * Computes heuristic complexity tags for a session.
 *
 * @param {object[]} events - Session events (missing/invalid input counts as empty).
 * @returns {{turns: number, tool_calls: number, tools: string[],
 *   has_error_recovery: boolean, level: string}} `level` is "agentic_recovery",
 *   "agentic_simple", or "qa".
 */
function complexity(events) {
  const list = asEventList(events);
  const tools = list.filter((e) => e.type === "tool_call").map((e) => e.tool_name);
  // An event counts as an error if the word appears anywhere in its JSON form.
  const errors = list.filter((e) => JSON.stringify(e).toLowerCase().includes("error")).length;
  const recovery = errors > 0 || list.some((e) => /retry|failed|timeout|verify/i.test(e.content || ""));
  let level = "qa";
  if (tools.length >= 1) level = recovery ? "agentic_recovery" : "agentic_simple";
  return {
    turns: list.filter((e) => e.type === "user" || e.type === "assistant").length,
    tool_calls: tools.length,
    tools: [...new Set(tools)],
    has_error_recovery: recovery,
    level,
  };
}

/** Builds the metadata object shared by SFT and DPO records. */
function baseMetadata(session, audit, events) {
  return {
    source_session_id: session?.session_id,
    domain: session?.domain || "unknown",
    pii_redacted: audit,
    complexity: complexity(events),
  };
}

/**
 * Converts a (redacted) session into an SFT chat record.
 *
 * @param {object} session - Session with `session_id`, `domain`, `events`.
 * @param {number} idx - Zero-based session index; the id is `sft-${idx + 1}`.
 * @param {Object<string, number>} audit - PII counters stored in the metadata.
 * @returns {{id: string, messages: object[], metadata: object}} A session
 *   without usable events yields only the system message, which
 *   `validateSft` reports instead of this function throwing.
 */
function toSft(session, idx, audit) {
  const events = asEventList(session?.events);
  const turns = events.map((event) => {
    const message = { role: roleFor(event), content: contentFor(event), name: event.tool_name };
    // Drop undefined fields (e.g. `name` on non-tool events).
    return Object.fromEntries(Object.entries(message).filter(([, v]) => v !== undefined));
  });
  return {
    id: `sft-${idx + 1}`,
    messages: [
      { role: "system", content: "You are a safe agriculture support assistant. Use tools when needed and never expose private data." },
      ...turns,
    ],
    metadata: baseMetadata(session, audit, events),
  };
}

/**
 * Converts a (redacted) session into a DPO record: first user message as the
 * prompt, last assistant message as `chosen`, and a fixed toy refusal as
 * `rejected`.
 *
 * @param {object} session - Session with `session_id`, `domain`, `events`.
 * @param {number} idx - Zero-based session index; the id is `dpo-${idx + 1}`.
 * @param {Object<string, number>} audit - PII counters stored in the metadata.
 * @returns {{id: string, prompt: object[], chosen: object[], rejected: object[],
 *   metadata: object}} Content is "" when the session has no user or
 *   assistant event; `validateDpo` flags that case.
 */
function toDpo(session, idx, audit) {
  const events = asEventList(session?.events);
  const firstUser = events.find((e) => e.type === "user");
  const lastAssistant = [...events].reverse().find((e) => e.type === "assistant");
  return {
    id: `dpo-${idx + 1}`,
    prompt: [{ role: "user", content: firstUser ? contentFor(firstUser) : "" }],
    chosen: [{ role: "assistant", content: lastAssistant ? contentFor(lastAssistant) : "" }],
    rejected: [{ role: "assistant", content: "I cannot help with that." }],
    metadata: {
      ...baseMetadata(session, audit, events),
      pair_source: "toy_negative_for_validation_only",
    },
  };
}

/**
 * Checks record metadata against the required key list.
 *
 * @param {object|undefined} metadata - Record metadata.
 * @returns {string[]} Error messages; empty when metadata is complete.
 */
function metadataErrors(metadata) {
  const errors = [];
  if (!metadata?.pii_redacted) errors.push("missing pii metadata");
  if (metadata && typeof metadata === "object") {
    for (const key of REQUIRED_METADATA_KEYS) {
      if (metadata[key] == null) errors.push(`missing metadata.${key}`);
    }
  }
  return errors;
}

/**
 * Validates an SFT record. Never throws on malformed input.
 *
 * @param {object} record - Candidate SFT record.
 * @returns {string[]} Error messages; empty when the record is valid.
 */
function validateSft(record) {
  const errors = [];
  if (!record?.id) errors.push("missing id");
  const messages = Array.isArray(record?.messages) ? record.messages : [];
  if (messages.length < 2) errors.push("messages must contain at least system + one turn");
  messages.forEach((msg, i) => {
    if (!VALID_ROLES.includes(msg?.role)) errors.push(`message ${i} invalid role`);
    if (typeof msg?.content !== "string") errors.push(`message ${i} missing content`);
  });
  errors.push(...metadataErrors(record?.metadata));
  return errors;
}

/**
 * Validates a DPO record, including the messages inside prompt/chosen/rejected.
 *
 * @param {object} record - Candidate DPO record.
 * @returns {string[]} Error messages; empty when the record is valid.
 */
function validateDpo(record) {
  const errors = [];
  for (const key of ["id", "prompt", "chosen", "rejected", "metadata"]) {
    if (!record?.[key]) errors.push(`missing ${key}`);
  }
  const sides = ["prompt", "chosen", "rejected"];
  if (sides.some((side) => !Array.isArray(record?.[side]))) {
    errors.push("prompt/chosen/rejected must be message arrays");
  }
  for (const side of sides) {
    const messages = record?.[side];
    if (!Array.isArray(messages)) continue;
    if (messages.length === 0) errors.push(`${side} must not be empty`);
    messages.forEach((msg, i) => {
      if (!VALID_ROLES.includes(msg?.role)) errors.push(`${side} message ${i} invalid role`);
      // An empty prompt or completion carries no preference signal.
      if (typeof msg?.content !== "string" || msg.content === "") {
        errors.push(`${side} message ${i} empty content`);
      }
    });
  }
  if (record?.metadata) errors.push(...metadataErrors(record.metadata));
  return errors;
}
/**
 * Parses one JSONL line, attaching the file name and line number to any
 * syntax error so a malformed log line can be located.
 *
 * @param {string} text - Trimmed, non-empty line.
 * @param {number} lineNo - 1-based line number in the input file.
 * @returns {*} The parsed JSON value.
 * @throws {Error} When the line is not valid JSON.
 */
function parseSessionLine(text, lineNo) {
  try {
    return JSON.parse(text);
  } catch (err) {
    throw new Error(`${input}:${lineNo}: invalid JSON (${err.message})`, { cause: err });
  }
}

/**
 * Serializes records as JSONL: one JSON document per line, each newline
 * terminated. An empty list yields "" rather than a lone blank line.
 *
 * @param {object[]} records - Records to serialize.
 * @returns {string} JSONL text.
 */
function toJsonl(records) {
  return records.map((record) => `${JSON.stringify(record)}\n`).join("");
}

// Keep original line numbers for error messages and skip blank or
// whitespace-only lines, which JSON.parse would reject.
const sessions = fs
  .readFileSync(input, "utf8")
  .split(/\r?\n/)
  .map((text, i) => ({ text: text.trim(), lineNo: i + 1 }))
  .filter(({ text }) => text !== "")
  .map(({ text, lineNo }) => parseSessionLine(text, lineNo));

const sft = [];
const dpo = [];
const report = { input_sessions: sessions.length, pii_counts: {}, validation_errors: [] };

sessions.forEach((raw, idx) => {
  // A session with no usable event list is reported, not allowed to crash the run.
  const hasEvents =
    raw !== null &&
    typeof raw === "object" &&
    Array.isArray(raw.events) &&
    raw.events.every((event) => event !== null && typeof event === "object");
  if (!hasEvents) {
    report.validation_errors.push({
      id: `session-${idx + 1}`,
      error: "session must be an object with an events array of objects",
    });
    return;
  }

  const audit = {};
  const session = redact(raw, audit);
  for (const [k, v] of Object.entries(audit)) {
    report.pii_counts[k] = (report.pii_counts[k] || 0) + v;
  }

  const sftRecord = toSft(session, idx, audit);
  const dpoRecord = toDpo(session, idx, audit);
  const sftErrors = validateSft(sftRecord);
  const dpoErrors = validateDpo(dpoRecord);
  sftErrors.forEach((e) => report.validation_errors.push({ id: sftRecord.id, error: e }));
  dpoErrors.forEach((e) => report.validation_errors.push({ id: dpoRecord.id, error: e }));

  // Only validated records reach the training artifacts.
  if (sftErrors.length === 0) sft.push(sftRecord);
  if (dpoErrors.length === 0) dpo.push(dpoRecord);
});

fs.writeFileSync(path.join(outDir, "sft.jsonl"), toJsonl(sft));
fs.writeFileSync(path.join(outDir, "dpo.jsonl"), toJsonl(dpo));
fs.writeFileSync(path.join(outDir, "validation_report.json"), `${JSON.stringify(report, null, 2)}\n`);

if (report.validation_errors.length) {
  console.error(`Validation failed: ${report.validation_errors.length} errors`);
  process.exit(1);
}
console.log(`Processed ${sessions.length} sessions`);
console.log(`PII redactions: ${JSON.stringify(report.pii_counts)}`);
console.log(`Wrote ${outDir}/sft.jsonl, ${outDir}/dpo.jsonl, ${outDir}/validation_report.json`);