Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions DEMO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Demo: safe logs-to-training pipeline

This demo turns toy agent logs into privacy-stripped training artifacts.

## Run

```bash
node scripts/log_to_training_data.js samples/agent_logs.jsonl out
```

## Outputs

- `out/sft.jsonl` — LoRA/SFT-style chat records with tool events preserved.
- `out/dpo.jsonl` — DPO-style prompt/chosen/rejected records for format validation.
- `out/validation_report.json` — PII counts and schema validation status.

## What it demonstrates

1. Normalizes mixed Q&A and agentic traces into message records.
2. Redacts emails, phone numbers, API-like secrets, and simple name patterns.
3. Preserves tool calls and tool observations as trajectory data.
4. Tags complexity using turns, tool count, tool names, and recovery signals.
5. Validates required SFT and DPO fields before writing artifacts.

This demo is intentionally small and dependency-free so that mentors can review the data flow quickly. A production implementation can replace the regex redactor with policy-specific PII detectors and extend the schema checks into JSON Schema validation or trainer dry-runs.
2 changes: 2 additions & 0 deletions out/dpo.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id":"dpo-1","prompt":[{"role":"user","content":"Hi, I am [NAME]. Email me at [EMAIL] or call [PHONE]. My tomato leaves are yellowing."}],"chosen":[{"role":"assistant","content":"Check pH first, then EC, then root oxygenation. If pH is outside 5.8-6.5, correct that before adding nutrients."}],"rejected":[{"role":"assistant","content":"I cannot help with that."}],"metadata":{"source_session_id":"sess-001","domain":"agriculture","pii_redacted":{"email":1,"phone":1,"name":1},"complexity":{"turns":3,"tool_calls":1,"tools":["lookup_crop_guidance"],"has_error_recovery":false,"level":"agentic_simple"},"pair_source":"toy_negative_for_validation_only"}}
{"id":"dpo-2","prompt":[{"role":"user","content":"My API key is [SECRET] and the pump setup failed."}],"chosen":[{"role":"assistant","content":"Pump relay is not switching. Verify GPIO pin mapping and relay power, then retry setup."}],"rejected":[{"role":"assistant","content":"I cannot help with that."}],"metadata":{"source_session_id":"sess-002","domain":"setup","pii_redacted":{"api_key":1},"complexity":{"turns":3,"tool_calls":1,"tools":["read_device_state"],"has_error_recovery":true,"level":"agentic_recovery"},"pair_source":"toy_negative_for_validation_only"}}
2 changes: 2 additions & 0 deletions out/sft.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id":"sft-1","messages":[{"role":"system","content":"You are a safe agriculture support assistant. Use tools when needed and never expose private data."},{"role":"user","content":"Hi, I am [NAME]. Email me at [EMAIL] or call [PHONE]. My tomato leaves are yellowing."},{"role":"assistant","content":"Yellowing may indicate nitrogen deficiency, pH drift, or overwatering. I will check nutrient guidance."},{"role":"assistant","content":"{\"tool_name\":\"lookup_crop_guidance\",\"arguments\":{\"crop\":\"tomato\",\"symptom\":\"yellow leaves\"}}","name":"lookup_crop_guidance"},{"role":"tool","content":"Tomato yellow leaves: check pH 5.8-6.5, EC, nitrogen, root oxygenation.","name":"lookup_crop_guidance"},{"role":"assistant","content":"Check pH first, then EC, then root oxygenation. If pH is outside 5.8-6.5, correct that before adding nutrients."}],"metadata":{"source_session_id":"sess-001","domain":"agriculture","pii_redacted":{"email":1,"phone":1,"name":1},"complexity":{"turns":3,"tool_calls":1,"tools":["lookup_crop_guidance"],"has_error_recovery":false,"level":"agentic_simple"}}}
{"id":"sft-2","messages":[{"role":"system","content":"You are a safe agriculture support assistant. Use tools when needed and never expose private data."},{"role":"user","content":"My API key is [SECRET] and the pump setup failed."},{"role":"assistant","content":"I will inspect the setup state."},{"role":"assistant","content":"{\"tool_name\":\"read_device_state\",\"arguments\":{\"device_id\":\"hydro-controller-7\"}}","name":"read_device_state"},{"role":"tool","content":"relay_pump=false, tank_level=ok, last_error=GPIO timeout","name":"read_device_state"},{"role":"assistant","content":"Pump relay is not switching. Verify GPIO pin mapping and relay power, then retry setup."}],"metadata":{"source_session_id":"sess-002","domain":"setup","pii_redacted":{"api_key":1},"complexity":{"turns":3,"tool_calls":1,"tools":["read_device_state"],"has_error_recovery":true,"level":"agentic_recovery"}}}
10 changes: 10 additions & 0 deletions out/validation_report.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"input_sessions": 2,
"pii_counts": {
"email": 1,
"phone": 1,
"name": 1,
"api_key": 1
},
"validation_errors": []
}
2 changes: 2 additions & 0 deletions samples/agent_logs.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"session_id":"sess-001","domain":"agriculture","events":[{"type":"user","timestamp":"2026-05-12T10:00:00Z","content":"Hi, I am Aayush. Email me at aayush@example.com or call +91 80098 54588. My tomato leaves are yellowing."},{"type":"assistant","timestamp":"2026-05-12T10:00:04Z","content":"Yellowing may indicate nitrogen deficiency, pH drift, or overwatering. I will check nutrient guidance."},{"type":"tool_call","timestamp":"2026-05-12T10:00:06Z","tool_name":"lookup_crop_guidance","arguments":{"crop":"tomato","symptom":"yellow leaves"}},{"type":"tool_result","timestamp":"2026-05-12T10:00:07Z","tool_name":"lookup_crop_guidance","content":"Tomato yellow leaves: check pH 5.8-6.5, EC, nitrogen, root oxygenation."},{"type":"assistant","timestamp":"2026-05-12T10:00:12Z","content":"Check pH first, then EC, then root oxygenation. If pH is outside 5.8-6.5, correct that before adding nutrients."}]}
{"session_id":"sess-002","domain":"setup","events":[{"type":"user","timestamp":"2026-05-12T11:00:00Z","content":"My API key is sk-test-1234567890abcdef and the pump setup failed."},{"type":"assistant","timestamp":"2026-05-12T11:00:02Z","content":"I will inspect the setup state."},{"type":"tool_call","timestamp":"2026-05-12T11:00:03Z","tool_name":"read_device_state","arguments":{"device_id":"hydro-controller-7"}},{"type":"tool_result","timestamp":"2026-05-12T11:00:04Z","tool_name":"read_device_state","content":"relay_pump=false, tank_level=ok, last_error=GPIO timeout"},{"type":"assistant","timestamp":"2026-05-12T11:00:08Z","content":"Pump relay is not switching. Verify GPIO pin mapping and relay power, then retry setup."}]}
11 changes: 11 additions & 0 deletions schemas/training_records.schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"sft_record": {
"required": ["id", "messages", "metadata"],
"message_required": ["role", "content"],
"roles": ["system", "user", "assistant", "tool"]
},
"dpo_record": {
"required": ["id", "prompt", "chosen", "rejected", "metadata"]
},
"metadata_required": ["source_session_id", "domain", "pii_redacted", "complexity"]
}
139 changes: 139 additions & 0 deletions scripts/log_to_training_data.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
const fs = require("fs");
const path = require("path");

// Positional CLI arguments: <input JSONL path> <output directory>.
// Missing or empty arguments fall back to the bundled sample and "out".
const [inputArg, outDirArg] = process.argv.slice(2);
const input = inputArg || "samples/agent_logs.jsonl";
const outDir = outDirArg || "out";

// Create the output directory (and any missing parents) up front.
fs.mkdirSync(outDir, { recursive: true });

/**
 * Redaction rules, applied in array order; each rule runs on the output of
 * the previous one. `name` is the audit counter key, `re` the global regex,
 * and `token` the replacement text.
 */
const PII_PATTERNS = [
  { name: "email", re: /[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}/gi, token: "[EMAIL]" },
  { name: "api_key", re: /\b(?:sk|pk|gho|ghp|api)[-_][A-Za-z0-9_-]{8,}\b/g, token: "[SECRET]" },
  // Ten digits with optional single separators *between* digits only. Allowing
  // a separator after the last digit would let the match swallow the space
  // before the next word ("8009854588 now" -> "[PHONE]now").
  { name: "phone", re: /(?:\+\d{1,3}[\s.-]?)?\d(?:[\s.-]?\d){9}\b/g, token: "[PHONE]" },
  { name: "name", re: /\bI am [A-Z][a-z]+\b/g, token: "I am [NAME]" },
];

/**
 * Recursively replaces PII matches in a value with placeholder tokens.
 *
 * Strings are scanned with every rule in PII_PATTERNS. Arrays and plain
 * objects are rebuilt with each element / property value redacted (object
 * keys are left untouched). Any other value is returned as-is. Note that
 * every string value is scanned, including identifier-like fields.
 *
 * @param {*} value - String, array, object, or primitive to redact.
 * @param {Object<string, number>} audit - Mutated in place: the counter for a
 *   rule's `name` is incremented once per replaced match.
 * @returns {*} A redacted copy for strings/arrays/objects; otherwise `value`.
 */
function redact(value, audit) {
  if (typeof value === "string") {
    let out = value;
    for (const pattern of PII_PATTERNS) {
      // A replacer function is used so each individual match is counted.
      out = out.replace(pattern.re, () => {
        audit[pattern.name] = (audit[pattern.name] || 0) + 1;
        return pattern.token;
      });
    }
    return out;
  }
  if (Array.isArray(value)) return value.map((v) => redact(v, audit));
  if (value && typeof value === "object") {
    return Object.fromEntries(Object.entries(value).map(([k, v]) => [k, redact(v, audit)]));
  }
  return value;
}

/**
 * Maps a log event type to the chat role used in training messages.
 *
 * @param {{type: string}} event - A single log event.
 * @returns {string} "assistant" for tool calls, "tool" for tool results,
 *   and the event's own `type` for everything else.
 */
function roleFor(event) {
  switch (event.type) {
    case "tool_call":
      return "assistant";
    case "tool_result":
      return "tool";
    default:
      return event.type;
  }
}

/**
 * Produces the message content for a log event.
 *
 * @param {object} event - A single log event.
 * @returns {*} For tool calls, a JSON string of `{ tool_name, arguments }`
 *   (arguments default to `{}`); otherwise the event's `content`, or "" when
 *   that content is falsy.
 */
function contentFor(event) {
  const isToolCall = event.type === "tool_call";
  if (!isToolCall) {
    return event.content || "";
  }
  const { tool_name, arguments: callArgs } = event;
  return JSON.stringify({ tool_name, arguments: callArgs || {} });
}

/**
 * Summarizes how complex a session's event list is.
 *
 * @param {object[]} events - The session's log events.
 * @returns {{turns: number, tool_calls: number, tools: string[],
 *   has_error_recovery: boolean, level: string}}
 *   `turns` counts user and assistant events; `tool_calls` counts tool_call
 *   events; `tools` is the de-duplicated list of called tool names;
 *   `has_error_recovery` is true when any serialized event mentions "error"
 *   or any event content mentions retry/failed/timeout/verify; `level` is
 *   "qa" without tools, otherwise "agentic_recovery" or "agentic_simple".
 */
function complexity(events) {
  const toolNames = events.flatMap((event) => (event.type === "tool_call" ? [event.tool_name] : []));

  // Case-insensitive scan of the whole serialized event (keys and values).
  const mentionsError = (event) => JSON.stringify(event).toLowerCase().includes("error");
  const errorEventCount = events.reduce((count, event) => (mentionsError(event) ? count + 1 : count), 0);

  const recoveryWords = /retry|failed|timeout|verify/i;
  const hasRecovery = errorEventCount > 0 || events.some((event) => recoveryWords.test(event.content || ""));

  const turnCount = events.reduce(
    (count, event) => (event.type === "user" || event.type === "assistant" ? count + 1 : count),
    0
  );

  let level = "qa";
  if (toolNames.length >= 1) {
    level = hasRecovery ? "agentic_recovery" : "agentic_simple";
  }

  return {
    turns: turnCount,
    tool_calls: toolNames.length,
    tools: Array.from(new Set(toolNames)),
    has_error_recovery: hasRecovery,
    level,
  };
}

/**
 * Builds an SFT chat record from a session.
 *
 * @param {object} session - Session with `events`, `session_id`, and
 *   optionally `domain`.
 * @param {number} idx - Zero-based session index; the record id is
 *   `sft-<idx + 1>`.
 * @param {object} audit - PII redaction counts, stored as-is in the metadata.
 * @returns {object} Record with `id`, `messages` (a fixed system prompt
 *   followed by one message per event), and `metadata`.
 */
function toSft(session, idx, audit) {
  const systemMessage = {
    role: "system",
    content: "You are a safe agriculture support assistant. Use tools when needed and never expose private data.",
  };

  const eventMessages = session.events.map((event) => {
    const fields = [
      ["role", roleFor(event)],
      ["content", contentFor(event)],
      ["name", event.tool_name],
    ];
    // Only keep fields that actually have a value (e.g. `name` exists only
    // on events that carry a tool_name).
    const message = {};
    for (const [key, fieldValue] of fields) {
      if (fieldValue !== undefined) message[key] = fieldValue;
    }
    return message;
  });

  const metadata = {
    source_session_id: session.session_id,
    domain: session.domain || "unknown",
    pii_redacted: audit,
    complexity: complexity(session.events),
  };

  return {
    id: `sft-${idx + 1}`,
    messages: [systemMessage, ...eventMessages],
    metadata,
  };
}

/**
 * Builds a DPO-style preference record from a session.
 *
 * The prompt is the first user event's content, the chosen answer is the
 * last assistant event's content, and the rejected answer is a fixed refusal
 * string. Missing user/assistant events yield empty content.
 *
 * @param {object} session - Session with `events`, `session_id`, and
 *   optionally `domain`.
 * @param {number} idx - Zero-based session index; the record id is
 *   `dpo-<idx + 1>`.
 * @param {object} audit - PII redaction counts, stored as-is in the metadata.
 * @returns {object} Record with `id`, `prompt`, `chosen`, `rejected`, and
 *   `metadata`.
 */
function toDpo(session, idx, audit) {
  const { events } = session;
  const firstUser = events.find((event) => event.type === "user");

  // Walk backwards to locate the final assistant event.
  let lastAssistant;
  for (let i = events.length - 1; i >= 0; i -= 1) {
    if (events[i].type === "assistant") {
      lastAssistant = events[i];
      break;
    }
  }

  const promptText = firstUser ? firstUser.content : "";
  const chosenText = lastAssistant ? lastAssistant.content : "";

  return {
    id: `dpo-${idx + 1}`,
    prompt: [{ role: "user", content: promptText }],
    chosen: [{ role: "assistant", content: chosenText }],
    rejected: [{ role: "assistant", content: "I cannot help with that." }],
    metadata: {
      source_session_id: session.session_id,
      domain: session.domain || "unknown",
      pii_redacted: audit,
      complexity: complexity(events),
      pair_source: "toy_negative_for_validation_only",
    },
  };
}

/**
 * Checks an SFT record and returns a list of human-readable problems.
 *
 * Malformed input is reported as validation errors rather than thrown: a
 * `messages` value that is not an array counts as having no messages, and a
 * null/undefined message entry is reported as having an invalid role and
 * missing content.
 *
 * @param {object} record - Candidate SFT record.
 * @returns {string[]} Error descriptions; empty when the record is valid.
 */
function validateSft(record) {
  const allowedRoles = ["system", "user", "assistant", "tool"];
  const errors = [];
  if (!record.id) errors.push("missing id");

  // Only iterate real arrays; anything else would not have `.entries()`.
  const messages = Array.isArray(record.messages) ? record.messages : [];
  if (messages.length < 2) errors.push("messages must contain at least system + one turn");

  for (const [i, msg] of messages.entries()) {
    // Optional chaining keeps a null/undefined entry from throwing.
    if (!allowedRoles.includes(msg?.role)) errors.push(`message ${i} invalid role`);
    if (typeof msg?.content !== "string") errors.push(`message ${i} missing content`);
  }

  if (!record.metadata?.pii_redacted) errors.push("missing pii metadata");
  return errors;
}

/**
 * Checks a DPO record and returns a list of human-readable problems.
 *
 * @param {object} record - Candidate DPO record.
 * @returns {string[]} One "missing <key>" entry per falsy required field,
 *   plus a shape error when prompt, chosen, or rejected is not an array.
 */
function validateDpo(record) {
  const requiredKeys = ["id", "prompt", "chosen", "rejected", "metadata"];
  const errors = requiredKeys.filter((key) => !record[key]).map((key) => `missing ${key}`);

  const messageLists = [record.prompt, record.chosen, record.rejected];
  const allArrays = messageLists.every((list) => Array.isArray(list));
  if (!allArrays) {
    errors.push("prompt/chosen/rejected must be message arrays");
  }
  return errors;
}

// ---- Pipeline: read sessions -> redact -> convert -> validate -> write ----

// Each non-empty line of the input file is parsed as one JSON session.
const sessions = [];
for (const line of fs.readFileSync(input, "utf8").trim().split(/\r?\n/)) {
  if (line) sessions.push(JSON.parse(line));
}

const sft = [];
const dpo = [];
const report = { input_sessions: sessions.length, pii_counts: {}, validation_errors: [] };

for (const [idx, raw] of sessions.entries()) {
  // Per-session redaction counts; also embedded in both records' metadata.
  const audit = {};
  const session = redact(raw, audit);

  // Fold this session's counts into the run-wide totals.
  for (const [k, v] of Object.entries(audit)) {
    report.pii_counts[k] = (report.pii_counts[k] || 0) + v;
  }

  const sftRecord = toSft(session, idx, audit);
  const dpoRecord = toDpo(session, idx, audit);

  for (const e of validateSft(sftRecord)) {
    report.validation_errors.push({ id: sftRecord.id, error: e });
  }
  for (const e of validateDpo(dpoRecord)) {
    report.validation_errors.push({ id: dpoRecord.id, error: e });
  }

  // Records are kept even when validation reported problems; the report
  // and the exit code below carry the failure signal.
  sft.push(sftRecord);
  dpo.push(dpoRecord);
}

// Serialize records as newline-terminated JSON Lines.
const toJsonl = (records) => records.map((r) => JSON.stringify(r)).join("\n") + "\n";

fs.writeFileSync(path.join(outDir, "sft.jsonl"), toJsonl(sft));
fs.writeFileSync(path.join(outDir, "dpo.jsonl"), toJsonl(dpo));
fs.writeFileSync(path.join(outDir, "validation_report.json"), JSON.stringify(report, null, 2));

if (report.validation_errors.length) {
  console.error(`Validation failed: ${report.validation_errors.length} errors`);
  process.exit(1);
}

console.log(`Processed ${sessions.length} sessions`);
console.log(`PII redactions: ${JSON.stringify(report.pii_counts)}`);
console.log(`Wrote ${outDir}/sft.jsonl, ${outDir}/dpo.jsonl, ${outDir}/validation_report.json`);