From 399c1ff4ead9fdfed74a3346ff1ace2771a2f960 Mon Sep 17 00:00:00 2001 From: Maxim Gross Date: Mon, 15 Jun 2026 15:30:24 +0200 Subject: [PATCH] feat: implement filtering for harness-injected context in workflow analysis and parsing --- src/core/analyzer-workflows.test.ts | 15 +++++++++++++++ src/core/analyzer-workflows.ts | 4 +++- src/core/helpers.ts | 22 ++++++++++++++++++++++ src/core/parser-codex.test.ts | 23 +++++++++++++++++++++++ src/core/parser-codex.ts | 7 +++++-- 5 files changed, 68 insertions(+), 3 deletions(-) diff --git a/src/core/analyzer-workflows.test.ts b/src/core/analyzer-workflows.test.ts index 9aae868c..2b284e3b 100644 --- a/src/core/analyzer-workflows.test.ts +++ b/src/core/analyzer-workflows.test.ts @@ -101,6 +101,21 @@ describe('WorkflowAnalyzer', () => { expect(result.clusters).toEqual([]); }); + it('filters harness-injected AGENTS.md context as noise', () => { + const now = Date.now(); + const injected = '# AGENTS.md instructions for /home/me/project\n\n\nfollow repo conventions\n'; + const sessions = Array.from({ length: 5 }, (_, i) => + makeSession({ + sessionId: `inj-${i}`, + requests: [makeRequest({ messageText: injected, timestamp: now + i * 1000 })], + lastMessageDate: now + i * 1000, + }) + ); + const analyzer = createAnalyzer(sessions); + const result = analyzer.getWorkflowOptimization(); + expect(result.clusters).toEqual([]); + }); + it('clusters repeated similar prompts', () => { const now = Date.now(); // Need at least 3 occurrences with same fingerprint diff --git a/src/core/analyzer-workflows.ts b/src/core/analyzer-workflows.ts index 885bd707..41bdea57 100644 --- a/src/core/analyzer-workflows.ts +++ b/src/core/analyzer-workflows.ts @@ -7,7 +7,7 @@ import { DateFilter, WorkflowCluster, WorkflowOptimizationData } from './types'; import { AnalyzerBase } from './analyzer-base'; -import { toDateStr } from './helpers'; +import { isHarnessInjectedContext, toDateStr } from './helpers'; /** Minimum prompt length to consider for 
clustering (skip trivial messages) */ const MIN_PROMPT_LEN = 15; @@ -53,6 +53,8 @@ function normalizePrompt(raw: string): string { /** Check if text is likely a system/bot message or noise rather than a user prompt */ function isNoise(text: string): boolean { const t = text.trim(); + // Harness-injected session-start context (e.g. Codex AGENTS.md / environment context) + if (isHarnessInjectedContext(t)) return true; // Decorative separators/borders if (/^[═─━=\-_*]{10,}/.test(t)) return true; // System auth/notification messages diff --git a/src/core/helpers.ts b/src/core/helpers.ts index 62918dd6..1866f328 100644 --- a/src/core/helpers.ts +++ b/src/core/helpers.ts @@ -294,3 +294,25 @@ export function classifyWorkType(msg: string): WorkType { } return 'other'; } + +/** + * Harness-injected session-start payloads that are recorded as `user` messages + * but are not real user prompts (e.g. Codex injects the repo `AGENTS.md` and + * environment/instruction context at session start). These should be treated as + * noise so they don't get clustered into workflow/skill recommendations. + * + * Matches a small set of known leading markers only — deliberately NOT a generic + * `#` markdown header or `<tag>` rule, both of which would suppress legitimate + * user prompts. 
+ */ +const HARNESS_INJECTED_MARKERS: RegExp[] = [ + /^# AGENTS\.md instructions\b/, + /^ re.test(t)); +} diff --git a/src/core/parser-codex.test.ts b/src/core/parser-codex.test.ts index 67d9d224..5fbb8b5a 100644 --- a/src/core/parser-codex.test.ts +++ b/src/core/parser-codex.test.ts @@ -186,6 +186,29 @@ describe('parseCodexSessions skillsUsed extraction', () => { expect(sessions[0].requests[0].skillsUsed).toEqual(['pdf']); }); }); + + it('ignores harness-injected AGENTS.md context and captures the real prompt', () => { + withCodexFile([ + { type: 'session_meta', payload: { id: 'sess-inject-1', cwd: '/Users/me/proj' } }, + { type: 'turn_context', payload: { model: 'gpt-5.3-codex' } }, + // Session-start injected context recorded as a user response_item. + { type: 'response_item', timestamp: '2025-06-15T10:00:00Z', + payload: { type: 'message', role: 'user', content: [ + { type: 'input_text', text: '# AGENTS.md instructions for /Users/me/proj\n\nfollow repo conventions' }, + { type: 'input_text', text: '\n /Users/me/proj\n' }, + ] } }, + // The actual user prompt. + { type: 'event_msg', timestamp: '2025-06-15T10:00:05Z', payload: { type: 'user_message', message: 'what is this repo about?' } }, + { type: 'event_msg', timestamp: '2025-06-15T10:00:06Z', payload: { type: 'assistant_message', content: 'a coach' } }, + ], (sessionsDir) => { + const sessions = parseCodexSessions(sessionsDir); + expect(sessions).toHaveLength(1); + const texts = sessions[0].requests.map(r => r.messageText); + // The injected AGENTS.md / environment context must not be captured as a prompt. 
+ expect(texts.some(t => t.startsWith('# AGENTS.md instructions'))).toBe(false); + expect(texts).toContain('what is this repo about?'); + }); + }); }); describe('findCodexDirs', () => { diff --git a/src/core/parser-codex.ts b/src/core/parser-codex.ts index dc270d39..c6f437fd 100644 --- a/src/core/parser-codex.ts +++ b/src/core/parser-codex.ts @@ -22,7 +22,7 @@ import * as path from 'path'; import { StringDecoder } from 'string_decoder'; import { ModelUsage, Session, SessionRequest } from './types'; import { assertTrustedPath, createRequest, createSession, detectDevcontainerFromRequests, extractSkillNameFromPath, extractSkillPathsFromText } from './parser-shared'; -import { canonicalizeReasoningEffort, extractReasoningEffortFromModelId } from './helpers'; +import { canonicalizeReasoningEffort, extractReasoningEffortFromModelId, isHarnessInjectedContext } from './helpers'; interface CodexLine { type: string; @@ -268,6 +268,9 @@ function extractFilePath(args: Record | null | undefined): stri function handleUserMessageEvent(payload: Record, state: CodexParseState, ts: number | null, defaultModel: string): void { const newMessage = stringValue(payload.message) || stringValue(payload.text); + // Harness-injected session-start context is recorded as a user message but is + // not a real prompt; ignore it before flushing or mutating turn state. 
+ if (isHarnessInjectedContext(newMessage)) return; if (state.currentUserMessage && state.currentUserMessage === newMessage && isTurnEmpty(state)) { if (state.turnStartTs == null) state.turnStartTs = ts; return; @@ -342,7 +345,7 @@ function handleTurnContext(payload: Record, state: CodexParseSt function handleUserResponseItem(payload: Record, state: CodexParseState, ts: number | null, defaultModel: string): void { for (const item of extractContentItems(payload.content)) { - if (item.type !== 'input_text' || !item.text || item.text.startsWith('<')) continue; + if (item.type !== 'input_text' || !item.text || item.text.startsWith('<') || isHarnessInjectedContext(item.text)) continue; if (!state.currentUserMessage) { flushCodexTurn(state, defaultModel); state.currentUserMessage = item.text;