diff --git a/browse/src/domain-skills.ts b/browse/src/domain-skills.ts index b68c031ff5..011059b273 100644 --- a/browse/src/domain-skills.ts +++ b/browse/src/domain-skills.ts @@ -291,8 +291,20 @@ export async function writeSkill(input: WriteSkillInput): Promise= PROMOTE_THRESHOLD AND flag_count == 0 → state:active - * - else stay quarantined with updated counter + * - if use_count >= PROMOTE_THRESHOLD AND flag_count == 0 AND L4 has scored + * the body (classifier_score > 0) → state:active + * - else stay quarantined with updated counter; user must run + * `domain-skill promote-to-global` manually + * + * The classifier_score > 0 gate is load-bearing: handleSave currently writes + * classifier_score=0 with the comment "L4 deferred to load-time / sidebar-agent + * fills this in on first prompt-injection load," but sidebar-agent was ripped + * (CLAUDE.md "Sidebar architecture") and nothing else updates the score, so + * skills authored via the production path never had their body scanned by L4. + * Without this gate, three benign uses promote any quarantined skill — including + * one written under the influence of a poisoned page — into the prompt context + * for every subsequent visit. The gate re-opens automatically the day L4 is + * rewired and a non-zero classifier_score reaches the stored row (recordSkillUse only takes a boolean flag). */ export async function recordSkillUse(host: string, projectSlug: string, classifierFlagged: boolean): Promise { const normalized = normalizeHost(host); @@ -303,7 +315,12 @@ export async function recordSkillUse(host: string, projectSlug: string, classifi const useCount = current.use_count + 1; const flagCount = current.flag_count + (classifierFlagged ? 
1 : 0); let state: SkillState = current.state; - if (state === 'quarantined' && useCount >= PROMOTE_THRESHOLD && flagCount === 0) { + if ( + state === 'quarantined' && + useCount >= PROMOTE_THRESHOLD && + flagCount === 0 && + current.classifier_score > 0 + ) { state = 'active'; } const updated: DomainSkillRow = { diff --git a/browse/test/domain-skills-storage.test.ts b/browse/test/domain-skills-storage.test.ts index cdc238f183..df53d8bc92 100644 --- a/browse/test/domain-skills-storage.test.ts +++ b/browse/test/domain-skills-storage.test.ts @@ -106,6 +106,31 @@ describe('domain-skills: state machine (T6)', () => { }) ).rejects.toThrow(/classifier flagged/); }); + + // domain-skill-commands.ts:140 (handleSave) writes classifier_score=0 with + // the comment "L4 deferred to load-time" — but sidebar-agent (the deferred + // scanner) was ripped per CLAUDE.md "Sidebar architecture." Without an + // explicit gate, three benign uses promote any quarantined skill, including + // one authored under a poisoned page, into prompt context permanently. + it('does NOT auto-promote when classifier_score is 0 (production handleSave shape)', async () => { + const m = await freshImport(); + await m.writeSkill({ + host: 'linkedin.com', + body: '# LinkedIn', + projectSlug: 'test-slug', + source: 'agent', + classifierScore: 0, // matches domain-skill-commands.ts:140 production path + }); + const afterFirst = await m.recordSkillUse('linkedin.com', 'test-slug', false); + await m.recordSkillUse('linkedin.com', 'test-slug', false); + const final = await m.recordSkillUse('linkedin.com', 'test-slug', false); + expect(afterFirst?.state).toBe('quarantined'); + expect(final?.state).toBe('quarantined'); + expect(final?.use_count).toBe(3); + // readSkill returns null for quarantined skills — they don't fire. + const read = await m.readSkill('linkedin.com', 'test-slug'); + expect(read).toBeNull(); + }); }); describe('domain-skills: scope shadowing (T4)', () => {