diff --git a/src/core/citability.test.ts b/src/core/citability.test.ts index cafa6a9..d95449f 100644 --- a/src/core/citability.test.ts +++ b/src/core/citability.test.ts @@ -90,6 +90,125 @@ Furthermore, we plan to add more features.`; const stats = result.dimensions.find(d => d.name === 'Statistical Density')!; expect(stats.score).toBeGreaterThanOrEqual(12); }); + + it('suggests attribution when statistical claims lack evidence signals', () => { + const statsContent = `Our platform serves 50,000 users across 120 countries. Revenue grew 250% in 2024. We process $2.5 million in transactions daily with 99.99% uptime.`; + const result = scorePageCitability(makePage(statsContent)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('does not suggest attribution when statistical claims include evidence signals', () => { + const statsContent = `According to the 2024 Benchmark Report, our platform serves 50,000 users across 120 countries. Revenue grew 250% in 2024. Source: https://example.com/report`; + const result = scorePageCitability(makePage(statsContent)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for self-referential phrasing that mentions reports/studies', () => { + // "our internal report" / "based on our data" / "user behavior study" are + // self-referential — they should NOT silence the attribution hint. + const selfReferential = `Based on our internal data, we grew 300% in 2024. See our quarterly report for more numbers — our user behavior study found 80% retention.`; + const result = scorePageCitability(makePage(selfReferential)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('recognizes "study by" / "report from" attribution phrases as evidence', () => { + const externalAttribution = `A 2024 study by Stanford found 80% adoption. Revenue figures came from a report from McKinsey, showing 250% growth.`; + const result = scorePageCitability(makePage(externalAttribution)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for "Citing our own data" — bare citing is self-referential', () => { + // The bare "citing" alternative used to swallow the hint here. Real attribution uses + // "cited by/in" or "as cited" — those are kept. "Citing our X" is self-referential. + const selfCiting = `Citing our own data, we grew 300% in 2024. Revenue grew 250% to $50 million daily.`; + const result = scorePageCitability(makePage(selfCiting)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('still suggests attribution for "According to our internal data" — self-referential according', () => { + // "according to" used to be unscoped and silently passed for "according to our X". The + // negative lookahead excludes our/my/us and "the {company,team,organization,internal}". + const selfAccording = `According to our internal data, we grew 300% in 2024. Revenue grew 250% to $50 million daily.`; + const result = scorePageCitability(makePage(selfAccording)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('recognizes "according to" an external source as evidence', () => { + const externalAccording = `According to Gartner, the market grew 40% in 2024. Revenue figures reached $50 million daily.`; + const result = scorePageCitability(makePage(externalAccording)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for "data from our research team" — self-referential data-from', () => { + // "data from" used to be unguarded — "Data from our research team" would silently + // suppress the hint. Same negative-lookahead scoping as "according to". + const selfData = `Data from our research team shows we grew 300% in 2024. Revenue grew 250% to $50 million daily.`; + const result = scorePageCitability(makePage(selfData)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('recognizes "data from" an external source as evidence', () => { + const externalData = `Data from Bloomberg shows the market grew 40% in 2024. Revenue figures reached $50 million daily.`; + const result = scorePageCitability(makePage(externalData)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for "study by our team" and "reported by our CEO"', () => { + // Same self-referential guard for study/report/research-by-from and reported/published-by-in. + const selfStudy = `In a study by our team, we grew 300% in 2024. Reported by our CEO, revenue reached $50 million daily.`; + const result = scorePageCitability(makePage(selfStudy)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('recognizes "According to US regulators" as external evidence (not self-referential)', () => { + // /i flag was making 'us' in NOT_SELF_REF match 'US' (United States), incorrectly + // flagging legitimate citations like "According to US regulators". 'us' was dropped. + const usCitation = `According to US regulators, the market grew 40% in 2024. Revenue figures reached $50 million daily across 120 countries.`; + const result = scorePageCitability(makePage(usCitation)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for "As cited in our internal report"', () => { + // 'as cited' alone is meaningless without 'in/by SOURCE' — covered by the + // 'cited (by|in)' pattern with the self-referential guard. + const selfAsCited = `As cited in our internal report, we grew 300% in 2024 across 120 countries. Daily transactions reached $50 million.`; + const result = scorePageCitability(makePage(selfAsCited)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('still suggests attribution for "cited by our internal research team"', () => { + const selfCited = `Revenue data cited by our internal research team shows we grew 300% in 2024 across 120 countries. Daily transactions reached $50 million.`; + const result = scorePageCitability(makePage(selfCited)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); + + it('recognizes "cited by" an external source as evidence', () => { + const externalCited = `Findings cited by Stanford show the market grew 40% in 2024. Revenue figures reached $50 million daily.`; + const result = scorePageCitability(makePage(externalCited)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeUndefined(); + }); + + it('still suggests attribution for "Sources: our internal team"', () => { + // The sources?: label is a strong signal but only when followed by an external source. + const selfSources = `We grew 300% in 2024. Revenue reached $50 million daily across 120 countries. Sources: our internal team and our research department.`; + const result = scorePageCitability(makePage(selfSources)); + const evidenceHint = result.hints.find(h => h.message.includes('source links or attribution')); + expect(evidenceHint).toBeDefined(); + }); }); describe('scoreSiteCitability', () => { diff --git a/src/core/citability.ts b/src/core/citability.ts index dfdfd4b..267036b 100644 --- a/src/core/citability.ts +++ b/src/core/citability.ts @@ -197,11 +197,42 @@ function scoreStatisticalDensity(content: string, hints: ContentHint[]): Citabil if (totalMatches === 0) { hints.push({ type: 'suggestion', message: 'No statistics or factual claims found — AI favors content with concrete numbers, percentages, and dates' }); + } else if (!hasEvidenceSignals(content)) { + hints.push({ type: 'suggestion', message: 'Add source links or attribution for statistical claims so AI systems can verify them' }); } return { name: 'Statistical Density', score, maxScore: 25, details: `${totalMatches} statistical claims (${density.toFixed(1)} per 100 words)` }; } +// Shared negative-lookahead fragment used by every attribution pattern. The lookahead +// is anchored immediately after the keyword and consumes any whitespace internally so +// the engine can't backtrack \s* to 0 chars and slip a self-referential token through. +// Excludes "our/my" and "the {company,team,organization,internal}" so phrases like +// "according to our CEO" or "study from our team" don't masquerade as evidence. +// 'us' is intentionally omitted — the /i flag makes it also match "US" (United States), +// which would mis-flag legitimate external sources like "According to US regulators". +// The bare "data from us" / "according to us" cases are uncommon and not worth the +// collision with US-government citations. +const NOT_SELF_REF = String.raw`(?!\s*(?:our|my|the\s+(?:company|team|organization|internal))\b)`; + +function hasEvidenceSignals(content: string): boolean { + const evidencePatterns = [ + // External-source markers: URLs, footnote refs, or explicit attribution phrases. + // Every keyword pair must be followed by something that isn't self-referential — + // "our internal report", "data from us", "study by our team" should NOT count. + /https?:\/\/\S+/i, + /\[\^?\d+\]/, + new RegExp(String.raw`\baccording to\b${NOT_SELF_REF}`, 'i'), + new RegExp(String.raw`\bsources?:${NOT_SELF_REF}`, 'i'), + new RegExp(String.raw`\b(reported|published) (by|in)\b${NOT_SELF_REF}`, 'i'), + new RegExp(String.raw`\b(study|survey|report|research|paper|analysis) (by|from)\b${NOT_SELF_REF}`, 'i'), + new RegExp(String.raw`\bdata (from|by)\b${NOT_SELF_REF}`, 'i'), + new RegExp(String.raw`\bcited (by|in)\b${NOT_SELF_REF}`, 'i'), + ]; + + return evidencePatterns.some(pattern => pattern.test(content)); +} + /** * Dimension 4: Structure Quality (0-25) * Checks heading hierarchy, lists, short paragraphs, summary/TL;DR.