Skip to content

Commit 497fde6

Browse files
authored
Merge pull request #1658 from Hack23/copilot/refactor-news-generation-modules
refactor: decompose shared.ts and extract URL utils from generators.ts
2 parents 3b4c57f + 39df7fc commit 497fde6

8 files changed

Lines changed: 1050 additions & 880 deletions

File tree

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/**
2+
* @module data-transformers/content-generators/ai-marker-helpers
3+
* @description Banned content pattern detection.
4+
* Per SHARED_PROMPT_PATTERNS.md §BANNED Content Patterns v4.0,
5+
* these patterns must never appear in production articles.
6+
* AI agents MUST replace all AI_MUST_REPLACE markers with genuine analysis.
7+
*
8+
* @author Hack23 AB
9+
* @license Apache-2.0
10+
*/
11+
12+
/** One banned boilerplate phrase: a stable label for reporting and the regex that detects it. */
interface BannedPattern {
  label: string;
  pattern: RegExp;
}

/**
 * Table of boilerplate phrases that must not survive into production articles
 * (per SHARED_PROMPT_PATTERNS.md §BANNED Content Patterns v4.0). AI agents are
 * expected to replace any such text with genuine, document-specific analysis.
 *
 * None of the regexes carries the `g` flag, so `test()` keeps no state between
 * calls and the table can be reused safely.
 */
const BANNED_PATTERNS: readonly BannedPattern[] = [
  {
    label: 'neutralText: "The political landscape remains fluid…"',
    pattern: /The political landscape remains fluid,? with both government and opposition positioning for advantage/i,
  },
  {
    label: 'debateAnalysisMarker: "No chamber debate data is available…"',
    pattern: /No chamber debate data is available for these items,? limiting our ability/i,
  },
  {
    label: 'policySignificanceTouches: "Touches on {domains}."',
    pattern: /Touches on [\p{L}\p{N}][\p{L}\p{N}\s,&/()-]*\./iu,
  },
  {
    label: 'analysisOfNDocuments: "Analysis of N documents covering…"',
    pattern: /Analysis of \d+ documents covering/i,
  },
  {
    label: 'policySignificanceGeneric: "Requires committee review and chamber debate…"',
    pattern: /Requires committee review and chamber debate/i,
  },
  {
    label: 'topicInFocusSuffix: "…: {Topic} in Focus"',
    pattern: /:\s+\w[\w\s]*\bin Focus\b/i,
  },
  {
    label: 'briefingOnFieldLabels: "Political intelligence briefing on {Field}: and {Field}:"',
    pattern: /Political intelligence briefing on \w+:\s+and\s+\w+:/i,
  },
  // Deep Analysis generic template patterns — AI MUST replace these with specific analysis
  {
    label: 'genericTimeline: "The pace of activity signals…"',
    pattern: /The pace of activity signals the political urgency/i,
  },
  {
    label: 'genericTimeline: "define the current legislative landscape"',
    pattern: /define the current legislative landscape/i,
  },
  {
    label: 'genericWhy: "broad legislative push that will shape"',
    pattern: /broad legislative push that will shape multiple aspects/i,
  },
  {
    label: 'genericWhy: "critical period for understanding the government"',
    pattern: /critical period for understanding the government.s strategic direction/i,
  },
  {
    label: 'genericImpact: "culmination of legislative review, with recommendations that guide"',
    pattern: /culmination of legislative review,? with recommendations that guide/i,
  },
  {
    label: 'genericImpact: "interplay between governing ambition and opposition scrutiny"',
    pattern: /interplay between governing ambition and opposition scrutiny/i,
  },
  {
    label: 'genericConsequences: "cascade through committee deliberations"',
    pattern: /cascade through committee deliberations,? chamber votes/i,
  },
  {
    label: 'genericConsequences: "establish the policy alternatives that opposition parties will champion"',
    pattern: /establish the policy alternatives that opposition parties will champion/i,
  },
  {
    label: 'genericCritical: "Standard parliamentary procedures are being followed"',
    pattern: /Standard parliamentary procedures are being followed/i,
  },
  {
    label: 'genericCritical: "gap between legislative intent and implementation"',
    pattern: /gap between legislative intent and implementation often reveals/i,
  },
  {
    label: 'genericPillarTransition: "While parliament deliberates these legislative matters"',
    pattern: /While parliament deliberates these legislative matters/i,
  },
];

/**
 * Scan HTML content for banned boilerplate phrases.
 *
 * Each table entry is tested once against the full input; the labels of the
 * entries that match are returned in table order, ready for quality-gate logs
 * and error messages.
 *
 * @param html - The HTML string to scan for banned patterns
 * @returns Labels of every banned pattern found (empty array when the content is clean)
 */
export function detectBannedPatterns(html: string): string[] {
  return BANNED_PATTERNS
    .filter(({ pattern }) => pattern.test(html))
    .map(({ label }) => label);
}
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/**
2+
* @module data-transformers/content-generators/doc-type-helpers
3+
* @description Document type display names (multi-language) and title suffix templates.
4+
* Provides DOC_TYPE_DISPLAY lookup table, localizeDocType utility, and
5+
* TITLE_SUFFIX_TEMPLATES for inverted-pyramid lede construction.
6+
*
7+
* @author Hack23 AB
8+
* @license Apache-2.0
9+
*/
10+
11+
import type { Language } from '../../types/language.js';
12+
13+
/** Display names keyed by language code; a language may be absent. */
type LocalizedNames = Partial<Record<Language, string>>;

/** Localized singular/plural display names for a Riksdag document type code. */
export type DocTypeLocalization = {
  /** Name used when exactly one document is referred to. */
  singular: LocalizedNames;
  /** Name used for any other count. */
  plural: LocalizedNames;
};
18+
19+
/**
 * Multi-language display names for known Riksdag document type codes.
 *
 * Keys are the raw type codes (`prop`, `bet`, `mot`, …). Every entry carries a
 * `singular` and a `plural` map with the same fourteen language keys:
 * en, sv, da, no, fi, de, fr, es, nl, ar, he, ja, ko, zh.
 * In this table the ja/ko/zh singular and plural strings are identical.
 */
export const DOC_TYPE_DISPLAY: Readonly<Record<string, DocTypeLocalization>> = {
  // prop — "Proposition"
  prop: {
    singular: {
      en: 'Proposition', sv: 'Proposition', da: 'Proposition', no: 'Proposisjon',
      fi: 'Hallituksen esitys', de: 'Regierungsvorlage', fr: 'Projet de loi', es: 'Proposición',
      nl: 'Wetsvoorstel', ar: 'مقترح قانون', he: 'הצעת חוק', ja: '法案', ko: '정부 제출 법안', zh: '政府法案',
    },
    plural: {
      en: 'Propositions', sv: 'Propositioner', da: 'Propositioner', no: 'Proposisjoner',
      fi: 'Hallituksen esitykset', de: 'Regierungsvorlagen', fr: 'Projets de loi', es: 'Proposiciones',
      nl: 'Wetsvoorstellen', ar: 'مقترحات قوانين', he: 'הצעות חוק', ja: '法案', ko: '정부 제출 법안', zh: '政府法案',
    },
  },
  // bet — "Committee Report"
  bet: {
    singular: {
      en: 'Committee Report', sv: 'Betänkande', da: 'Udvalgsbetænkning', no: 'Komitéinnstilling',
      fi: 'Valiokunnan mietintö', de: 'Ausschussbericht', fr: 'Rapport de commission', es: 'Informe de comisión',
      nl: 'Commissieverslag', ar: 'تقرير لجنة', he: 'דוח ועדה', ja: '委員会報告書', ko: '위원회 보고서', zh: '委员会报告',
    },
    plural: {
      en: 'Committee Reports', sv: 'Betänkanden', da: 'Udvalgsbetænkninger', no: 'Komitéinnstillinger',
      fi: 'Valiokunnan mietinnöt', de: 'Ausschussberichte', fr: 'Rapports de commission', es: 'Informes de comisión',
      nl: 'Commissieverslagen', ar: 'تقارير لجان', he: 'דוחות ועדה', ja: '委員会報告書', ko: '위원회 보고서', zh: '委员会报告',
    },
  },
  // mot — "Motion"
  mot: {
    singular: {
      en: 'Motion', sv: 'Motion', da: 'Forslag', no: 'Forslag',
      fi: 'Aloite', de: 'Antrag', fr: 'Motion', es: 'Moción',
      nl: 'Motie', ar: 'مقترح', he: 'הצעה', ja: '動議', ko: '동의안', zh: '动议',
    },
    plural: {
      en: 'Motions', sv: 'Motioner', da: 'Forslag', no: 'Forslag',
      fi: 'Aloitteet', de: 'Anträge', fr: 'Motions', es: 'Mociones',
      nl: 'Moties', ar: 'مقترحات', he: 'הצעות', ja: '動議', ko: '동의안', zh: '动议',
    },
  },
  // skr — "Government Communication"
  skr: {
    singular: {
      en: 'Government Communication', sv: 'Skrivelse', da: 'Regeringsskrivelse', no: 'Regjeringsskriv',
      fi: 'Valtioneuvoston kirjelmä', de: 'Regierungsschreiben', fr: 'Communication du gouvernement', es: 'Comunicación del gobierno',
      nl: 'Regeringsmededeling', ar: 'مذكرة حكومية', he: 'מכתב ממשלתי', ja: '政府通信文書', ko: '정부 통신문', zh: '政府公文',
    },
    plural: {
      en: 'Government Communications', sv: 'Skrivelser', da: 'Regeringsskrivelser', no: 'Regjeringsskriv',
      fi: 'Valtioneuvoston kirjelmät', de: 'Regierungsschreiben', fr: 'Communications du gouvernement', es: 'Comunicaciones del gobierno',
      nl: 'Regeringsmededelingen', ar: 'مذكرات حكومية', he: 'מכתבים ממשלתיים', ja: '政府通信文書', ko: '정부 통신문', zh: '政府公文',
    },
  },
  // sfs — "Law/Statute"
  sfs: {
    singular: {
      en: 'Law/Statute', sv: 'Lag/förordning', da: 'Lov/forordning', no: 'Lov/forordning',
      fi: 'Laki/asetus', de: 'Gesetz/Verordnung', fr: 'Loi/Règlement', es: 'Ley/Reglamento',
      nl: 'Wet/Verordening', ar: 'قانون / لائحة', he: 'חוק/תקנה', ja: '法律/条例', ko: '법률/법규', zh: '法律/法规',
    },
    plural: {
      en: 'Laws/Statutes', sv: 'Lagar/förordningar', da: 'Love/forordninger', no: 'Lover/forordninger',
      fi: 'Lait/asetukset', de: 'Gesetze/Verordnungen', fr: 'Lois/Règlements', es: 'Leyes/Reglamentos',
      nl: 'Wetten/Verordeningen', ar: 'قوانين / لوائح', he: 'חוקים/תקנות', ja: '法律/条例', ko: '법률/법규', zh: '法律/法规',
    },
  },
  // fpm — "EU Position Paper"
  fpm: {
    singular: {
      en: 'EU Position Paper', sv: 'Faktapromemoria', da: 'EU-faktanota', no: 'EU-faktanotat',
      fi: 'EU-tietomuistio', de: 'EU-Positionspapier', fr: 'Note de position UE', es: 'Documento de posición de la UE',
      nl: 'EU-positiepaper', ar: 'ورقة موقف للاتحاد الأوروبي', he: 'מסמך עמדה של האיחוד האירופי', ja: 'EUポジションペーパー', ko: 'EU 입장 문서', zh: '欧盟立场文件',
    },
    plural: {
      en: 'EU Position Papers', sv: 'Faktapromemorior', da: 'EU-faktanotaer', no: 'EU-faktanotater',
      fi: 'EU-tietomuistiot', de: 'EU-Positionspapiere', fr: 'Notes de position UE', es: 'Documentos de posición de la UE',
      nl: 'EU-positiepapers', ar: 'أوراق موقف للاتحاد الأوروبي', he: 'מסמכי עמדה של האיחוד האירופי', ja: 'EUポジションペーパー', ko: 'EU 입장 문서', zh: '欧盟立场文件',
    },
  },
  // pressm — "Press Release"
  pressm: {
    singular: {
      en: 'Press Release', sv: 'Pressmeddelande', da: 'Pressemeddelelse', no: 'Pressemelding',
      fi: 'Lehdistötiedote', de: 'Pressemitteilung', fr: 'Communiqué de presse', es: 'Comunicado de prensa',
      nl: 'Persbericht', ar: 'بيان صحفي', he: 'הודעה לעיתונות', ja: 'プレスリリース', ko: '보도자료', zh: '新闻稿',
    },
    plural: {
      en: 'Press Releases', sv: 'Pressmeddelanden', da: 'Pressemeddelelser', no: 'Pressemeldinger',
      fi: 'Lehdistötiedotteet', de: 'Pressemitteilungen', fr: 'Communiqués de presse', es: 'Comunicados de prensa',
      nl: 'Persberichten', ar: 'بيانات صحفية', he: 'הודעות לעיתונות', ja: 'プレスリリース', ko: '보도자료', zh: '新闻稿',
    },
  },
  // ext — "External Reference"
  ext: {
    singular: {
      en: 'External Reference', sv: 'Extern referens', da: 'Ekstern reference', no: 'Ekstern referanse',
      fi: 'Ulkoinen viite', de: 'Externe Referenz', fr: 'Référence externe', es: 'Referencia externa',
      nl: 'Externe referentie', ar: 'مرجع خارجي', he: 'הפניה חיצונית', ja: '外部参照', ko: '외부 참조', zh: '外部参考',
    },
    plural: {
      en: 'External References', sv: 'Externa referenser', da: 'Eksterne referencer', no: 'Eksterne referanser',
      fi: 'Ulkoiset viitteet', de: 'Externe Referenzen', fr: 'Références externes', es: 'Referencias externas',
      nl: 'Externe referenties', ar: 'مراجع خارجية', he: 'הפניות חיצוניות', ja: '外部参照', ko: '외부 참조', zh: '外部参考',
    },
  },
  // other — "Other Document"
  other: {
    singular: {
      en: 'Other Document', sv: 'Övrigt dokument', da: 'Andet dokument', no: 'Annet dokument',
      fi: 'Muu asiakirja', de: 'Sonstiges Dokument', fr: 'Autre document', es: 'Otro documento',
      nl: 'Overig document', ar: 'مستند آخر', he: 'מסמך אחר', ja: 'その他の文書', ko: '기타 문서', zh: '其他文件',
    },
    plural: {
      en: 'Other Documents', sv: 'Övriga dokument', da: 'Andre dokumenter', no: 'Andre dokumenter',
      fi: 'Muut asiakirjat', de: 'Sonstige Dokumente', fr: 'Autres documents', es: 'Otros documentos',
      nl: 'Overige documenten', ar: 'مستندات أخرى', he: 'מסמכים אחרים', ja: 'その他の文書', ko: '기타 문서', zh: '其他文件',
    },
  },
};
130+
131+
/**
 * Localise a raw Riksdag document type code for display (singular/plural-aware,
 * multi-language).
 *
 * Lookup order: requested form in `lang` → requested form in English →
 * the other grammatical form in `lang` → the other form in English → `code`.
 *
 * Both the code lookup and the language lookup are restricted to *own*
 * properties. A plain `obj[key]` also sees `Object.prototype`, so an input such
 * as `"constructor"` or `"__proto__"` would otherwise resolve to a non-entry
 * value and either throw or return something that is not a string.
 *
 * @param code - Raw document type code (e.g. `prop`, `bet`); returned unchanged when unknown
 * @param lang - Target language code; unknown languages fall back to English
 * @param count - Number of documents; exactly `1` selects the singular form,
 *   anything else (including omitted) selects the plural form
 * @returns The localized display name, or `code` when no name is available
 */
export function localizeDocType(code: string, lang: Language | string, count?: number): string {
  const hasOwn = (obj: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(obj, key);

  // Unknown codes — including names inherited from Object.prototype — pass through untouched.
  if (!hasOwn(DOC_TYPE_DISPLAY, code)) return code;
  const entry = DOC_TYPE_DISPLAY[code];
  if (!entry) return code;

  const usePlural = count !== 1;
  const primary = usePlural ? entry.plural : entry.singular;
  const fallback = usePlural ? entry.singular : entry.plural;

  // Only accept a translation that is actually stored in the table for `lang`.
  const nameIn = (names: DocTypeLocalization['singular']): string | undefined =>
    hasOwn(names, lang) ? names[lang as Language] : undefined;

  return nameIn(primary) ?? primary.en ?? nameIn(fallback) ?? fallback.en ?? code;
}
140+
141+
/**
 * Per-language title-suffix templates for inverted-pyramid lede construction.
 *
 * Each function takes a document title and returns the fragment to append,
 * already including its leading separator and the quotation marks used for
 * that language. There is no `en` entry in this table.
 */
export const TITLE_SUFFIX_TEMPLATES: Readonly<Record<string, (t: string) => string>> = {
  sv: (title: string): string => ` — inklusive "${title}"`,
  da: (title: string): string => ` — herunder "${title}"`,
  no: (title: string): string => ` — inkludert "${title}"`,
  fi: (title: string): string => ` — mukaan lukien "${title}"`,
  de: (title: string): string => ` — darunter "${title}"`,
  fr: (title: string): string => ` — notamment "${title}"`,
  es: (title: string): string => ` — incluyendo "${title}"`,
  nl: (title: string): string => ` — inclusief "${title}"`,
  ar: (title: string): string => ` — بما فيها "${title}"`,
  he: (title: string): string => ` — כולל "${title}"`,
  ja: (title: string): string => `、「${title}」を含む`,
  ko: (title: string): string => `, "${title}" 포함`,
  zh: (title: string): string => `,包括"${title}"`,
};
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/**
2+
* @module data-transformers/content-generators/event-helpers
3+
* @description Calendar event and document matching helpers.
4+
* Provides keyword extraction, related document lookup, and
5+
* minister extraction from interpellation summaries.
6+
*
7+
* @author Hack23 AB
8+
* @license Apache-2.0
9+
*/
10+
11+
import type { RawDocument, RawCalendarEvent } from '../types.js';
12+
13+
/**
 * Break text into lower-cased keywords for cross-reference matching.
 *
 * The text is split on runs of whitespace, commas and hyphens; tokens shorter
 * than two characters are dropped, so short acronyms such as "EU" or "KU"
 * are kept.
 *
 * @param text - Free text, typically an event or document title
 * @returns Lower-cased tokens of at least two characters, in source order
 */
function extractKeywords(text: string): string[] {
  const tokens = text.toLowerCase().split(/[\s,-]+/u);
  const keywords: string[] = [];
  for (const token of tokens) {
    if (token.length >= 2) {
      keywords.push(token);
    }
  }
  return keywords;
}
18+
19+
/**
 * Find documents related to a calendar event (at most 3).
 *
 * A document is related when either:
 *  - both the event and the document name an organ and the names are equal
 *    ignoring case (the document's organ is `organ`, else `committee`), or
 *  - the document title (`titel`, else `title`) contains, as a substring, any
 *    keyword extracted from the event heading (`rubrik`, else `titel`, else `title`).
 *
 * @param event - Calendar event to match against
 * @param documents - Candidate documents
 * @returns The first three matching documents, in input order
 */
export function findRelatedDocuments(event: RawCalendarEvent, documents: RawDocument[]): RawDocument[] {
  const organOfEvent = event.organ ?? '';
  const eventKeywords = extractKeywords(event.rubrik ?? event.titel ?? event.title ?? '');

  const isRelated = (candidate: RawDocument): boolean => {
    const organOfDoc = candidate.organ ?? candidate.committee ?? '';
    // Organ match only counts when both sides actually carry an organ.
    if (organOfEvent && organOfDoc) {
      if (organOfEvent.toLowerCase() === organOfDoc.toLowerCase()) {
        return true;
      }
    }
    const loweredTitle = (candidate.titel ?? candidate.title ?? '').toLowerCase();
    return eventKeywords.some(keyword => loweredTitle.includes(keyword));
  };

  const related = documents.filter(isRelated);
  return related.slice(0, 3);
}
30+
31+
/**
 * Find written questions related to a calendar event (at most 3).
 *
 * A question is related when its title (`titel`, else `title`) contains, as a
 * substring, any keyword extracted from the event heading (`rubrik`, else
 * `titel`, else `title`).
 *
 * @param event - Calendar event to match against
 * @param questions - Candidate written questions
 * @returns The first three matching questions, in input order
 */
export function findRelatedQuestions(event: RawCalendarEvent, questions: RawDocument[]): RawDocument[] {
  const eventKeywords = extractKeywords(event.rubrik ?? event.titel ?? event.title ?? '');

  const sharesKeyword = (question: RawDocument): boolean => {
    const loweredTitle = (question.titel ?? question.title ?? '').toLowerCase();
    return eventKeywords.some(keyword => loweredTitle.includes(keyword));
  };

  const related = questions.filter(sharesKeyword);
  return related.slice(0, 3);
}
39+
40+
/**
 * Extract the targeted minister from an interpellation summary.
 *
 * Looks for the first "till MINISTER" occurrence, takes the rest of that line,
 * and cuts it at whichever comes first: a topic clause introduced by
 * " om ", " angående ", " rörande " or " beträffande " (case-insensitive), or
 * one of the punctuation characters `? : ; . ,`.
 *
 * The topic-clause search runs directly on the captured text. Searching a
 * lower-cased copy and reusing the index is unsafe because `toLowerCase()` can
 * change the string length (e.g. "İ" becomes two code units), which shifts the
 * cut position and leaks part of the topic clause into the result.
 *
 * @param summary - Interpellation summary text
 * @returns The trimmed minister text, or `''` when no "till …" header is found
 */
export function extractMinister(summary: string): string {
  // [^\S\n]+ is "whitespace except newline", so the capture never spills onto the next line.
  const header = /\btill[^\S\n]+([^\n]+)/i.exec(summary);
  const raw = header?.[1]?.trim() ?? '';
  if (!raw) return '';

  let end = raw.length;

  // Leftmost topic clause; `u` makes case-insensitive matching use Unicode case folding.
  const clauseIdx = raw.search(/ (?:om|angående|rörande|beträffande) /iu);
  if (clauseIdx !== -1) end = clauseIdx;

  // Cut at terminating punctuation if it comes earlier.
  const punctIdx = raw.search(/[?:;.,]/);
  if (punctIdx !== -1 && punctIdx < end) end = punctIdx;

  return raw.slice(0, end).trim();
}

0 commit comments

Comments
 (0)