Skip to content
Merged
56 changes: 56 additions & 0 deletions scripts/data-transformers/content-generators/ai-marker-helpers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/**
* @module data-transformers/content-generators/ai-marker-helpers
* @description Banned content pattern detection.
* Per SHARED_PROMPT_PATTERNS.md §BANNED Content Patterns v4.0,
* these patterns must never appear in production articles.
* AI agents MUST replace all AI_MUST_REPLACE markers with genuine analysis.
*
* @author Hack23 AB
* @license Apache-2.0
*/

/**
 * Banned content patterns that indicate low-quality boilerplate text.
 * Per SHARED_PROMPT_PATTERNS.md §BANNED Content Patterns v4.0, these
 * must never appear in production articles. AI agents MUST replace them
 * with genuine, document-specific analysis.
 *
 * Each entry pairs a stable, human-readable `label` (used in quality gate
 * output) with the `pattern` that detects the boilerplate. All patterns are
 * case-insensitive and none uses the `g`/`y` flag, so calling `test()` on
 * them repeatedly is stateless.
 *
 * Patterns with a free-text slot ({domains}, {Topic}, {Field}) match Unicode
 * letters and digits rather than ASCII-only `\w`, so slots filled with
 * non-ASCII text (e.g. "Försvar") are still detected.
 */
const BANNED_PATTERNS: readonly { label: string; pattern: RegExp }[] = [
  { label: 'neutralText: "The political landscape remains fluid…"', pattern: /The political landscape remains fluid,? with both government and opposition positioning for advantage/i },
  { label: 'debateAnalysisMarker: "No chamber debate data is available…"', pattern: /No chamber debate data is available for these items,? limiting our ability/i },
  { label: 'policySignificanceTouches: "Touches on {domains}."', pattern: /Touches on [\p{L}\p{N}][\p{L}\p{N}\s,&/()-]*\./iu },
  { label: 'analysisOfNDocuments: "Analysis of N documents covering…"', pattern: /Analysis of \d+ documents covering/i },
  { label: 'policySignificanceGeneric: "Requires committee review and chamber debate…"', pattern: /Requires committee review and chamber debate/i },
  // Unicode-aware topic slot: [\p{L}\p{N}_] is the Unicode counterpart of \w
  { label: 'topicInFocusSuffix: "…: {Topic} in Focus"', pattern: /:\s+[\p{L}\p{N}_][\p{L}\p{N}_\s]*\bin Focus\b/iu },
  // Unicode-aware field-label slots, for the same reason as above
  { label: 'briefingOnFieldLabels: "Political intelligence briefing on {Field}: and {Field}:"', pattern: /Political intelligence briefing on [\p{L}\p{N}_]+:\s+and\s+[\p{L}\p{N}_]+:/iu },
  // Deep Analysis generic template patterns — AI MUST replace these with specific analysis
  { label: 'genericTimeline: "The pace of activity signals…"', pattern: /The pace of activity signals the political urgency/i },
  { label: 'genericTimeline: "define the current legislative landscape"', pattern: /define the current legislative landscape/i },
  { label: 'genericWhy: "broad legislative push that will shape"', pattern: /broad legislative push that will shape multiple aspects/i },
  // The "." in "government.s" accepts either a straight or a typographic apostrophe
  { label: 'genericWhy: "critical period for understanding the government"', pattern: /critical period for understanding the government.s strategic direction/i },
  { label: 'genericImpact: "culmination of legislative review, with recommendations that guide"', pattern: /culmination of legislative review,? with recommendations that guide/i },
  { label: 'genericImpact: "interplay between governing ambition and opposition scrutiny"', pattern: /interplay between governing ambition and opposition scrutiny/i },
  { label: 'genericConsequences: "cascade through committee deliberations"', pattern: /cascade through committee deliberations,? chamber votes/i },
  { label: 'genericConsequences: "establish the policy alternatives that opposition parties will champion"', pattern: /establish the policy alternatives that opposition parties will champion/i },
  { label: 'genericCritical: "Standard parliamentary procedures are being followed"', pattern: /Standard parliamentary procedures are being followed/i },
  { label: 'genericCritical: "gap between legislative intent and implementation"', pattern: /gap between legislative intent and implementation often reveals/i },
  { label: 'genericPillarTransition: "While parliament deliberates these legislative matters"', pattern: /While parliament deliberates these legislative matters/i },
];

/**
 * Scan HTML content for banned boilerplate patterns.
 *
 * Every entry of BANNED_PATTERNS is tested against the full input string;
 * the labels of the entries that match are returned in table order, which
 * makes the result suitable for quality gate logs and error messages.
 *
 * @param html - The HTML string to scan for banned patterns
 * @returns Labels of all banned patterns found (empty array when none match)
 */
export function detectBannedPatterns(html: string): string[] {
  return BANNED_PATTERNS
    .filter(entry => entry.pattern.test(html))
    .map(entry => entry.label);
}
156 changes: 156 additions & 0 deletions scripts/data-transformers/content-generators/doc-type-helpers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/**
* @module data-transformers/content-generators/doc-type-helpers
* @description Document type display names (multi-language) and title suffix templates.
* Provides DOC_TYPE_DISPLAY lookup table, localizeDocType utility, and
* TITLE_SUFFIX_TEMPLATES for inverted-pyramid lede construction.
*
* @author Hack23 AB
* @license Apache-2.0
*/

import type { Language } from '../../types/language.js';

/** Display strings keyed by language; any language may be absent. */
type LocalizedNames = Partial<Record<Language, string>>;

/**
 * Singular and plural display names for one Riksdag document type code,
 * each given as a per-language lookup.
 */
export type DocTypeLocalization = {
  singular: LocalizedNames;
  plural: LocalizedNames;
};

/**
 * Multi-language display names for known Riksdag document type codes.
 *
 * Keys are the raw type codes (`prop`, `bet`, `mot`, `skr`, `sfs`, `fpm`,
 * `pressm`, `ext`, `other`). Each entry carries a `singular` and a `plural`
 * lookup with strings for en, sv, da, no, fi, de, fr, es, nl, ar, he, ja,
 * ko and zh.
 *
 * The ja, ko and zh entries use the same string for singular and plural,
 * as do a few others (e.g. da/no `mot`).
 */
export const DOC_TYPE_DISPLAY: Readonly<Record<string, DocTypeLocalization>> = {
prop: {
singular: {
en: 'Proposition', sv: 'Proposition', da: 'Proposition', no: 'Proposisjon',
fi: 'Hallituksen esitys', de: 'Regierungsvorlage', fr: 'Projet de loi', es: 'Proposición',
nl: 'Wetsvoorstel', ar: 'مقترح قانون', he: 'הצעת חוק', ja: '法案', ko: '정부 제출 법안', zh: '政府法案',
},
plural: {
en: 'Propositions', sv: 'Propositioner', da: 'Propositioner', no: 'Proposisjoner',
fi: 'Hallituksen esitykset', de: 'Regierungsvorlagen', fr: 'Projets de loi', es: 'Proposiciones',
nl: 'Wetsvoorstellen', ar: 'مقترحات قوانين', he: 'הצעות חוק', ja: '法案', ko: '정부 제출 법안', zh: '政府法案',
},
},
bet: {
singular: {
en: 'Committee Report', sv: 'Betänkande', da: 'Udvalgsbetænkning', no: 'Komitéinnstilling',
fi: 'Valiokunnan mietintö', de: 'Ausschussbericht', fr: 'Rapport de commission', es: 'Informe de comisión',
nl: 'Commissieverslag', ar: 'تقرير لجنة', he: 'דוח ועדה', ja: '委員会報告書', ko: '위원회 보고서', zh: '委员会报告',
},
plural: {
en: 'Committee Reports', sv: 'Betänkanden', da: 'Udvalgsbetænkninger', no: 'Komitéinnstillinger',
fi: 'Valiokunnan mietinnöt', de: 'Ausschussberichte', fr: 'Rapports de commission', es: 'Informes de comisión',
nl: 'Commissieverslagen', ar: 'تقارير لجان', he: 'דוחות ועדה', ja: '委員会報告書', ko: '위원회 보고서', zh: '委员会报告',
},
},
mot: {
singular: {
en: 'Motion', sv: 'Motion', da: 'Forslag', no: 'Forslag',
fi: 'Aloite', de: 'Antrag', fr: 'Motion', es: 'Moción',
nl: 'Motie', ar: 'مقترح', he: 'הצעה', ja: '動議', ko: '동의안', zh: '动议',
},
plural: {
en: 'Motions', sv: 'Motioner', da: 'Forslag', no: 'Forslag',
fi: 'Aloitteet', de: 'Anträge', fr: 'Motions', es: 'Mociones',
nl: 'Moties', ar: 'مقترحات', he: 'הצעות', ja: '動議', ko: '동의안', zh: '动议',
},
},
skr: {
singular: {
en: 'Government Communication', sv: 'Skrivelse', da: 'Regeringsskrivelse', no: 'Regjeringsskriv',
fi: 'Valtioneuvoston kirjelmä', de: 'Regierungsschreiben', fr: 'Communication du gouvernement', es: 'Comunicación del gobierno',
nl: 'Regeringsmededeling', ar: 'مذكرة حكومية', he: 'מכתב ממשלתי', ja: '政府通信文書', ko: '정부 통신문', zh: '政府公文',
},
plural: {
en: 'Government Communications', sv: 'Skrivelser', da: 'Regeringsskrivelser', no: 'Regjeringsskriv',
fi: 'Valtioneuvoston kirjelmät', de: 'Regierungsschreiben', fr: 'Communications du gouvernement', es: 'Comunicaciones del gobierno',
nl: 'Regeringsmededelingen', ar: 'مذكرات حكومية', he: 'מכתבים ממשלתיים', ja: '政府通信文書', ko: '정부 통신문', zh: '政府公文',
},
},
sfs: {
singular: {
en: 'Law/Statute', sv: 'Lag/förordning', da: 'Lov/forordning', no: 'Lov/forordning',
fi: 'Laki/asetus', de: 'Gesetz/Verordnung', fr: 'Loi/Règlement', es: 'Ley/Reglamento',
nl: 'Wet/Verordening', ar: 'قانون / لائحة', he: 'חוק/תקנה', ja: '法律/条例', ko: '법률/법규', zh: '法律/法规',
},
plural: {
en: 'Laws/Statutes', sv: 'Lagar/förordningar', da: 'Love/forordninger', no: 'Lover/forordninger',
fi: 'Lait/asetukset', de: 'Gesetze/Verordnungen', fr: 'Lois/Règlements', es: 'Leyes/Reglamentos',
nl: 'Wetten/Verordeningen', ar: 'قوانين / لوائح', he: 'חוקים/תקנות', ja: '法律/条例', ko: '법률/법규', zh: '法律/法规',
},
},
fpm: {
singular: {
en: 'EU Position Paper', sv: 'Faktapromemoria', da: 'EU-faktanota', no: 'EU-faktanotat',
fi: 'EU-tietomuistio', de: 'EU-Positionspapier', fr: 'Note de position UE', es: 'Documento de posición de la UE',
nl: 'EU-positiepaper', ar: 'ورقة موقف للاتحاد الأوروبي', he: 'מסמך עמדה של האיחוד האירופי', ja: 'EUポジションペーパー', ko: 'EU 입장 문서', zh: '欧盟立场文件',
},
plural: {
en: 'EU Position Papers', sv: 'Faktapromemorior', da: 'EU-faktanotaer', no: 'EU-faktanotater',
fi: 'EU-tietomuistiot', de: 'EU-Positionspapiere', fr: 'Notes de position UE', es: 'Documentos de posición de la UE',
nl: 'EU-positiepapers', ar: 'أوراق موقف للاتحاد الأوروبي', he: 'מסמכי עמדה של האיחוד האירופי', ja: 'EUポジションペーパー', ko: 'EU 입장 문서', zh: '欧盟立场文件',
},
},
pressm: {
singular: {
en: 'Press Release', sv: 'Pressmeddelande', da: 'Pressemeddelelse', no: 'Pressemelding',
fi: 'Lehdistötiedote', de: 'Pressemitteilung', fr: 'Communiqué de presse', es: 'Comunicado de prensa',
nl: 'Persbericht', ar: 'بيان صحفي', he: 'הודעה לעיתונות', ja: 'プレスリリース', ko: '보도자료', zh: '新闻稿',
},
plural: {
en: 'Press Releases', sv: 'Pressmeddelanden', da: 'Pressemeddelelser', no: 'Pressemeldinger',
fi: 'Lehdistötiedotteet', de: 'Pressemitteilungen', fr: 'Communiqués de presse', es: 'Comunicados de prensa',
nl: 'Persberichten', ar: 'بيانات صحفية', he: 'הודעות לעיתונות', ja: 'プレスリリース', ko: '보도자료', zh: '新闻稿',
},
},
ext: {
singular: {
en: 'External Reference', sv: 'Extern referens', da: 'Ekstern reference', no: 'Ekstern referanse',
fi: 'Ulkoinen viite', de: 'Externe Referenz', fr: 'Référence externe', es: 'Referencia externa',
nl: 'Externe referentie', ar: 'مرجع خارجي', he: 'הפניה חיצונית', ja: '外部参照', ko: '외부 참조', zh: '外部参考',
},
plural: {
en: 'External References', sv: 'Externa referenser', da: 'Eksterne referencer', no: 'Eksterne referanser',
fi: 'Ulkoiset viitteet', de: 'Externe Referenzen', fr: 'Références externes', es: 'Referencias externas',
nl: 'Externe referenties', ar: 'مراجع خارجية', he: 'הפניות חיצוניות', ja: '外部参照', ko: '외부 참조', zh: '外部参考',
},
},
other: {
singular: {
en: 'Other Document', sv: 'Övrigt dokument', da: 'Andet dokument', no: 'Annet dokument',
fi: 'Muu asiakirja', de: 'Sonstiges Dokument', fr: 'Autre document', es: 'Otro documento',
nl: 'Overig document', ar: 'مستند آخر', he: 'מסמך אחר', ja: 'その他の文書', ko: '기타 문서', zh: '其他文件',
},
plural: {
en: 'Other Documents', sv: 'Övriga dokument', da: 'Andre dokumenter', no: 'Andre dokumenter',
fi: 'Muut asiakirjat', de: 'Sonstige Dokumente', fr: 'Autres documents', es: 'Otros documentos',
nl: 'Overige documenten', ar: 'مستندات أخرى', he: 'מסמכים אחרים', ja: 'その他の文書', ko: '기타 문서', zh: '其他文件',
},
},
};

/**
 * Localise a raw Riksdag document type code for display
 * (singular/plural-aware, multi-language).
 *
 * The plural form is used unless `count` is exactly 1 (so an omitted count
 * yields the plural). Lookup order: requested language in the chosen form,
 * English in the chosen form, requested language in the other form, English
 * in the other form, and finally the raw code itself.
 *
 * Both lookups are restricted to own properties, so inputs that happen to
 * name an `Object.prototype` member ("constructor", "toString", "__proto__")
 * are treated as unknown instead of throwing or returning a non-string.
 *
 * @param code - Raw document type code, e.g. "prop" or "bet"
 * @param lang - Target language code; unknown values fall back to English
 * @param count - Number of documents; only the value 1 selects the singular
 * @returns The localized display name, or `code` when the type is unknown
 */
export function localizeDocType(code: string, lang: Language | string, count?: number): string {
  const hasOwn = (obj: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(obj, key);

  // Guard against inherited members: DOC_TYPE_DISPLAY['constructor'] is truthy
  // but has no singular/plural tables.
  const entry = hasOwn(DOC_TYPE_DISPLAY, code) ? DOC_TYPE_DISPLAY[code] : undefined;
  if (!entry) return code;

  const usePlural = count !== 1;
  const primary = usePlural ? entry.plural : entry.singular;
  const fallback = usePlural ? entry.singular : entry.plural;

  // Same guard for the language key, which callers may pass as an arbitrary string.
  const pick = (names: Partial<Record<Language, string>>): string | undefined =>
    hasOwn(names, lang) ? names[lang as Language] : undefined;

  return pick(primary) ?? primary.en ?? pick(fallback) ?? fallback.en ?? code;
}

/**
 * Title-suffix builders for inverted-pyramid lede construction, keyed by
 * language code. Each builder takes a document title and returns the
 * localized "including <title>" fragment, with its leading separator.
 * There is no `en` entry in this table.
 */
export const TITLE_SUFFIX_TEMPLATES: Readonly<Record<string, (t: string) => string>> = {
  sv: (title) => ` — inklusive "${title}"`,
  da: (title) => ` — herunder "${title}"`,
  no: (title) => ` — inkludert "${title}"`,
  fi: (title) => ` — mukaan lukien "${title}"`,
  de: (title) => ` — darunter "${title}"`,
  fr: (title) => ` — notamment "${title}"`,
  es: (title) => ` — incluyendo "${title}"`,
  nl: (title) => ` — inclusief "${title}"`,
  ar: (title) => ` — بما فيها "${title}"`,
  he: (title) => ` — כולל "${title}"`,
  ja: (title) => `、「${title}」を含む`,
  ko: (title) => `, "${title}" 포함`,
  zh: (title) => `,包括"${title}"`,
};
62 changes: 62 additions & 0 deletions scripts/data-transformers/content-generators/event-helpers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* @module data-transformers/content-generators/event-helpers
* @description Calendar event and document matching helpers.
* Provides keyword extraction, related document lookup, and
* minister extraction from interpellation summaries.
*
* @author Hack23 AB
* @license Apache-2.0
*/

import type { RawDocument, RawCalendarEvent } from '../types.js';

/**
 * Break text into lower-cased keywords for cross-reference matching.
 * Tokens are separated by runs of whitespace, commas, hyphens or en dashes;
 * tokens shorter than 2 characters are dropped (so short codes such as
 * "EU" or "KU" are kept).
 */
function extractKeywords(text: string): string[] {
  const separators = /[\s,–-]+/u;
  const keywords: string[] = [];
  for (const token of text.toLowerCase().split(separators)) {
    if (token.length >= 2) {
      keywords.push(token);
    }
  }
  return keywords;
}

/**
 * Select up to three documents related to a calendar event.
 *
 * A document is related when its organ (or committee) equals the event's
 * organ ignoring case, or when its title contains any keyword extracted
 * from the event heading (`rubrik`, else `titel`, else `title`).
 * Matches are returned in input order.
 */
export function findRelatedDocuments(event: RawCalendarEvent, documents: RawDocument[]): RawDocument[] {
  const eventOrgan = event.organ ?? '';
  const heading = event.rubrik ?? event.titel ?? event.title ?? '';
  const keywords = extractKeywords(heading);

  const isRelated = (doc: RawDocument): boolean => {
    const docOrgan = doc.organ ?? doc.committee ?? '';
    // Organ comparison only applies when both sides are non-empty
    if (eventOrgan && docOrgan) {
      if (eventOrgan.toLowerCase() === docOrgan.toLowerCase()) return true;
    }
    const docTitle = (doc.titel ?? doc.title ?? '').toLowerCase();
    return keywords.some(keyword => docTitle.includes(keyword));
  };

  const related = documents.filter(isRelated);
  return related.slice(0, 3);
}

/**
 * Select up to three written questions related to a calendar event.
 *
 * A question is related when its title contains any keyword extracted from
 * the event heading (`rubrik`, else `titel`, else `title`). Matches are
 * returned in input order.
 */
export function findRelatedQuestions(event: RawCalendarEvent, questions: RawDocument[]): RawDocument[] {
  const heading = event.rubrik ?? event.titel ?? event.title ?? '';
  const keywords = extractKeywords(heading);

  const mentionsKeyword = (question: RawDocument): boolean => {
    const questionTitle = (question.titel ?? question.title ?? '').toLowerCase();
    return keywords.some(keyword => questionTitle.includes(keyword));
  };

  const related = questions.filter(mentionsKeyword);
  return related.slice(0, 3);
}

/**
 * Extract the targeted minister name from an interpellation summary.
 *
 * Takes the text following the first "till" (case-insensitive) up to the end
 * of that line, then cuts it at whichever comes first: a topic clause
 * (" om ", " angående ", " rörande ", " beträffande ", in any letter case)
 * or one of the punctuation characters `? : ; . ,`.
 *
 * The cut position is searched on the original text rather than on a
 * lower-cased copy: `toLowerCase()` can change a string's length (U+0130
 * becomes two code units), which would shift offsets and leak part of the
 * topic clause into the result.
 *
 * @param summary - Interpellation summary text
 * @returns The trimmed minister text, or '' when no "till ..." line is found
 */
export function extractMinister(summary: string): string {
  // Use non-newline whitespace ([^\S\n]+) so we don't cross into the next line
  const header = /\btill[^\S\n]+([^\n]+)/i.exec(summary);
  const raw = header?.[1]?.trim() ?? '';
  if (!raw) return '';

  // Earliest topic clause or terminating punctuation, matched case-insensitively
  // on `raw` itself so the offset is always valid for slicing.
  const terminator = / (?:om|angående|rörande|beträffande) |[?:;.,]/iu;
  const cut = raw.search(terminator);

  return (cut === -1 ? raw : raw.slice(0, cut)).trim();
}
Loading
Loading