From faff16818e4727f2daec6e6e17b528452f37fcd5 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:37:10 +0200 Subject: [PATCH 01/26] feat(recipes): add Recipe + Action Zod schemas Co-Authored-By: Claude Sonnet 4.6 --- lib/recipes.js | 31 ++++++++++++++ test/recipes-loader.test.js | 82 +++++++++++++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 lib/recipes.js create mode 100644 test/recipes-loader.test.js diff --git a/lib/recipes.js b/lib/recipes.js new file mode 100644 index 0000000..8b5903e --- /dev/null +++ b/lib/recipes.js @@ -0,0 +1,31 @@ +import { z } from 'zod'; + +const ActionSchema = z.discriminatedUnion('action', [ + z.object({ action: z.literal('remove-attr'), selector: z.string().min(1), attr: z.string().min(1) }), + z.object({ action: z.literal('remove-class'), selector: z.string().min(1), class: z.string().min(1) }), + z.object({ action: z.literal('remove-element'), selector: z.string().min(1) }), + z.object({ action: z.literal('unwrap'), selector: z.string().min(1) }), +]); + +const FetchSchema = z.object({ + render: z.enum(['force', 'skip']).optional(), + wait_for: z.string().min(1).optional(), + wait_timeout_ms: z.number().int().min(0).max(15000).optional(), + mobile_ua: z.boolean().optional(), +}).strict(); + +const SelectSchema = z.object({ + remove: z.array(z.string().min(1)).default([]), +}).strict(); + +export const RecipeSchema = z.object({ + name: z.string().min(1), + host: z.union([z.string().min(1), z.array(z.string().min(1)).min(1)]), + path: z.string().min(1).default('/**'), + preprocess: z.array(ActionSchema).default([]), + select: SelectSchema.default({ remove: [] }), + extractor: z.enum(['readability', 'trafilatura', 'playwright']).optional(), + fetch: FetchSchema.default({}), +}).strict(); + +export const ActionEnumSchema = ActionSchema; // re-export for tests diff --git a/test/recipes-loader.test.js b/test/recipes-loader.test.js new file 
mode 100644 index 0000000..09a221c --- /dev/null +++ b/test/recipes-loader.test.js @@ -0,0 +1,82 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { RecipeSchema } from '../lib/recipes.js'; + +describe('RecipeSchema', () => { + it('accepts a minimal recipe with name + host', () => { + const result = RecipeSchema.safeParse({ name: 'r1', host: 'example.com' }); + assert.equal(result.success, true); + }); + + it('accepts host as string array', () => { + const result = RecipeSchema.safeParse({ name: 'r1', host: ['a.com', 'b.com'] }); + assert.equal(result.success, true); + }); + + it('rejects when name is missing', () => { + const result = RecipeSchema.safeParse({ host: 'example.com' }); + assert.equal(result.success, false); + }); + + it('rejects when host is missing', () => { + const result = RecipeSchema.safeParse({ name: 'r1' }); + assert.equal(result.success, false); + }); + + it('accepts all four preprocess actions', () => { + const recipe = { + name: 'r1', host: 'a.com', + preprocess: [ + { action: 'remove-attr', selector: 'p', attr: 'aria-hidden' }, + { action: 'remove-class', selector: 'p', class: 'paywall' }, + { action: 'remove-element', selector: 'aside.ads' }, + { action: 'unwrap', selector: 'span.wrapper' }, + ], + }; + assert.equal(RecipeSchema.safeParse(recipe).success, true); + }); + + it('rejects unknown preprocess action', () => { + const recipe = { + name: 'r1', host: 'a.com', + preprocess: [{ action: 'acton', selector: 'p', attr: 'x' }], + }; + assert.equal(RecipeSchema.safeParse(recipe).success, false); + }); + + it('accepts fetch options', () => { + const recipe = { + name: 'r1', host: 'a.com', + fetch: { render: 'force', wait_for: '.x', wait_timeout_ms: 5000, mobile_ua: true }, + }; + assert.equal(RecipeSchema.safeParse(recipe).success, true); + }); + + it('rejects fetch.render outside the enum', () => { + const recipe = { name: 'r1', host: 'a.com', fetch: { render: 'auto' } }; + 
assert.equal(RecipeSchema.safeParse(recipe).success, false); + }); + + it('caps fetch.wait_timeout_ms at 15000', () => { + const recipe = { name: 'r1', host: 'a.com', fetch: { wait_timeout_ms: 99999 } }; + assert.equal(RecipeSchema.safeParse(recipe).success, false); + }); + + it('accepts select.remove as string array', () => { + const recipe = { name: 'r1', host: 'a.com', select: { remove: ['aside', '.ads'] } }; + assert.equal(RecipeSchema.safeParse(recipe).success, true); + }); + + it('accepts extractor enum', () => { + for (const x of ['readability', 'trafilatura', 'playwright']) { + assert.equal(RecipeSchema.safeParse({ name: 'r1', host: 'a.com', extractor: x }).success, true); + } + }); + + it('rejects unknown extractor', () => { + assert.equal( + RecipeSchema.safeParse({ name: 'r1', host: 'a.com', extractor: 'magic' }).success, + false, + ); + }); +}); From 8f814de19b2587e29c4168c918d6003cce69e247 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:40:33 +0200 Subject: [PATCH 02/26] chore(recipes): drop unused ActionEnumSchema re-export --- lib/recipes.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/recipes.js b/lib/recipes.js index 8b5903e..c7f6233 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -27,5 +27,3 @@ export const RecipeSchema = z.object({ extractor: z.enum(['readability', 'trafilatura', 'playwright']).optional(), fetch: FetchSchema.default({}), }).strict(); - -export const ActionEnumSchema = ActionSchema; // re-export for tests From 9637a6854e0fc5ee3a826c832d7a94f919c925ab Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:42:23 +0200 Subject: [PATCH 03/26] feat(recipes): loadRecipes for default file Co-Authored-By: Claude Sonnet 4.6 --- lib/recipes.js | 92 ++++++++++++++++++++++++++++++ test/fixtures/recipes/default.json | 14 +++++ test/recipes-loader.test.js | 30 ++++++++++ 3 files changed, 136 
insertions(+) create mode 100644 test/fixtures/recipes/default.json diff --git a/lib/recipes.js b/lib/recipes.js index c7f6233..445424c 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -1,4 +1,6 @@ import { z } from 'zod'; +import fs from 'node:fs'; +import path from 'node:path'; const ActionSchema = z.discriminatedUnion('action', [ z.object({ action: z.literal('remove-attr'), selector: z.string().min(1), attr: z.string().min(1) }), @@ -27,3 +29,93 @@ export const RecipeSchema = z.object({ extractor: z.enum(['readability', 'trafilatura', 'playwright']).optional(), fetch: FetchSchema.default({}), }).strict(); + +let cachedState = null; + +function loadOneFile(filePath) { + if (!filePath || !fs.existsSync(filePath)) { + return { loaded: [], rejected: [], present: false }; + } + let raw; + try { + raw = fs.readFileSync(filePath, 'utf8'); + } catch (err) { + console.warn(`[recipes] cannot read ${filePath}: ${err.message}`); + return { loaded: [], rejected: [], present: true }; + } + let parsed; + try { + parsed = JSON.parse(raw); + } catch (err) { + console.warn(`[recipes] ${filePath} is not valid JSON: ${err.message}`); + return { loaded: [], rejected: [], present: true }; + } + if (!Array.isArray(parsed)) { + console.warn(`[recipes] ${filePath} root must be an array`); + return { loaded: [], rejected: [], present: true }; + } + + const loaded = []; + const rejected = []; + const seenNames = new Set(); + parsed.forEach((entry, index) => { + const result = RecipeSchema.safeParse(entry); + if (!result.success) { + const msg = result.error.issues + .map((i) => `${i.path.join('.')}: ${i.message}`) + .join('; '); + console.warn(`[recipes] ${filePath} — recipe #${index} rejected: ${msg}`); + rejected.push({ index, name: entry?.name ?? 
null, message: msg }); + return; + } + if (seenNames.has(result.data.name)) { + console.warn(`[recipes] ${filePath} — duplicate name "${result.data.name}", later entry wins`); + const existingIdx = loaded.findIndex((r) => r.name === result.data.name); + if (existingIdx >= 0) loaded.splice(existingIdx, 1); + } + seenNames.add(result.data.name); + loaded.push(result.data); + }); + return { loaded, rejected, present: true }; +} + +function resolveUserPath() { + const env = process.env.PULLMD_SITE_RECIPES; + if (env) return env; // explicit always wins + const auto = path.resolve(process.cwd(), 'data/site-recipes.json'); + return fs.existsSync(auto) ? auto : null; +} + +export function loadRecipes(opts = {}) { + const defaultPath = opts.defaultPath ?? path.resolve(process.cwd(), 'site-recipes.default.json'); + const userPath = opts.userPath ?? resolveUserPath(); + + const sources = []; + let allLoaded = []; + let totalRejected = 0; + + for (const filePath of [defaultPath, userPath]) { + if (!filePath) continue; + const { loaded, rejected, present } = loadOneFile(filePath); + if (!present) continue; + sources.push({ path: filePath, loaded: loaded.length, rejected: rejected.length }); + allLoaded = allLoaded.concat(loaded); + totalRejected += rejected.length; + console.log(`[recipes] loaded ${filePath}: ${loaded.length} ok, ${rejected.length} rejected`); + } + + cachedState = { + recipes: allLoaded, + status: { + loaded: allLoaded.length, + rejected: totalRejected, + sources, + }, + }; + return cachedState; +} + +export function getRecipeStatus() { + if (!cachedState) return { loaded: 0, rejected: 0, sources: [] }; + return cachedState.status; +} diff --git a/test/fixtures/recipes/default.json b/test/fixtures/recipes/default.json new file mode 100644 index 0000000..91e0528 --- /dev/null +++ b/test/fixtures/recipes/default.json @@ -0,0 +1,14 @@ +[ + { + "name": "fixture-paywall", + "host": "*.example.com", + "preprocess": [ + { "action": "remove-attr", "selector": 
"p[aria-hidden=\"true\"]", "attr": "aria-hidden" } + ] + }, + { + "name": "fixture-extractor", + "host": "blog.example.com", + "extractor": "trafilatura" + } +] diff --git a/test/recipes-loader.test.js b/test/recipes-loader.test.js index 09a221c..a12e8f7 100644 --- a/test/recipes-loader.test.js +++ b/test/recipes-loader.test.js @@ -1,6 +1,12 @@ import { describe, it } from 'node:test'; import assert from 'node:assert/strict'; import { RecipeSchema } from '../lib/recipes.js'; +import { loadRecipes } from '../lib/recipes.js'; +import { fileURLToPath } from 'node:url'; +import path from 'node:path'; + +const here = path.dirname(fileURLToPath(import.meta.url)); +const fix = (rel) => path.join(here, 'fixtures/recipes', rel); describe('RecipeSchema', () => { it('accepts a minimal recipe with name + host', () => { @@ -80,3 +86,27 @@ describe('RecipeSchema', () => { ); }); }); + +describe('loadRecipes — default file only', () => { + it('loads recipes from the default file', () => { + const { recipes, status } = loadRecipes({ defaultPath: fix('default.json') }); + assert.equal(recipes.length, 2); + assert.equal(recipes[0].name, 'fixture-paywall'); + assert.equal(status.loaded, 2); + assert.equal(status.rejected, 0); + assert.equal(status.sources.length, 1); + assert.equal(status.sources[0].loaded, 2); + }); + + it('returns empty + warning when default file is absent', () => { + const { recipes, status } = loadRecipes({ defaultPath: fix('does-not-exist.json') }); + assert.equal(recipes.length, 0); + assert.equal(status.loaded, 0); + assert.equal(status.sources.length, 0); + }); + + it('skips user file when not provided', () => { + const { status } = loadRecipes({ defaultPath: fix('default.json') }); + assert.equal(status.sources.length, 1); + }); +}); From 4295f92803243ea6a5d308f79005ed80116f6678 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:46:37 +0200 Subject: [PATCH 04/26] test(recipes): user overlay 
loading and recipe-level rejection Co-Authored-By: Claude Sonnet 4.6 --- test/fixtures/recipes/invalid.json | 4 +++ test/fixtures/recipes/user.json | 12 +++++++++ test/recipes-loader.test.js | 43 ++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 test/fixtures/recipes/invalid.json create mode 100644 test/fixtures/recipes/user.json diff --git a/test/fixtures/recipes/invalid.json b/test/fixtures/recipes/invalid.json new file mode 100644 index 0000000..03410a9 --- /dev/null +++ b/test/fixtures/recipes/invalid.json @@ -0,0 +1,4 @@ +[ + { "name": "valid-one", "host": "ok.example.com" }, + { "name": "invalid-one", "host": "bad.example.com", "preprocess": [{ "action": "acton", "selector": "p", "attr": "x" }] } +] diff --git a/test/fixtures/recipes/user.json b/test/fixtures/recipes/user.json new file mode 100644 index 0000000..7dfe3a1 --- /dev/null +++ b/test/fixtures/recipes/user.json @@ -0,0 +1,12 @@ +[ + { + "name": "fixture-extractor", + "host": "blog.example.com", + "extractor": "playwright" + }, + { + "name": "fixture-user-only", + "host": "user-only.example.com", + "select": { "remove": ["aside.ads"] } + } +] diff --git a/test/recipes-loader.test.js b/test/recipes-loader.test.js index a12e8f7..04cdca2 100644 --- a/test/recipes-loader.test.js +++ b/test/recipes-loader.test.js @@ -110,3 +110,46 @@ describe('loadRecipes — default file only', () => { assert.equal(status.sources.length, 1); }); }); + +describe('loadRecipes — user overlay', () => { + it('loads default + user, concatenates in order', () => { + const { recipes } = loadRecipes({ + defaultPath: fix('default.json'), + userPath: fix('user.json'), + }); + assert.equal(recipes.length, 4); + assert.equal(recipes[0].name, 'fixture-paywall'); + assert.equal(recipes[1].name, 'fixture-extractor'); + assert.equal(recipes[2].name, 'fixture-extractor'); // user override (same name) + assert.equal(recipes[3].name, 'fixture-user-only'); + }); + + it('reports per-source counts in status', () 
=> { + const { status } = loadRecipes({ + defaultPath: fix('default.json'), + userPath: fix('user.json'), + }); + assert.equal(status.sources.length, 2); + assert.equal(status.sources[0].loaded, 2); + assert.equal(status.sources[1].loaded, 2); + assert.equal(status.rejected, 0); + }); + + it('skips user file silently when absent', () => { + const { status } = loadRecipes({ + defaultPath: fix('default.json'), + userPath: fix('does-not-exist.json'), + }); + assert.equal(status.sources.length, 1); + }); + + it('rejects malformed recipe per-recipe, loads the rest', () => { + const { recipes, status } = loadRecipes({ + defaultPath: fix('default.json'), + userPath: fix('invalid.json'), + }); + assert.equal(recipes.length, 3); // 2 default + 1 valid from invalid.json + assert.equal(status.rejected, 1); + assert.equal(status.sources[1].rejected, 1); + }); +}); From 5b512d98eb8dd18f9246358ae38cab6d75167c70 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:49:40 +0200 Subject: [PATCH 05/26] feat(recipes): hostMatches glob with array-any semantics --- lib/recipes.js | 11 +++++++++++ test/recipes-matcher.test.js | 31 +++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 test/recipes-matcher.test.js diff --git a/lib/recipes.js b/lib/recipes.js index 445424c..f4ce89d 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -119,3 +119,14 @@ export function getRecipeStatus() { if (!cachedState) return { loaded: 0, rejected: 0, sources: [] }; return cachedState.status; } + +function globToRegex(glob) { + // Escape every regex-special char EXCEPT '*'; then translate '*' to '.*'. + const escaped = glob.replace(/[.+?^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*'); + return new RegExp('^' + escaped + '$', 'i'); +} + +export function hostMatches(pattern, host) { + const patterns = Array.isArray(pattern) ? 
pattern : [pattern]; + return patterns.some((p) => globToRegex(p).test(host)); +} diff --git a/test/recipes-matcher.test.js b/test/recipes-matcher.test.js new file mode 100644 index 0000000..8978345 --- /dev/null +++ b/test/recipes-matcher.test.js @@ -0,0 +1,31 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { hostMatches } from '../lib/recipes.js'; + +describe('hostMatches', () => { + it('matches exact hostname', () => { + assert.equal(hostMatches('example.com', 'example.com'), true); + assert.equal(hostMatches('example.com', 'other.com'), false); + }); + + it('is case-insensitive', () => { + assert.equal(hostMatches('Example.COM', 'example.com'), true); + }); + + it('star matches any character sequence including dots', () => { + assert.equal(hostMatches('*.example.com', 'foo.example.com'), true); + assert.equal(hostMatches('*.example.com', 'foo.bar.example.com'), true); + assert.equal(hostMatches('*.example.com', 'example.com'), false); // apex needs explicit entry + assert.equal(hostMatches('*.example.com', 'other.com'), false); + }); + + it('accepts an array — any-of semantics', () => { + assert.equal(hostMatches(['a.com', 'b.com'], 'b.com'), true); + assert.equal(hostMatches(['a.com', 'b.com'], 'c.com'), false); + }); + + it('escapes regex special chars in literal parts', () => { + assert.equal(hostMatches('foo.example.com', 'foo.example.com'), true); + assert.equal(hostMatches('foo.example.com', 'fooXexample.com'), false); // dot is literal + }); +}); From f3437b20b9a370c51101c689040da481fe6e77f2 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:52:18 +0200 Subject: [PATCH 06/26] feat(recipes): pathMatches glob with single/multi-segment wildcards --- lib/recipes.js | 24 ++++++++++++++++++++++++ test/recipes-matcher.test.js | 28 +++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/lib/recipes.js 
b/lib/recipes.js index f4ce89d..ee96c8a 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -130,3 +130,27 @@ export function hostMatches(pattern, host) { const patterns = Array.isArray(pattern) ? pattern : [pattern]; return patterns.some((p) => globToRegex(p).test(host)); } + +function pathGlobToRegex(glob) { + // Translate ** before *, escape regex-specials in between. + // Strategy: walk char-by-char, recognize ** and * tokens, escape literals. + let result = ''; + let i = 0; + while (i < glob.length) { + if (glob[i] === '*' && glob[i + 1] === '*') { + result += '.*'; + i += 2; + } else if (glob[i] === '*') { + result += '[^/]+'; + i += 1; + } else { + result += glob[i].replace(/[.+?^${}()|[\]\\]/g, '\\$&'); + i += 1; + } + } + return new RegExp('^' + result + '$'); +} + +export function pathMatches(pattern, urlPath) { + return pathGlobToRegex(pattern).test(urlPath); +} diff --git a/test/recipes-matcher.test.js b/test/recipes-matcher.test.js index 8978345..8341c53 100644 --- a/test/recipes-matcher.test.js +++ b/test/recipes-matcher.test.js @@ -1,6 +1,6 @@ import { describe, it } from 'node:test'; import assert from 'node:assert/strict'; -import { hostMatches } from '../lib/recipes.js'; +import { hostMatches, pathMatches } from '../lib/recipes.js'; describe('hostMatches', () => { it('matches exact hostname', () => { @@ -29,3 +29,29 @@ describe('hostMatches', () => { assert.equal(hostMatches('foo.example.com', 'fooXexample.com'), false); // dot is literal }); }); + +describe('pathMatches', () => { + it('matches exact path', () => { + assert.equal(pathMatches('/foo', '/foo'), true); + assert.equal(pathMatches('/foo', '/bar'), false); + }); + + it('** matches multiple segments', () => { + assert.equal(pathMatches('/**', '/'), true); + assert.equal(pathMatches('/**', '/a/b/c'), true); + assert.equal(pathMatches('/foo/**', '/foo/a/b'), true); + assert.equal(pathMatches('/foo/**', '/bar/a/b'), false); + }); + + it('* matches single segment (no slashes)', () => { + 
assert.equal(pathMatches('/foo/*', '/foo/bar'), true); + assert.equal(pathMatches('/foo/*', '/foo/bar/baz'), false); + assert.equal(pathMatches('/foo/*', '/foo/'), false); + }); + + it('mixed * and ** in the same pattern', () => { + assert.equal(pathMatches('/*/issues/*', '/owner/issues/123'), true); + assert.equal(pathMatches('/*/issues/*', '/owner/sub/issues/123'), false); // * = single segment + assert.equal(pathMatches('/*/issues/**', '/owner/issues/123/comment/456'), true); + }); +}); From 431f9f5efa1e96d0aeae267cace5488246afcbfa Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:55:20 +0200 Subject: [PATCH 07/26] feat(recipes): matchRecipes filter + merge (concat lists, last-wins scalars) --- lib/recipes.js | 34 +++++++++++++++++ test/recipes-matcher.test.js | 73 +++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/lib/recipes.js b/lib/recipes.js index ee96c8a..4280532 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -154,3 +154,37 @@ function pathGlobToRegex(glob) { export function pathMatches(pattern, urlPath) { return pathGlobToRegex(pattern).test(urlPath); } + +export function mergeRecipes(recipes) { + const result = { + preprocess: [], + removeSelectors: [], + extractor: undefined, + fetch: {}, + }; + for (const r of recipes) { + result.preprocess = result.preprocess.concat(r.preprocess || []); + result.removeSelectors = result.removeSelectors.concat(r.select?.remove || []); + if (r.extractor !== undefined) result.extractor = r.extractor; + if (r.fetch) { + for (const key of ['render', 'wait_for', 'wait_timeout_ms', 'mobile_ua']) { + if (r.fetch[key] !== undefined) result.fetch[key] = r.fetch[key]; + } + } + } + return result; +} + +export function matchRecipesAgainst(recipes, url) { + const host = url.hostname; + const urlPath = url.pathname || '/'; + const matched = recipes.filter( + (r) => hostMatches(r.host, host) && pathMatches(r.path 
|| '/**', urlPath), + ); + return mergeRecipes(matched); +} + +export function matchRecipes(url) { + if (!cachedState) return mergeRecipes([]); + return matchRecipesAgainst(cachedState.recipes, url); +} diff --git a/test/recipes-matcher.test.js b/test/recipes-matcher.test.js index 8341c53..13992ab 100644 --- a/test/recipes-matcher.test.js +++ b/test/recipes-matcher.test.js @@ -1,6 +1,6 @@ import { describe, it } from 'node:test'; import assert from 'node:assert/strict'; -import { hostMatches, pathMatches } from '../lib/recipes.js'; +import { hostMatches, pathMatches, mergeRecipes, matchRecipesAgainst } from '../lib/recipes.js'; describe('hostMatches', () => { it('matches exact hostname', () => { @@ -55,3 +55,74 @@ describe('pathMatches', () => { assert.equal(pathMatches('/*/issues/**', '/owner/issues/123/comment/456'), true); }); }); + +describe('mergeRecipes', () => { + it('returns empty merge for no recipes', () => { + const m = mergeRecipes([]); + assert.deepEqual(m.preprocess, []); + assert.deepEqual(m.removeSelectors, []); + assert.equal(m.extractor, undefined); + assert.deepEqual(m.fetch, {}); + }); + + it('concatenates preprocess action lists in order', () => { + const r1 = { preprocess: [{ action: 'remove-attr', selector: 'p', attr: 'aria-hidden' }], select: { remove: [] }, fetch: {} }; + const r2 = { preprocess: [{ action: 'remove-class', selector: 'p', class: 'paywall' }], select: { remove: [] }, fetch: {} }; + const m = mergeRecipes([r1, r2]); + assert.equal(m.preprocess.length, 2); + assert.equal(m.preprocess[0].action, 'remove-attr'); + assert.equal(m.preprocess[1].action, 'remove-class'); + }); + + it('concatenates select.remove lists', () => { + const r1 = { preprocess: [], select: { remove: ['aside'] }, fetch: {} }; + const r2 = { preprocess: [], select: { remove: ['.ads'] }, fetch: {} }; + const m = mergeRecipes([r1, r2]); + assert.deepEqual(m.removeSelectors, ['aside', '.ads']); + }); + + it('extractor is last-wins', () => { + const r1 = { 
preprocess: [], select: { remove: [] }, fetch: {}, extractor: 'readability' }; + const r2 = { preprocess: [], select: { remove: [] }, fetch: {}, extractor: 'trafilatura' }; + assert.equal(mergeRecipes([r1, r2]).extractor, 'trafilatura'); + }); + + it('fetch fields merge per-key, not as whole object', () => { + const r1 = { preprocess: [], select: { remove: [] }, fetch: { wait_for: '.x' } }; + const r2 = { preprocess: [], select: { remove: [] }, fetch: { mobile_ua: true } }; + const m = mergeRecipes([r1, r2]); + assert.equal(m.fetch.wait_for, '.x'); // from r1, preserved + assert.equal(m.fetch.mobile_ua, true); // from r2 + }); + + it('fetch field last-wins on per-key conflict', () => { + const r1 = { preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }; + const r2 = { preprocess: [], select: { remove: [] }, fetch: { render: 'skip' } }; + assert.equal(mergeRecipes([r1, r2]).fetch.render, 'skip'); + }); +}); + +describe('matchRecipesAgainst', () => { + const recipes = [ + { name: 'a', host: '*.example.com', path: '/**', preprocess: [], select: { remove: [] }, fetch: {} }, + { name: 'b', host: 'github.com', path: '/*/issues/*', preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }, + { name: 'c', host: 'github.com', path: '/**', preprocess: [], select: { remove: [] }, fetch: {} }, + ]; + + it('returns recipes whose host AND path match', () => { + const merged = matchRecipesAgainst(recipes, new URL('https://github.com/owner/issues/123')); + assert.equal(merged.fetch.render, 'force'); // 'b' matched (and 'c'); both apply + }); + + it('skips recipes where path does not match', () => { + const merged = matchRecipesAgainst(recipes, new URL('https://github.com/owner/pulls/1')); + // 'b' does NOT match (path /*/issues/*); 'c' matches; render stays unset + assert.equal(merged.fetch.render, undefined); + }); + + it('returns empty merge when nothing matches', () => { + const merged = matchRecipesAgainst(recipes, new URL('https://other.org/')); + 
assert.deepEqual(merged.preprocess, []); + assert.equal(merged.extractor, undefined); + }); +}); From 19c5f6ff4a7f2813763072835ad22fcafb61493e Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 10:57:58 +0200 Subject: [PATCH 08/26] feat(recipes): applyPreprocessActions for all four actions Co-Authored-By: Claude Sonnet 4.6 --- lib/recipes.js | 35 ++++++++++++++ test/recipes-actions.test.js | 94 ++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 test/recipes-actions.test.js diff --git a/lib/recipes.js b/lib/recipes.js index 4280532..dd908c6 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -1,6 +1,7 @@ import { z } from 'zod'; import fs from 'node:fs'; import path from 'node:path'; +import * as cheerio from 'cheerio'; const ActionSchema = z.discriminatedUnion('action', [ z.object({ action: z.literal('remove-attr'), selector: z.string().min(1), attr: z.string().min(1) }), @@ -188,3 +189,37 @@ export function matchRecipes(url) { if (!cachedState) return mergeRecipes([]); return matchRecipesAgainst(cachedState.recipes, url); } + +export function applyPreprocessActions(html, actions) { + if (!html || typeof html !== 'string') return html; + if (!actions || actions.length === 0) return html; + + const $ = cheerio.load(html, { decodeEntities: false }); + for (const action of actions) { + switch (action.action) { + case 'remove-attr': + $(action.selector).removeAttr(action.attr); + break; + case 'remove-class': + $(action.selector).each((_, el) => { + const $el = $(el); + const cls = $el.attr('class'); + if (!cls) return; + const tokens = cls.split(/\s+/).filter((t) => t && t !== action.class); + if (tokens.length === 0) $el.removeAttr('class'); + else $el.attr('class', tokens.join(' ')); + }); + break; + case 'remove-element': + $(action.selector).remove(); + break; + case 'unwrap': + $(action.selector).each((_, el) => { + const $el = $(el); + 
$el.replaceWith($el.contents()); + }); + break; + } + } + return $.html(); +} diff --git a/test/recipes-actions.test.js b/test/recipes-actions.test.js new file mode 100644 index 0000000..6ea88e9 --- /dev/null +++ b/test/recipes-actions.test.js @@ -0,0 +1,94 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { applyPreprocessActions } from '../lib/recipes.js'; + +describe('applyPreprocessActions — remove-attr', () => { + it('removes the named attribute from matching elements', () => { + const html = ''; + const out = applyPreprocessActions(html, [ + { action: 'remove-attr', selector: 'p', attr: 'aria-hidden' }, + ]); + assert.equal(out.includes('aria-hidden'), false); + assert.ok(out.includes('

x

')); + }); + + it('leaves non-matching elements alone', () => { + const html = ''; + const out = applyPreprocessActions(html, [ + { action: 'remove-attr', selector: 'p', attr: 'aria-hidden' }, + ]); + assert.ok(out.includes('')); + }); +}); + +describe('applyPreprocessActions — remove-class', () => { + it('removes the named class token, preserving others', () => { + const html = '

x

'; + const out = applyPreprocessActions(html, [ + { action: 'remove-class', selector: 'p', class: 'paywall' }, + ]); + assert.ok(out.includes('class="foo bar"') || out.includes('class="foo bar"')); + assert.equal(out.includes('paywall'), false); + }); + + it('removes the class attribute entirely if the only token is removed', () => { + const html = '

x

'; + const out = applyPreprocessActions(html, [ + { action: 'remove-class', selector: 'p', class: 'paywall' }, + ]); + assert.equal(out.includes('class='), false); + }); +}); + +describe('applyPreprocessActions — remove-element', () => { + it('removes the matching element and its descendants', () => { + const html = '

keep

'; + const out = applyPreprocessActions(html, [ + { action: 'remove-element', selector: 'aside.ads' }, + ]); + assert.equal(out.includes('drop'), false); + assert.ok(out.includes('keep')); + }); +}); + +describe('applyPreprocessActions — unwrap', () => { + it('replaces element with its children', () => { + const html = '

hello world!

'; + const out = applyPreprocessActions(html, [ + { action: 'unwrap', selector: 'span.wrap' }, + ]); + assert.ok(out.includes('hello world!')); + assert.equal(out.includes(' { + it('returns original HTML when actions list is empty', () => { + const html = '

x

'; + assert.equal(applyPreprocessActions(html, []), html); + }); + + it('no-op when selector matches nothing', () => { + const html = '

x

'; + const out = applyPreprocessActions(html, [ + { action: 'remove-attr', selector: 'div', attr: 'foo' }, + ]); + assert.ok(out.includes('

x

')); + }); + + it('returns input unchanged when html is empty/null', () => { + assert.equal(applyPreprocessActions('', []), ''); + assert.equal(applyPreprocessActions(null, []), null); + }); + + it('applies multiple actions in order', () => { + const html = ''; + const out = applyPreprocessActions(html, [ + { action: 'remove-attr', selector: 'p', attr: 'aria-hidden' }, + { action: 'remove-class', selector: 'p', class: 'paywall' }, + ]); + assert.equal(out.includes('aria-hidden'), false); + assert.equal(out.includes('paywall'), false); + assert.ok(out.includes('foo')); + }); +}); From ae56bcebd57777091b4412fdb98bafda349edab9 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:01:09 +0200 Subject: [PATCH 09/26] feat(cache): add meta table and recipes_invalidated_at setter --- lib/cache.js | 24 ++++++++++++++++++++++++ test/cache.test.js | 22 ++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/lib/cache.js b/lib/cache.js index 8097d83..e5745d0 100644 --- a/lib/cache.js +++ b/lib/cache.js @@ -95,6 +95,14 @@ export function createCache(dbPath = '/data/cache.db') { db.exec(`CREATE INDEX IF NOT EXISTS idx_user_fetches_fetched_at ON user_fetches(fetched_at)`); db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_user_fetches_unique ON user_fetches(user_id, cache_id)`); + db.exec(` + CREATE TABLE IF NOT EXISTS meta ( + key TEXT PRIMARY KEY, + value TEXT NOT NULL, + updated_at TEXT DEFAULT (datetime('now')) + ) + `); + // Migrate: add share_id column if missing const cols = db.prepare("PRAGMA table_info(conversions)").all().map(c => c.name); if (!cols.includes('share_id')) { @@ -192,6 +200,11 @@ export function createCache(dbPath = '/data/cache.db') { LIMIT ? OFFSET ? 
`), countForUser: db.prepare(`SELECT COUNT(*) as total FROM user_fetches WHERE user_id = ?`), + metaGet: db.prepare(`SELECT value FROM meta WHERE key = ?`), + metaSet: db.prepare(` + INSERT INTO meta (key, value, updated_at) VALUES (?, ?, datetime('now')) + ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = datetime('now') + `), }; return { @@ -305,5 +318,16 @@ export function createCache(dbPath = '/data/cache.db') { })); return { total, window, bySource, lowQualityDomains, fallbackByDomain }; }, + + getMeta(key) { + const row = stmts.metaGet.get(key); + return row ? row.value : null; + }, + setMeta(key, value) { + stmts.metaSet.run(key, value); + }, + setRecipesInvalidatedAt(iso) { + stmts.metaSet.run('recipes_invalidated_at', iso); + }, }; } diff --git a/test/cache.test.js b/test/cache.test.js index 4ee69ba..97b5552 100644 --- a/test/cache.test.js +++ b/test/cache.test.js @@ -198,3 +198,25 @@ describe('cache', () => { }); }); }); + +describe('cache — meta table', () => { + it('creates the meta table on init', () => { + const c = createCache(':memory:'); + assert.equal(c.getMeta('any-missing-key'), null); + c.setMeta('foo', 'bar'); + assert.equal(c.getMeta('foo'), 'bar'); + }); + + it('overwrites existing key on setMeta', () => { + const c = createCache(':memory:'); + c.setMeta('foo', 'one'); + c.setMeta('foo', 'two'); + assert.equal(c.getMeta('foo'), 'two'); + }); + + it('exposes setRecipesInvalidatedAt + reads it back via meta', () => { + const c = createCache(':memory:'); + c.setRecipesInvalidatedAt('2026-05-06 12:00:00'); + assert.equal(c.getMeta('recipes_invalidated_at'), '2026-05-06 12:00:00'); + }); +}); From 748baf564afceb9ece541977ccc103c345114ce2 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:09:49 +0200 Subject: [PATCH 10/26] feat(cache): get() honors recipes_invalidated_at --- lib/cache.js | 10 ++++++++-- test/cache.test.js | 27 +++++++++++++++++++++++++++ 2 files 
changed, 35 insertions(+), 2 deletions(-) diff --git a/lib/cache.js b/lib/cache.js index e5745d0..5e8c93f 100644 --- a/lib/cache.js +++ b/lib/cache.js @@ -117,6 +117,8 @@ export function createCache(dbPath = '/data/cache.db') { db.exec('CREATE INDEX IF NOT EXISTS idx_conversions_user_id ON conversions(user_id)'); } + let recipesInvalidatedAt = '1970-01-01 00:00:00'; + const stmts = { upsert: db.prepare(` INSERT INTO conversions (url, title, markdown, source, share_id, client, user_id, created_at) @@ -132,7 +134,9 @@ export function createCache(dbPath = '/data/cache.db') { `), get: db.prepare(` SELECT title, markdown, source, share_id, client, created_at FROM conversions - WHERE url = ? AND created_at > datetime('now', '-1 hour') + WHERE url = ? + AND created_at > datetime('now', '-1 hour') + AND created_at > ? `), getByShareId: db.prepare(` SELECT url, title, markdown, source, client, created_at FROM conversions @@ -227,7 +231,8 @@ export function createCache(dbPath = '/data/cache.db') { }, get(url) { - return stmts.get.get(url) || null; + const row = stmts.get.get(url, recipesInvalidatedAt); + return row || null; }, getByShareId(shareId) { @@ -327,6 +332,7 @@ export function createCache(dbPath = '/data/cache.db') { stmts.metaSet.run(key, value); }, setRecipesInvalidatedAt(iso) { + recipesInvalidatedAt = iso; stmts.metaSet.run('recipes_invalidated_at', iso); }, }; diff --git a/test/cache.test.js b/test/cache.test.js index 97b5552..9444309 100644 --- a/test/cache.test.js +++ b/test/cache.test.js @@ -199,6 +199,33 @@ describe('cache', () => { }); }); +describe('cache — recipes invalidation in get()', () => { + it('returns null when row created_at < recipes_invalidated_at', () => { + const c = createCache(':memory:'); + c.put({ url: 'https://x.com', title: 'T', markdown: '# T', source: 'readability' }); + // Set invalidation timestamp AFTER the row was inserted + const future = new Date(Date.now() + 1000).toISOString().replace('T', ' ').slice(0, 19); + 
c.setRecipesInvalidatedAt(future); + assert.equal(c.get('https://x.com'), null); + }); + + it('still returns the row when invalidation timestamp is in the past', () => { + const c = createCache(':memory:'); + c.setRecipesInvalidatedAt('1970-01-01 00:00:00'); + c.put({ url: 'https://x.com', title: 'T', markdown: '# T', source: 'readability' }); + const hit = c.get('https://x.com'); + assert.ok(hit); + assert.equal(hit.title, 'T'); + }); + + it('default (no setRecipesInvalidatedAt called) treats all rows as fresh re: recipes', () => { + const c = createCache(':memory:'); + c.put({ url: 'https://x.com', title: 'T', markdown: '# T', source: 'readability' }); + const hit = c.get('https://x.com'); + assert.ok(hit); + }); +}); + describe('cache — meta table', () => { it('creates the meta table on init', () => { const c = createCache(':memory:'); From 03aef1c13c0a1ee875723f4e2661a38a07ad3b17 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:14:51 +0200 Subject: [PATCH 11/26] feat(recipes): hash recipes content and invalidate cache on change Co-Authored-By: Claude Sonnet 4.6 --- lib/recipes.js | 29 +++++++++++++ test/recipes-cache-invalidation.test.js | 58 +++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 test/recipes-cache-invalidation.test.js diff --git a/lib/recipes.js b/lib/recipes.js index dd908c6..75600dc 100644 --- a/lib/recipes.js +++ b/lib/recipes.js @@ -1,6 +1,7 @@ import { z } from 'zod'; import fs from 'node:fs'; import path from 'node:path'; +import { createHash } from 'node:crypto'; import * as cheerio from 'cheerio'; const ActionSchema = z.discriminatedUnion('action', [ @@ -190,6 +191,34 @@ export function matchRecipes(url) { return matchRecipesAgainst(cachedState.recipes, url); } +export function computeRecipesHash(filePaths) { + const hash = createHash('sha256'); + for (const p of filePaths) { + if (!p) continue; + if (fs.existsSync(p)) { + hash.update(p, 
'utf8'); + hash.update('\n', 'utf8'); + hash.update(fs.readFileSync(p)); + hash.update('\n', 'utf8'); + } + } + return hash.digest('hex'); +} + +export function applyRecipesInvalidation(cache, newHash) { + const oldHash = cache.getMeta('recipes_hash'); + if (oldHash !== newHash) { + if (oldHash !== null) { + // Hash truly changed across reboots — bump invalidation timestamp. + // First boot (oldHash === null) does NOT bump: existing cache rows stay valid + // until the operator actually changes recipes. + const now = new Date().toISOString().replace('T', ' ').slice(0, 19); + cache.setRecipesInvalidatedAt(now); + } + cache.setMeta('recipes_hash', newHash); + } +} + export function applyPreprocessActions(html, actions) { if (!html || typeof html !== 'string') return html; if (!actions || actions.length === 0) return html; diff --git a/test/recipes-cache-invalidation.test.js b/test/recipes-cache-invalidation.test.js new file mode 100644 index 0000000..369378b --- /dev/null +++ b/test/recipes-cache-invalidation.test.js @@ -0,0 +1,58 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { computeRecipesHash, applyRecipesInvalidation } from '../lib/recipes.js'; +import { createCache } from '../lib/cache.js'; +import { fileURLToPath } from 'node:url'; +import path from 'node:path'; + +const here = path.dirname(fileURLToPath(import.meta.url)); +const fix = (rel) => path.join(here, 'fixtures/recipes', rel); + +describe('computeRecipesHash', () => { + it('returns a stable hex string for the same content', () => { + const a = computeRecipesHash([fix('default.json')]); + const b = computeRecipesHash([fix('default.json')]); + assert.equal(a, b); + assert.match(a, /^[0-9a-f]{64}$/); + }); + + it('returns a different hash when content differs', () => { + const a = computeRecipesHash([fix('default.json')]); + const b = computeRecipesHash([fix('default.json'), fix('user.json')]); + assert.notEqual(a, b); + }); + + it('handles missing files 
gracefully (treats as empty)', () => { + const a = computeRecipesHash([fix('default.json'), fix('does-not-exist.json')]); + const b = computeRecipesHash([fix('default.json')]); + assert.equal(a, b); + }); +}); + +describe('applyRecipesInvalidation', () => { + it('first boot: stores hash, leaves recipes_invalidated_at unset', () => { + const c = createCache(':memory:'); + assert.equal(c.getMeta('recipes_hash'), null); + applyRecipesInvalidation(c, 'hash-A'); + assert.equal(c.getMeta('recipes_hash'), 'hash-A'); + // Spec: on first boot, no invalidation stamp written (existing cache rows stay valid) + assert.equal(c.getMeta('recipes_invalidated_at'), null); + }); + + it('reboot, hash unchanged: no invalidation timestamp update', () => { + const c = createCache(':memory:'); + applyRecipesInvalidation(c, 'hash-A'); // first boot + const stamp = c.getMeta('recipes_invalidated_at'); + applyRecipesInvalidation(c, 'hash-A'); // unchanged + assert.equal(c.getMeta('recipes_invalidated_at'), stamp); + }); + + it('reboot, hash changed: invalidation timestamp updates to NOW', () => { + const c = createCache(':memory:'); + applyRecipesInvalidation(c, 'hash-A'); // first boot, no stamp yet + applyRecipesInvalidation(c, 'hash-B'); // change! 
+ const stamp = c.getMeta('recipes_invalidated_at'); + assert.ok(stamp, 'invalidation stamp should be set'); + assert.match(stamp, /^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$/); + }); +}); From 98d6af5c6cd4ddec8c6b5b4982ca416f69491f3c Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:18:35 +0200 Subject: [PATCH 12/26] feat(web): extractWeb honors recipe fetch options (render/extractor) Co-Authored-By: Claude Sonnet 4.6 --- lib/web.js | 22 +++++++++++++++++----- test/web.test.js | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/lib/web.js b/lib/web.js index 8cde41d..fe1e97a 100644 --- a/lib/web.js +++ b/lib/web.js @@ -7,6 +7,7 @@ import { renderDecision } from './render-decision.js'; import { renderViaSidecar } from './playwright-client.js'; import { pickUserAgent, maybeRefreshUaPool } from './user-agent.js'; import { preprocess } from './preprocess.js'; +import { matchRecipes, matchRecipesAgainst } from './recipes.js'; const TRAFILATURA_URL = process.env.TRAFILATURA_URL; const TRAFILATURA_TIMEOUT_MS = 8_000; @@ -291,8 +292,19 @@ export async function extractWeb(url, options = {}) { extractor, // 'readability' | 'trafilatura' | 'playwright' | undefined renderClient = renderViaSidecar, // injectable for tests } = options; - // extractor=playwright implies forced render — Playwright must run. - const effectiveRender = extractor === 'playwright' ? 'force' : render; + + // Resolve recipes: tests may pass options.recipes directly; production reads + // from the module-level cache populated by loadRecipes() at server boot. + const recipe = options.recipes + ? matchRecipesAgainst(options.recipes, new URL(url)) + : matchRecipes(new URL(url)); + + // Hook 1: query-param wins over recipe; recipe wins over no-default + const queryRender = (render === 'force' || render === 'skip') ? 
render : undefined; + const effectiveExtractor = extractor || recipe.extractor; + const effectiveRender = effectiveExtractor === 'playwright' + ? 'force' + : (queryRender ?? recipe.fetch.render); const rawFetchFn = options.fetch || globalThis.fetch; const fetchFn = withTimeout(rawFetchFn); @@ -335,7 +347,7 @@ export async function extractWeb(url, options = {}) { } // First pass: static extraction (Readability + Trafilatura + pickBest) - const result = await convertWithReadability(url, body, comments, statusCode, rawFetchFn, extractor); + const result = await convertWithReadability(url, body, comments, statusCode, rawFetchFn, effectiveExtractor); emit('extracting', { source: result.source }); // Decide whether to render via Playwright sidecar @@ -347,9 +359,9 @@ export async function extractWeb(url, options = {}) { // Second pass: render via sidecar, re-extract on rendered HTML try { const renderedHtml = await renderClient(url, { signal }); - const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, extractor); + const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, effectiveExtractor); rendered.source = 'playwright'; - const renderReason = extractor === 'playwright' + const renderReason = effectiveExtractor === 'playwright' ? 'forced via extractor=playwright' : `${decision.reason} → rendered via playwright`; rendered.metadata.extractorReason = renderReason; diff --git a/test/web.test.js b/test/web.test.js index 39ca5d2..13e1b69 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -1,6 +1,7 @@ import { describe, it, beforeEach, afterEach } from 'node:test'; import assert from 'node:assert/strict'; import { extractWeb } from '../lib/web.js'; +import { matchRecipesAgainst } from '../lib/recipes.js'; // Single-fetch: extractWeb makes exactly ONE request per call. // The Accept header includes text/markdown preference. 
@@ -560,3 +561,38 @@ describe('cleanDom CMS-pattern preprocessing', () => { assert.match(result.markdown, /A red sunset over mountains/); }); }); + +describe('extractWeb — recipe integration (Hook 0+1)', () => { + it('uses recipe.fetch.render when no query render param', async () => { + const recipes = [{ name: 'r', host: 'example.com', path: '/**', preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }]; + let renderCalled = false; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => '

x

', + arrayBuffer: async () => new TextEncoder().encode('

x

').buffer, + status: 200, + }); + const renderClient = async (url, opts) => { + renderCalled = true; + return '

R

rendered content with sufficient length to not trigger fallback heuristics whatsoever

'; + }; + await extractWeb('https://example.com/', { fetch: fetcher, renderClient, recipes }); + assert.equal(renderCalled, true, 'recipe render=force should trigger renderClient'); + }); + + it('query render=skip wins over recipe render=force', async () => { + const recipes = [{ name: 'r', host: 'example.com', path: '/**', preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }]; + let renderCalled = false; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => '

x

', + arrayBuffer: async () => new TextEncoder().encode('

x

').buffer, + status: 200, + }); + const renderClient = async () => { renderCalled = true; return ''; }; + await extractWeb('https://example.com/', { fetch: fetcher, renderClient, recipes, render: 'skip' }); + assert.equal(renderCalled, false); + }); +}); From fec7161e868657c9c4393c471e3dea4cdbb02885 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:20:55 +0200 Subject: [PATCH 13/26] test(web): make Hook 0+1 render-force test discriminating --- test/web.test.js | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test/web.test.js b/test/web.test.js index 13e1b69..c3165fc 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -563,19 +563,24 @@ describe('cleanDom CMS-pattern preprocessing', () => { }); describe('extractWeb — recipe integration (Hook 0+1)', () => { + // HTML substantial enough that renderDecision returns no on its own + // (sufficient length, multiple paragraphs, no fallback). Ensures the recipe + // render=force flag is the SOLE reason renderClient gets invoked. + const substantialHtml = `Substantial Article

Substantial Article

${'

This is a long paragraph with meaningful content and enough words to clear the eighty-character substantial threshold easily, so multiple of these will produce strong static extraction.

'.repeat(20)}
`; + it('uses recipe.fetch.render when no query render param', async () => { const recipes = [{ name: 'r', host: 'example.com', path: '/**', preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }]; let renderCalled = false; const fetcher = mockFetch({ ok: true, headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, - text: async () => '

x

', - arrayBuffer: async () => new TextEncoder().encode('

x

').buffer, + text: async () => substantialHtml, + arrayBuffer: async () => new TextEncoder().encode(substantialHtml).buffer, status: 200, }); const renderClient = async (url, opts) => { renderCalled = true; - return '

R

rendered content with sufficient length to not trigger fallback heuristics whatsoever

'; + return substantialHtml; }; await extractWeb('https://example.com/', { fetch: fetcher, renderClient, recipes }); assert.equal(renderCalled, true, 'recipe render=force should trigger renderClient'); From 8196f4a57b9656907d7fb933e9292c1826ed1ba2 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:23:17 +0200 Subject: [PATCH 14/26] feat(web): extractWeb applies recipe preprocess actions and select.remove Thread resolved recipe through convertWithReadability: recipe.preprocess actions are applied after generic preprocess(), and recipe.removeSelectors are appended to cleanDom's REMOVE_SELECTORS for both call sites. Co-Authored-By: Claude Sonnet 4.6 --- lib/web.js | 24 +++++++++++++++--------- test/web.test.js | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/lib/web.js b/lib/web.js index fe1e97a..8d33a44 100644 --- a/lib/web.js +++ b/lib/web.js @@ -7,7 +7,7 @@ import { renderDecision } from './render-decision.js'; import { renderViaSidecar } from './playwright-client.js'; import { pickUserAgent, maybeRefreshUaPool } from './user-agent.js'; import { preprocess } from './preprocess.js'; -import { matchRecipes, matchRecipesAgainst } from './recipes.js'; +import { matchRecipes, matchRecipesAgainst, applyPreprocessActions } from './recipes.js'; const TRAFILATURA_URL = process.env.TRAFILATURA_URL; const TRAFILATURA_TIMEOUT_MS = 8_000; @@ -158,8 +158,11 @@ const REMOVE_SELECTORS = [ // Strict UUID v4 — used to detect CMS-asset-ID leakage in . const UUID_ALT_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i; -function cleanDom(document) { - [...document.querySelectorAll(REMOVE_SELECTORS)].forEach(el => el.remove()); +function cleanDom(document, extraRemoveSelectors = []) { + const allRemove = extraRemoveSelectors.length > 0 + ? 
REMOVE_SELECTORS + ', ' + extraRemoveSelectors.join(', ') + : REMOVE_SELECTORS; + [...document.querySelectorAll(allRemove)].forEach(el => el.remove()); // Surface readonly text-input values as so click-to-copy slugs // (API model names, embed snippets, share links, …) survive extraction. @@ -182,8 +185,11 @@ function cleanDom(document) { } } -async function convertWithReadability(url, html, comments, statusCode, fetchFn, extractor) { - const cleanedHtml = preprocess(html); +async function convertWithReadability(url, html, comments, statusCode, fetchFn, extractor, recipe) { + let cleanedHtml = preprocess(html); + if (recipe?.preprocess?.length) { + cleanedHtml = applyPreprocessActions(cleanedHtml, recipe.preprocess); + } const { document } = parseHTML(cleanedHtml); const title = document.querySelector('title')?.textContent?.trim() || new URL(url).hostname; @@ -193,7 +199,7 @@ async function convertWithReadability(url, html, comments, statusCode, fetchFn, metadata.sourceUrl = url; metadata.statusCode = statusCode; - cleanDom(document); + cleanDom(document, recipe?.removeSelectors || []); // Comments path: skip Readability and Trafilatura, use cleaned body if (comments) { @@ -212,7 +218,7 @@ async function convertWithReadability(url, html, comments, statusCode, fetchFn, readabilityMd = nhm.translate(article.content); } else { const { document: doc2 } = parseHTML(cleanedHtml); - cleanDom(doc2); + cleanDom(doc2, recipe?.removeSelectors || []); readabilityMd = nhm.translate(doc2.querySelector('body')?.innerHTML || cleanedHtml); readabilityFellBack = true; } @@ -347,7 +353,7 @@ export async function extractWeb(url, options = {}) { } // First pass: static extraction (Readability + Trafilatura + pickBest) - const result = await convertWithReadability(url, body, comments, statusCode, rawFetchFn, effectiveExtractor); + const result = await convertWithReadability(url, body, comments, statusCode, rawFetchFn, effectiveExtractor, recipe); emit('extracting', { source: 
result.source }); // Decide whether to render via Playwright sidecar @@ -359,7 +365,7 @@ export async function extractWeb(url, options = {}) { // Second pass: render via sidecar, re-extract on rendered HTML try { const renderedHtml = await renderClient(url, { signal }); - const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, effectiveExtractor); + const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, effectiveExtractor, recipe); rendered.source = 'playwright'; const renderReason = effectiveExtractor === 'playwright' ? 'forced via extractor=playwright' diff --git a/test/web.test.js b/test/web.test.js index c3165fc..7274f0f 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -601,3 +601,47 @@ describe('extractWeb — recipe integration (Hook 0+1)', () => { assert.equal(renderCalled, false); }); }); + +describe('extractWeb — recipe integration (Hook 2 preprocess + select)', () => { + it('applies recipe preprocess actions before extraction', async () => { + const recipes = [{ + name: 'r', host: 'example.com', path: '/**', + preprocess: [{ action: 'remove-class', selector: 'p.paywall', class: 'paywall' }], + select: { remove: [] }, fetch: {}, + }]; + const html = 'T

' + + 'A substantial paragraph with enough body text to clear extraction-quality thresholds, ' + + 'this is filler content for the test, more filler content for the test, and even more.' + + '

'; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => html, + arrayBuffer: async () => new TextEncoder().encode(html).buffer, + status: 200, + }); + const result = await extractWeb('https://example.com/', { fetch: fetcher, recipes }); + // The paragraph survives; the paywall class was stripped pre-extraction + assert.ok(result.markdown.includes('substantial paragraph')); + }); + + it('extends cleanDom REMOVE_SELECTORS via recipe select.remove', async () => { + const recipes = [{ + name: 'r', host: 'example.com', path: '/**', + preprocess: [], select: { remove: ['aside.recipe-only-strip'] }, fetch: {}, + }]; + const html = 'T
' + + '' + + '

A substantial paragraph with enough body text to clear extraction-quality thresholds for the article container.

' + + '
'; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => html, + arrayBuffer: async () => new TextEncoder().encode(html).buffer, + status: 200, + }); + const result = await extractWeb('https://example.com/', { fetch: fetcher, recipes }); + assert.equal(result.markdown.includes('SHOULD-NOT-APPEAR'), false); + }); +}); From be2fadcf53d08992bab2835e5a14538cac61a081 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:26:06 +0200 Subject: [PATCH 15/26] test(web): make Hook 2 preprocess + select tests discriminating Co-Authored-By: Claude Sonnet 4.6 --- test/web.test.js | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/test/web.test.js b/test/web.test.js index 7274f0f..954a742 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -606,13 +606,14 @@ describe('extractWeb — recipe integration (Hook 2 preprocess + select)', () => it('applies recipe preprocess actions before extraction', async () => { const recipes = [{ name: 'r', host: 'example.com', path: '/**', - preprocess: [{ action: 'remove-class', selector: 'p.paywall', class: 'paywall' }], + preprocess: [{ action: 'remove-element', selector: 'div.ads-noise' }], select: { remove: [] }, fetch: {}, }]; - const html = 'T

' + - 'A substantial paragraph with enough body text to clear extraction-quality thresholds, ' + - 'this is filler content for the test, more filler content for the test, and even more.' + - '

'; + const html = 'T
' + + '
PREPROCESS-SHOULD-REMOVE-ME
' + + '

A substantial paragraph with enough body text to clear extraction-quality thresholds, ' + + 'this is filler content for the test, more filler content for the test, and even more.

' + + '
'; const fetcher = mockFetch({ ok: true, headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, @@ -621,17 +622,18 @@ describe('extractWeb — recipe integration (Hook 2 preprocess + select)', () => status: 200, }); const result = await extractWeb('https://example.com/', { fetch: fetcher, recipes }); - // The paragraph survives; the paywall class was stripped pre-extraction - assert.ok(result.markdown.includes('substantial paragraph')); + assert.ok(result.markdown.includes('substantial paragraph'), 'body paragraph survives'); + assert.equal(result.markdown.includes('PREPROCESS-SHOULD-REMOVE-ME'), false, + 'recipe preprocess remove-element must strip the noise div'); }); it('extends cleanDom REMOVE_SELECTORS via recipe select.remove', async () => { const recipes = [{ name: 'r', host: 'example.com', path: '/**', - preprocess: [], select: { remove: ['aside.recipe-only-strip'] }, fetch: {}, + preprocess: [], select: { remove: ['div.recipe-only-strip'] }, fetch: {}, }]; const html = 'T
' + - '' + + '
SELECT-SHOULD-NOT-APPEAR
' + '

A substantial paragraph with enough body text to clear extraction-quality thresholds for the article container.

' + '
'; const fetcher = mockFetch({ @@ -642,6 +644,7 @@ describe('extractWeb — recipe integration (Hook 2 preprocess + select)', () => status: 200, }); const result = await extractWeb('https://example.com/', { fetch: fetcher, recipes }); - assert.equal(result.markdown.includes('SHOULD-NOT-APPEAR'), false); + assert.equal(result.markdown.includes('SELECT-SHOULD-NOT-APPEAR'), false, + 'recipe select.remove must strip the targeted div'); }); }); From a3d7c3e37c6a279f99181a38cbfde281c0feeebd Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:28:11 +0200 Subject: [PATCH 16/26] feat(playwright): forward wait_for/mobile_ua/wait_timeout_ms from recipe Co-Authored-By: Claude Sonnet 4.6 --- lib/playwright-client.js | 9 +++++++-- lib/web.js | 7 ++++++- test/playwright-client.test.js | 34 ++++++++++++++++++++++++++++++++++ test/web.test.js | 26 ++++++++++++++++++++++++++ 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/lib/playwright-client.js b/lib/playwright-client.js index 3ed356d..d06809c 100644 --- a/lib/playwright-client.js +++ b/lib/playwright-client.js @@ -9,7 +9,7 @@ const SIDECAR_TIMEOUT_MS = 25_000; * @param {typeof fetch} [opts.fetch] Injectable for tests * @returns {Promise} rendered HTML */ -export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThis.fetch } = {}) { +export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThis.fetch, waitFor, waitTimeoutMs, mobileUa } = {}) { if (!process.env.PLAYWRIGHT_URL) throw new Error('Playwright sidecar not configured (PLAYWRIGHT_URL env)'); const ctrl = new AbortController(); @@ -20,11 +20,16 @@ export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThi else signal.addEventListener('abort', onAbort, { once: true }); } + const body = { url }; + if (waitFor !== undefined) body.waitFor = waitFor; + if (waitTimeoutMs !== undefined) body.waitTimeoutMs = waitTimeoutMs; + if (mobileUa !== 
undefined) body.mobileUa = mobileUa; + try { const res = await fetchFn(process.env.PLAYWRIGHT_URL, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ url }), + body: JSON.stringify(body), signal: ctrl.signal, }); if (!res.ok) throw new Error(`Sidecar returned ${res.status}`); diff --git a/lib/web.js b/lib/web.js index 8d33a44..6897d4a 100644 --- a/lib/web.js +++ b/lib/web.js @@ -364,7 +364,12 @@ export async function extractWeb(url, options = {}) { // Second pass: render via sidecar, re-extract on rendered HTML try { - const renderedHtml = await renderClient(url, { signal }); + const renderedHtml = await renderClient(url, { + signal, + waitFor: recipe?.fetch?.wait_for, + waitTimeoutMs: recipe?.fetch?.wait_timeout_ms, + mobileUa: recipe?.fetch?.mobile_ua, + }); const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, effectiveExtractor, recipe); rendered.source = 'playwright'; const renderReason = effectiveExtractor === 'playwright' diff --git a/test/playwright-client.test.js b/test/playwright-client.test.js index 77e8746..4bbcdf8 100644 --- a/test/playwright-client.test.js +++ b/test/playwright-client.test.js @@ -54,3 +54,37 @@ describe('renderViaSidecar', () => { ); }); }); + +describe('renderViaSidecar — recipe-driven options', () => { + it('forwards waitFor, waitTimeoutMs, mobileUa in POST body', async () => { + let captured; + const mockFetch = async (url, opts) => { + captured = JSON.parse(opts.body); + return { ok: true, text: async () => '' }; + }; + process.env.PLAYWRIGHT_URL = 'http://sidecar.test/'; + const { renderViaSidecar } = await import('../lib/playwright-client.js'); + await renderViaSidecar('https://example.com/', { + fetch: mockFetch, + waitFor: '.x', + waitTimeoutMs: 2500, + mobileUa: true, + }); + assert.equal(captured.url, 'https://example.com/'); + assert.equal(captured.waitFor, '.x'); + assert.equal(captured.waitTimeoutMs, 2500); + 
assert.equal(captured.mobileUa, true); + }); + + it('emits only url when no recipe options set (backwards compat)', async () => { + let captured; + const mockFetch = async (url, opts) => { + captured = JSON.parse(opts.body); + return { ok: true, text: async () => '' }; + }; + process.env.PLAYWRIGHT_URL = 'http://sidecar.test/'; + const { renderViaSidecar } = await import('../lib/playwright-client.js'); + await renderViaSidecar('https://example.com/', { fetch: mockFetch }); + assert.deepEqual(Object.keys(captured), ['url']); + }); +}); diff --git a/test/web.test.js b/test/web.test.js index 954a742..d4e5910 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -648,3 +648,29 @@ describe('extractWeb — recipe integration (Hook 2 preprocess + select)', () => 'recipe select.remove must strip the targeted div'); }); }); + +describe('extractWeb — Hook 3 (playwright fetch options)', () => { + it('passes recipe.fetch.wait_for and mobile_ua to renderClient', async () => { + const recipes = [{ + name: 'r', host: 'example.com', path: '/**', + preprocess: [], select: { remove: [] }, + fetch: { render: 'force', wait_for: '.gate', wait_timeout_ms: 3000, mobile_ua: true }, + }]; + let renderOpts; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => '

x

', + arrayBuffer: async () => new TextEncoder().encode('

x

').buffer, + status: 200, + }); + const renderClient = async (url, opts) => { + renderOpts = opts; + return '

R

rendered substantial body content paragraph for testing pipeline.

'; + }; + await extractWeb('https://example.com/', { fetch: fetcher, renderClient, recipes }); + assert.equal(renderOpts.waitFor, '.gate'); + assert.equal(renderOpts.waitTimeoutMs, 3000); + assert.equal(renderOpts.mobileUa, true); + }); +}); From e4a5ced114538ade1fce37ae1107c549fd0f5694 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:32:58 +0200 Subject: [PATCH 17/26] feat(server): GET /api/recipes/status (public, in-memory) Co-Authored-By: Claude Sonnet 4.6 --- server.js | 12 ++++++ test/recipes-status-endpoint.test.js | 58 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/recipes-status-endpoint.test.js diff --git a/server.js b/server.js index c97b966..308b8d7 100644 --- a/server.js +++ b/server.js @@ -7,6 +7,7 @@ import { qualityScore } from './lib/scoring.js'; import { buildFrontmatter } from './lib/frontmatter.js'; import { mcpHandler } from './lib/mcp.js'; import { renderHelp, renderIndex, getSkillZip, publicUrlFor } from './lib/distrib.js'; +import { getRecipeStatus } from './lib/recipes.js'; function stripMarkdown(md) { return md @@ -498,6 +499,17 @@ export function createApp(overrides = {}) { } }); + app.get('/api/recipes/status', (req, res) => { + const status = getRecipeStatus(); + const ok = status.rejected === 0; + res.json({ + ok, + loaded: status.loaded, + rejected: status.rejected, + sources: status.sources, + }); + }); + app.get('/api/stats', (req, res) => { if (!cache) return res.json({ total: 0, window: '-7 days' }); const window = req.query.window || '-7 days'; diff --git a/test/recipes-status-endpoint.test.js b/test/recipes-status-endpoint.test.js new file mode 100644 index 0000000..b4a4809 --- /dev/null +++ b/test/recipes-status-endpoint.test.js @@ -0,0 +1,58 @@ +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; +import { createApp } from '../server.js'; +import { loadRecipes } from '../lib/recipes.js'; 
+import { fileURLToPath } from 'node:url'; +import path from 'node:path'; + +const here = path.dirname(fileURLToPath(import.meta.url)); +const fix = (rel) => path.join(here, 'fixtures/recipes', rel); + +describe('GET /api/recipes/status', () => { + it('returns ok=true with counts when all recipes loaded', async () => { + loadRecipes({ defaultPath: fix('default.json') }); + const app = createApp({ cache: null }); + const server = app.listen(0); + const port = server.address().port; + try { + const res = await fetch(`http://localhost:${port}/api/recipes/status`); + assert.equal(res.status, 200); + const body = await res.json(); + assert.equal(body.ok, true); + assert.equal(body.loaded, 2); + assert.equal(body.rejected, 0); + assert.equal(body.sources.length, 1); + } finally { + server.close(); + } + }); + + it('returns ok=false when there are rejections', async () => { + loadRecipes({ defaultPath: fix('default.json'), userPath: fix('invalid.json') }); + const app = createApp({ cache: null }); + const server = app.listen(0); + const port = server.address().port; + try { + const res = await fetch(`http://localhost:${port}/api/recipes/status`); + assert.equal(res.status, 200); + const body = await res.json(); + assert.equal(body.ok, false); + assert.equal(body.rejected, 1); + } finally { + server.close(); + } + }); + + it('does not require auth (returns 200 without bearer/session)', async () => { + loadRecipes({ defaultPath: fix('default.json') }); + const app = createApp({ cache: null }); + const server = app.listen(0); + const port = server.address().port; + try { + const res = await fetch(`http://localhost:${port}/api/recipes/status`); + assert.equal(res.status, 200); + } finally { + server.close(); + } + }); +}); From 98e18a8f1a97c0f349ca08da6e3428e64ba11ce1 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:34:33 +0200 Subject: [PATCH 18/26] feat(server): load recipes at boot and stamp 
cache-invalidation Co-Authored-By: Claude Sonnet 4.6 --- server.js | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/server.js b/server.js index 308b8d7..eb0af65 100644 --- a/server.js +++ b/server.js @@ -7,7 +7,9 @@ import { qualityScore } from './lib/scoring.js'; import { buildFrontmatter } from './lib/frontmatter.js'; import { mcpHandler } from './lib/mcp.js'; import { renderHelp, renderIndex, getSkillZip, publicUrlFor } from './lib/distrib.js'; -import { getRecipeStatus } from './lib/recipes.js'; +import { getRecipeStatus, loadRecipes, applyRecipesInvalidation, computeRecipesHash } from './lib/recipes.js'; +import path from 'node:path'; +import fs from 'node:fs'; function stripMarkdown(md) { return md @@ -597,6 +599,20 @@ if (isDirectRun || process.argv[1]?.endsWith('server.js')) { } throw err; } + // Load site recipes (default + optional user overlay) + const defaultRecipesPath = path.resolve(process.cwd(), 'site-recipes.default.json'); + const userRecipesPath = process.env.PULLMD_SITE_RECIPES + || (fs.existsSync(path.resolve(process.cwd(), 'data/site-recipes.json')) + ? path.resolve(process.cwd(), 'data/site-recipes.json') + : null); + loadRecipes({ defaultPath: defaultRecipesPath, userPath: userRecipesPath }); + + // Hash recipe content; if changed since last boot, invalidate cache. 
+ const recipesHash = computeRecipesHash([defaultRecipesPath, userRecipesPath].filter(Boolean)); + applyRecipesInvalidation(cache, recipesHash); + const invalidationStamp = cache.getMeta('recipes_invalidated_at'); + if (invalidationStamp) cache.setRecipesInvalidatedAt(invalidationStamp); + const app = createApp({ cache, auth }); app.listen(port, () => { console.log(`PullMD running on http://localhost:${port} (auth: ${mode})`); From f67b99bf39be1d0f9ae67ec2d7cef6441f24ec6c Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:35:04 +0200 Subject: [PATCH 19/26] feat(recipes): seed default recipes for Future PLC and GitHub Issues --- site-recipes.default.json | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 site-recipes.default.json diff --git a/site-recipes.default.json b/site-recipes.default.json new file mode 100644 index 0000000..ea73469 --- /dev/null +++ b/site-recipes.default.json @@ -0,0 +1,45 @@ +[ + { + "name": "future-plc-paywall-aria", + "host": [ + "*.windowscentral.com", + "*.gamesradar.com", + "*.techradar.com", + "*.tomshardware.com", + "*.pcgamer.com", + "*.t3.com" + ], + "preprocess": [ + { "action": "remove-attr", "selector": "p[aria-hidden=\"true\"]", "attr": "aria-hidden" }, + { "action": "remove-class", "selector": "p.paywall", "class": "paywall" } + ] + }, + { + "name": "future-plc-recommendations", + "host": [ + "*.windowscentral.com", + "*.gamesradar.com", + "*.techradar.com", + "*.tomshardware.com", + "*.pcgamer.com", + "*.t3.com" + ], + "select": { + "remove": [ + "aside[class*=\"you-may-like\" i]", + "div.related-articles", + "[data-component=\"recommendations\"]" + ] + } + }, + { + "name": "github-issues", + "host": "github.com", + "path": "/*/issues/*", + "fetch": { + "render": "force", + "wait_for": ".js-comment-body", + "wait_timeout_ms": 5000 + } + } +] From 75dcda851cee4323dba92250564166f3642e934d Mon Sep 17 00:00:00 2001 
From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:37:34 +0200 Subject: [PATCH 20/26] feat(playwright-sidecar): accept waitFor, waitTimeoutMs, mobileUa --- playwright-sidecar/app.py | 49 +++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/playwright-sidecar/app.py b/playwright-sidecar/app.py index 6b1948c..a8313af 100644 --- a/playwright-sidecar/app.py +++ b/playwright-sidecar/app.py @@ -44,6 +44,9 @@ async def lifespan(_app: FastAPI): class RenderRequest(BaseModel): url: str + waitFor: str | None = None + waitTimeoutMs: int | None = None + mobileUa: bool = False @app.get("/health") @@ -56,15 +59,44 @@ def health(): } -async def _render(url: str) -> str: - context = await state["browser"].new_context(user_agent=USER_AGENT) +async def _render(url: str, wait_for: str | None = None, wait_timeout_ms: int | None = None, mobile_ua: bool = False) -> str: + if mobile_ua: + device = state["pw"].devices.get("iPhone 13") + if device is None: + # Fallback: hand-crafted mobile context if the device profile is unavailable + context = await state["browser"].new_context( + user_agent=( + "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1" + ), + viewport={"width": 390, "height": 844}, + device_scale_factor=3, + is_mobile=True, + has_touch=True, + ) + else: + context = await state["browser"].new_context(**device) + else: + context = await state["browser"].new_context(user_agent=USER_AGENT) + try: page = await context.new_page() await page.goto(url, wait_until="domcontentloaded", timeout=NAV_TIMEOUT_MS) - try: - await page.wait_for_load_state("networkidle", timeout=NETWORKIDLE_TIMEOUT_MS) - except PWTimeout: - log.info("networkidle timeout, returning current DOM: %s", url) + + if wait_for: + # Recipe-driven: wait for a specific selector instead of networkidle + timeout = max(0, min(wait_timeout_ms or 
5000, 15_000)) + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except PWTimeout: + log.info("wait_for selector timeout, returning current DOM: %s (selector=%s)", url, wait_for) + else: + # Default behavior: wait for networkidle as before + try: + await page.wait_for_load_state("networkidle", timeout=NETWORKIDLE_TIMEOUT_MS) + except PWTimeout: + log.info("networkidle timeout, returning current DOM: %s", url) + return await page.content() finally: await context.close() @@ -81,7 +113,10 @@ async def render(req: RenderRequest): async with sem: try: - return await asyncio.wait_for(_render(req.url), timeout=HARD_TIMEOUT_S) + return await asyncio.wait_for( + _render(req.url, wait_for=req.waitFor, wait_timeout_ms=req.waitTimeoutMs, mobile_ua=req.mobileUa), + timeout=HARD_TIMEOUT_S, + ) except asyncio.TimeoutError: raise HTTPException(status_code=504, detail=f"render timeout after {HARD_TIMEOUT_S}s") except Exception as exc: From ca93b7ef7c91c44275ebc4105d492af44323d636 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:38:46 +0200 Subject: [PATCH 21/26] chore: bump to v2.2.0 + CHANGELOG + MIGRATION for recipe engine --- MIGRATION.md | 65 +++++++++++++++++++++++++++++++++++++++++++++++ package-lock.json | 4 +-- package.json | 2 +- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/MIGRATION.md b/MIGRATION.md index 879659a..3f05597 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -69,3 +69,68 @@ If something goes wrong: 4. Restart. The `users`/`sessions`/`api_keys`/`user_fetches` tables and the `user_id` column on `conversions` are unused by v1.x and can stay if you're not restoring; v1.x ignores them. + +# Migrating from v2.1.x to v2.2.0 + +v2.2.0 ships the Site Recipe Engine (#18). Pure additive change — existing instances keep working unchanged. This section covers what to know if you want to use recipes. + +## Pin v2 tags explicitly + +`:latest` stays on v1.x until 2026-05-16. 
Update your compose / k8s manifests: + +```yaml +# Before +image: aeternalabshq/pullmd:latest +# After +image: aeternalabshq/pullmd:2.2.0 +# Also bump the playwright sidecar — wait_for and mobile_ua need the new sidecar: +image: aeternalabshq/pullmd-playwright:2.2.0 +``` + +## Optional: mount user recipes + +The default recipes in `site-recipes.default.json` cover Future PLC sites and GitHub Issues out of the box. To add your own: + +```yaml +services: + pullmd: + image: aeternalabshq/pullmd:2.2.0 + volumes: + - ./data:/app/data + # Drop your custom recipes at ./data/site-recipes.json on the host + # PullMD auto-discovers it. Or set PULLMD_SITE_RECIPES to a different path: + environment: + - PULLMD_SITE_RECIPES=/path/to/your/recipes.json +``` + +User recipes are concatenated with the defaults. On scalar conflicts (e.g. both define `extractor` for the same host), the user file wins via ordering. + +## Schema migrations + +The `meta` table is created automatically on first boot — no manual SQL. Existing cache rows remain valid until the first recipe content change is detected (a SHA-256 hash of the recipe file content is computed at boot; on change, `recipes_invalidated_at` is bumped and old cache rows lazy-refresh on next access). + +## Monitoring + +`GET /api/recipes/status` returns `{ ok, loaded, rejected, sources }` — public, no auth. Add it to UptimeKuma / Healthchecks / equivalent to be alerted when a recipe fails to parse: + +```json +{ + "ok": true, + "loaded": 5, + "rejected": 0, + "sources": [ + { "path": "site-recipes.default.json", "loaded": 4, "rejected": 0 }, + { "path": "/app/data/site-recipes.json", "loaded": 1, "rejected": 0 } + ] +} +``` + +`ok = (rejected === 0)`. HTTP always returns 200; use the `ok` field for monitoring decisions. Rejection details are in stderr at server start (`docker logs pullmd 2>&1 | grep recipes`). + +## Rolling back to v2.1.x + +The schema change is additive (new `meta` table, no column changes on existing tables). To roll back: + +1. 
Stop v2.2.0 container. +2. Pin to `aeternalabshq/pullmd:2.1.0`. +3. Restart. The `meta` table stays — v2.1.x ignores it. diff --git a/package-lock.json b/package-lock.json index 0ae4111..1bbf341 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "pullmd", - "version": "2.0.0", + "version": "2.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "pullmd", - "version": "2.0.0", + "version": "2.2.0", "license": "AGPL-3.0-or-later", "dependencies": { "@modelcontextprotocol/sdk": "^1.29.0", diff --git a/package.json b/package.json index 586d626..5dfb742 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "pullmd", - "version": "2.1.0", + "version": "2.2.0", "type": "module", "main": "server.js", "license": "AGPL-3.0-or-later", From e3143878ca51ccc560e2fd3d757541361e8f4bd0 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 11:39:15 +0200 Subject: [PATCH 22/26] docs(changelog): v2.2.0 entry for recipe engine --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1c3c5e..4b737c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,42 @@ # Changelog +## v2.2.0 — 2026-05-XX + +### Added + +- **Site Recipe Engine** (#18). Declarative `site-recipes.json` for per-host preprocess, fetch, select, and extractor rules. Default recipes ship in the repo (`site-recipes.default.json`); self-hosters can mount `data/site-recipes.json` or set `PULLMD_SITE_RECIPES` to point elsewhere. 
Four recipe categories: + - `preprocess` — DOM cleanup actions (`remove-attr`, `remove-class`, `remove-element`, `unwrap`) applied before extraction + - `fetch` — render forcing (`render: force|skip`), wait-for selector, mobile UA + - `select` — extra remove-selectors added to `cleanDom` + - `extractor` — preferred extractor per host (`readability`, `trafilatura`, `playwright`) +- New endpoint `GET /api/recipes/status` (public, no auth) — counts loaded/rejected recipes per source for monitoring. +- Cache invalidation on recipe change. When recipe content changes between server boots, all cache rows become stale and re-extract on next access (lazy, on-demand). +- Playwright sidecar accepts new optional fields: `waitFor` (CSS selector), `waitTimeoutMs` (capped at 15000), `mobileUa` (boolean). Backwards compatible — all three fields are optional, and requests that omit them render as before. +- Initial default recipes covering Future PLC sites (paywall + recommendation widgets) and GitHub Issues (JS-rendered comments). +- The Playwright sidecar bundles `playwright-stealth` to mitigate `navigator.webdriver`-style headless detection on JS-driven anti-bot pages. + +### Known limitations + +- **Sites behind cookie-based consent walls** (third-party CMP frameworks like TCF v2) are not unlocked by recipes alone in this release. Such sites redirect non-consenting visitors to a JS-rendered consent UI and only return article content once HttpOnly cookies are set after a click. A future release will add a `fetch.cookies` recipe field so operators can paste their own consent state when they choose to. For now, write a custom recipe with whatever combination of `select.remove`, `extractor`, and `fetch` settings works for your specific source — the engine supports the experimentation, the defaults stay conservative. + +### Important — `:latest` tag stays on v1.x + +The `:latest` tag in Docker Hub and GHCR remains pinned to v1.x until the scheduled flip on 2026-05-16. 
Self-hosters wanting the recipe engine **must pin `:2.2.0`** (or `:2.2`) explicitly for both `pullmd` and `pullmd-playwright`. Pulling `:latest` continues to give you v1, **without** the recipe engine. + +```yaml +services: + pullmd: + image: aeternalabshq/pullmd:2.2.0 + playwright: + image: aeternalabshq/pullmd-playwright:2.2.0 +``` + +### Migration + +- New `meta` table created automatically on first boot. No action required. +- Existing cache rows remain valid until the first recipe content change is detected. +- See `MIGRATION.md` for the full upgrade path. + ## v2.1.0 — 2026-05-05 ### Added From e1947a7ab0ec32906bd025bc0664e60b212e0e65 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 14:55:09 +0200 Subject: [PATCH 23/26] feat(playwright-sidecar): bundle playwright-stealth for headless detection mitigation --- playwright-sidecar/app.py | 20 ++++++++++++++++++++ playwright-sidecar/requirements.txt | 1 + 2 files changed, 21 insertions(+) diff --git a/playwright-sidecar/app.py b/playwright-sidecar/app.py index a8313af..8f75fdd 100644 --- a/playwright-sidecar/app.py +++ b/playwright-sidecar/app.py @@ -26,6 +26,22 @@ log = logging.getLogger("playwright-sidecar") state: dict = {"browser": None, "pw": None, "sem": asyncio.Semaphore(MAX_CONCURRENCY)} +# Stealth: defeat navigator.webdriver and other headless markers. +# API has changed across versions; try the current modern entrypoint with a fallback. 
+try: + from playwright_stealth import Stealth as _Stealth + _stealth = _Stealth() + async def _apply_stealth(page): + # Modern API (>= 2.x): instance method on Stealth + await _stealth.apply_stealth_async(page) +except (ImportError, AttributeError): + try: + from playwright_stealth import stealth_async as _apply_stealth # legacy 1.x + except ImportError: + async def _apply_stealth(page): + pass + log.warning("playwright-stealth not installed; running without bot-detection mitigation") + @asynccontextmanager async def lifespan(_app: FastAPI): @@ -81,6 +97,10 @@ async def _render(url: str, wait_for: str | None = None, wait_timeout_ms: int | try: page = await context.new_page() + try: + await _apply_stealth(page) + except Exception as e: + log.warning("stealth apply failed (non-fatal): %s", e) await page.goto(url, wait_until="domcontentloaded", timeout=NAV_TIMEOUT_MS) if wait_for: diff --git a/playwright-sidecar/requirements.txt b/playwright-sidecar/requirements.txt index 1e9abbc..9ce42cf 100644 --- a/playwright-sidecar/requirements.txt +++ b/playwright-sidecar/requirements.txt @@ -1,3 +1,4 @@ fastapi==0.115.0 uvicorn[standard]==0.32.0 playwright==1.49.0 +playwright-stealth From 5742e037a1672ae81b6221155eb605f2d347670e Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 15:23:26 +0200 Subject: [PATCH 24/26] feat(playwright): forward rotated User-Agent from pool to sidecar Pick the UA once per request in extractWeb and pass it to renderClient so static fetch and Playwright render share the same identity. renderViaSidecar forwards it as `userAgent` in the POST body (omitted when undefined for backwards compatibility). 
Co-Authored-By: Claude Sonnet 4.6 --- lib/playwright-client.js | 3 ++- lib/web.js | 4 +++- test/playwright-client.test.js | 15 +++++++++++++++ test/web.test.js | 23 +++++++++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/lib/playwright-client.js b/lib/playwright-client.js index d06809c..0b91228 100644 --- a/lib/playwright-client.js +++ b/lib/playwright-client.js @@ -9,7 +9,7 @@ const SIDECAR_TIMEOUT_MS = 25_000; * @param {typeof fetch} [opts.fetch] Injectable for tests * @returns {Promise} rendered HTML */ -export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThis.fetch, waitFor, waitTimeoutMs, mobileUa } = {}) { +export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThis.fetch, waitFor, waitTimeoutMs, mobileUa, userAgent } = {}) { if (!process.env.PLAYWRIGHT_URL) throw new Error('Playwright sidecar not configured (PLAYWRIGHT_URL env)'); const ctrl = new AbortController(); @@ -24,6 +24,7 @@ export async function renderViaSidecar(url, { signal, fetch: fetchFn = globalThi if (waitFor !== undefined) body.waitFor = waitFor; if (waitTimeoutMs !== undefined) body.waitTimeoutMs = waitTimeoutMs; if (mobileUa !== undefined) body.mobileUa = mobileUa; + if (userAgent !== undefined) body.userAgent = userAgent; try { const res = await fetchFn(process.env.PLAYWRIGHT_URL, { diff --git a/lib/web.js b/lib/web.js index 6897d4a..d91fa27 100644 --- a/lib/web.js +++ b/lib/web.js @@ -318,7 +318,8 @@ export async function extractWeb(url, options = {}) { // TTL check; the actual refresh (if any) does not block this request. 
void maybeRefreshUaPool(); - const headers = { 'User-Agent': pickUserAgent() }; + const userAgent = pickUserAgent(); + const headers = { 'User-Agent': userAgent }; if (!comments) { headers['Accept'] = 'text/markdown, text/html;q=0.9, */*;q=0.8'; } @@ -369,6 +370,7 @@ export async function extractWeb(url, options = {}) { waitFor: recipe?.fetch?.wait_for, waitTimeoutMs: recipe?.fetch?.wait_timeout_ms, mobileUa: recipe?.fetch?.mobile_ua, + userAgent, }); const rendered = await convertWithReadability(url, renderedHtml, comments, statusCode, rawFetchFn, effectiveExtractor, recipe); rendered.source = 'playwright'; diff --git a/test/playwright-client.test.js b/test/playwright-client.test.js index 4bbcdf8..9d54d6c 100644 --- a/test/playwright-client.test.js +++ b/test/playwright-client.test.js @@ -87,4 +87,19 @@ describe('renderViaSidecar — recipe-driven options', () => { await renderViaSidecar('https://example.com/', { fetch: mockFetch }); assert.deepEqual(Object.keys(captured), ['url']); }); + + it('forwards userAgent in POST body when set', async () => { + let captured; + const mockFetch = async (url, opts) => { + captured = JSON.parse(opts.body); + return { ok: true, text: async () => '' }; + }; + process.env.PLAYWRIGHT_URL = 'http://sidecar.test/'; + const { renderViaSidecar } = await import('../lib/playwright-client.js'); + await renderViaSidecar('https://example.com/', { + fetch: mockFetch, + userAgent: 'Mozilla/5.0 (Test) Test/1.0', + }); + assert.equal(captured.userAgent, 'Mozilla/5.0 (Test) Test/1.0'); + }); }); diff --git a/test/web.test.js b/test/web.test.js index d4e5910..4d84f20 100644 --- a/test/web.test.js +++ b/test/web.test.js @@ -673,4 +673,27 @@ describe('extractWeb — Hook 3 (playwright fetch options)', () => { assert.equal(renderOpts.waitTimeoutMs, 3000); assert.equal(renderOpts.mobileUa, true); }); + + it('passes a User-Agent string to renderClient (from the rotation pool)', async () => { + const recipes = [{ + name: 'r', host: 'example.com', path: 
'/**', + preprocess: [], select: { remove: [] }, + fetch: { render: 'force' }, // force render to exercise the renderClient path + }]; + let renderOpts; + const fetcher = mockFetch({ + ok: true, + headers: { get: (h) => h === 'content-type' ? 'text/html' : null }, + text: async () => '

x

', + arrayBuffer: async () => new TextEncoder().encode('

x

').buffer, + status: 200, + }); + const renderClient = async (url, opts) => { + renderOpts = opts; + return '

R

rendered substantial body content paragraph for testing pipeline.

'; + }; + await extractWeb('https://example.com/', { fetch: fetcher, renderClient, recipes }); + assert.ok(typeof renderOpts.userAgent === 'string', 'userAgent should be a string'); + assert.match(renderOpts.userAgent, /Mozilla\//, 'userAgent should look like a real UA string'); + }); }); From 8754d09466fe1af980de49736159aa1668f71eb8 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 15:23:30 +0200 Subject: [PATCH 25/26] feat(playwright-sidecar): accept userAgent override (defaults to hardcoded UA) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add optional `userAgent` field to RenderRequest and `_render`. Desktop branch uses `user_agent or USER_AGENT`; mobile branch is untouched — the iPhone device profile always wins there. Co-Authored-By: Claude Sonnet 4.6 --- playwright-sidecar/app.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/playwright-sidecar/app.py b/playwright-sidecar/app.py index 8f75fdd..67f5f5f 100644 --- a/playwright-sidecar/app.py +++ b/playwright-sidecar/app.py @@ -63,6 +63,7 @@ class RenderRequest(BaseModel): waitFor: str | None = None waitTimeoutMs: int | None = None mobileUa: bool = False + userAgent: str | None = None @app.get("/health") @@ -75,7 +76,7 @@ def health(): } -async def _render(url: str, wait_for: str | None = None, wait_timeout_ms: int | None = None, mobile_ua: bool = False) -> str: +async def _render(url: str, wait_for: str | None = None, wait_timeout_ms: int | None = None, mobile_ua: bool = False, user_agent: str | None = None) -> str: if mobile_ua: device = state["pw"].devices.get("iPhone 13") if device is None: @@ -93,7 +94,7 @@ async def _render(url: str, wait_for: str | None = None, wait_timeout_ms: int | else: context = await state["browser"].new_context(**device) else: - context = await state["browser"].new_context(user_agent=USER_AGENT) + context = await 
state["browser"].new_context(user_agent=user_agent or USER_AGENT) try: page = await context.new_page() @@ -134,7 +135,7 @@ async def render(req: RenderRequest): async with sem: try: return await asyncio.wait_for( - _render(req.url, wait_for=req.waitFor, wait_timeout_ms=req.waitTimeoutMs, mobile_ua=req.mobileUa), + _render(req.url, wait_for=req.waitFor, wait_timeout_ms=req.waitTimeoutMs, mobile_ua=req.mobileUa, user_agent=req.userAgent), timeout=HARD_TIMEOUT_S, ) except asyncio.TimeoutError: From 04494e0b3cd9c7de38ef72efaae20029a7b3fa14 Mon Sep 17 00:00:00 2001 From: syswave-dev <263179084+syswave-dev@users.noreply.github.com> Date: Wed, 6 May 2026 18:44:32 +0200 Subject: [PATCH 26/26] =?UTF-8?q?fix(recipes):=20github-issues=20path=20gl?= =?UTF-8?q?ob=20=E2=80=94=20match=20org/repo/issues/N=20(3=20segments)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recipe pattern was '/*/issues/*' which only matches '/foo/issues/N', not GitHub's actual URL structure '/org/repo/issues/N' (3 segments). The recipe never applied to real GitHub URLs, so issue conversions silently fell through to the readability path with comments left JS-rendered and absent from the markdown. Reproduced against test instance with github.com/AeternaLabsHQ/pullmd/issues/10: cache row had source=readability, indicating recipe wasn't matched. Added a matcher test pinning the real-world path shape. 
--- site-recipes.default.json | 2 +- test/recipes-matcher.test.js | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/site-recipes.default.json b/site-recipes.default.json index ea73469..700c98b 100644 --- a/site-recipes.default.json +++ b/site-recipes.default.json @@ -35,7 +35,7 @@ { "name": "github-issues", "host": "github.com", - "path": "/*/issues/*", + "path": "/*/*/issues/*", "fetch": { "render": "force", "wait_for": ".js-comment-body", diff --git a/test/recipes-matcher.test.js b/test/recipes-matcher.test.js index 13992ab..06b70f3 100644 --- a/test/recipes-matcher.test.js +++ b/test/recipes-matcher.test.js @@ -125,4 +125,13 @@ describe('matchRecipesAgainst', () => { assert.deepEqual(merged.preprocess, []); assert.equal(merged.extractor, undefined); }); + + it('matches real GitHub issue URLs (org/repo/issues/N)', () => { + const ghRecipes = [ + { name: 'gh', host: 'github.com', path: '/*/*/issues/*', + preprocess: [], select: { remove: [] }, fetch: { render: 'force' } }, + ]; + const merged = matchRecipesAgainst(ghRecipes, new URL('https://github.com/AeternaLabsHQ/pullmd/issues/10')); + assert.equal(merged.fetch.render, 'force', 'three-segment github path must match /*/*/issues/*'); + }); });