From b16c7c66192a25b09b072d5fa4cc860c619ecb1c Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 5 May 2026 12:10:26 +0200
Subject: [PATCH 01/13] add mapper helper and crazy CI

---
 .../workflows/zeeschuimer_map_item_sync.yml   | 259 ++++++++++
 helper-scripts/map_item_converter.py          | 488 ++++++++++++++++++
 2 files changed, 747 insertions(+)
 create mode 100644 .github/workflows/zeeschuimer_map_item_sync.yml
 create mode 100644 helper-scripts/map_item_converter.py

diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
new file mode 100644
index 000000000..203df44c8
--- /dev/null
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -0,0 +1,259 @@
+# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
+# and open a draft PR against digitalmethodsinitiative/zeeschuimer.
+#
+# Triggers on pushes to master that touch any Zeeschuimer datasource (or the
+# helper script itself). Also exposes a `workflow_dispatch` trigger with a
+# `bootstrap` input for the initial run that translates every mapped datasource at
+# once.
+#
+# Required secrets (configured in repo Settings -> Secrets and variables -> Actions):
+#   DMI_OLLAMA_KEY               - API key for https://ollama.digitalmethods.net (already used by helper script)
+#   ZEESCHUIMER_APP_ID           - numeric App ID of the GitHub App installed on
+#                                  digitalmethodsinitiative/zeeschuimer with permissions
+#                                  contents:write + pull-requests:write (and nothing else)
+#   ZEESCHUIMER_APP_PRIVATE_KEY  - full PEM private key for that App (including BEGIN/END lines)
+
+name: Sync Zeeschuimer map_item from 4CAT
+
+on:
+  push:
+    branches: [master]
+    paths:
+      - 'datasources/**/search_*.py'
+      - 'helper-scripts/map_item_converter.py'
+      - '.github/workflows/zeeschuimer_map_item_sync.yml'
+  workflow_dispatch:
+    inputs:
+      bootstrap:
+        description: 'Translate every Zeeschuimer datasource (initial sync). Ignored if "files" is set.'
+        type: boolean
+        default: false
+      files:
+        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap.'
+        type: string
+        default: ''
+      model:
+        description: 'Override LLM model on DMI Ollama (default: gemma3:4b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
+        type: string
+        default: ''
+
+jobs:
+  sync:
+    name: Translate map_item and open Zeeschuimer PR
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+
+      - name: Determine changed datasource files
+        id: changed
+        run: |
+          # Manual run with explicit files takes precedence
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.files }}" ]; then
+            echo "mode=files" >> "$GITHUB_OUTPUT"
+            echo "files=${{ inputs.files }}" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.bootstrap }}" = "true" ]; then
+            echo "mode=bootstrap" >> "$GITHUB_OUTPUT"
+            echo "files=" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          changed=$(git diff --name-only "${{ github.event.before }}" "${{ github.sha }}" -- 'datasources/*/search_*.py' || true)
+          if [ -z "$changed" ]; then
+            echo "mode=none" >> "$GITHUB_OUTPUT"
+            echo "files=" >> "$GITHUB_OUTPUT"
+          else
+            echo "mode=files" >> "$GITHUB_OUTPUT"
+            files=$(echo "$changed" | tr '\n' ' ')
+            echo "files=$files" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Exit early if nothing to do
+        if: steps.changed.outputs.mode == 'none'
+        run: |
+          echo "No Zeeschuimer datasource files changed; nothing to translate."
+          exit 0
+
+      - name: Set up Python
+        if: steps.changed.outputs.mode != 'none'
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install LLM dependencies
+        if: steps.changed.outputs.mode != 'none'
+        run: |
+          # LLMAdapter (common/lib/llm.py) imports every provider's langchain
+          # package at module load, so all of these are required even though
+          # we only use the Ollama provider at runtime.
+          pip install \
+            langchain-core \
+            langchain-ollama \
+            langchain-openai \
+            langchain-anthropic \
+            langchain-google-genai \
+            langchain-mistralai \
+            langchain-deepseek \
+            pydantic \
+            requests
+
+      - name: Mint Zeeschuimer App token
+        if: steps.changed.outputs.mode != 'none'
+        id: app_token
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ secrets.ZEESCHUIMER_APP_ID }}
+          private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }}
+          owner: digitalmethodsinitiative
+          repositories: zeeschuimer
+
+      - name: Checkout Zeeschuimer
+        if: steps.changed.outputs.mode != 'none'
+        uses: actions/checkout@v4
+        with:
+          repository: digitalmethodsinitiative/zeeschuimer
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+
+      - name: Run translation
+        if: steps.changed.outputs.mode != 'none'
+        env:
+          DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }}
+          LLM_MODEL: ${{ inputs.model || 'gemma3:4b' }}
+        run: |
+          if [ "${{ steps.changed.outputs.mode }}" = "bootstrap" ]; then
+            python helper-scripts/map_item_converter.py \
+              --bootstrap \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          else
+            python helper-scripts/map_item_converter.py \
+              --files ${{ steps.changed.outputs.files }} \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          fi
+
+      - name: Build PR body
+        if: steps.changed.outputs.mode != 'none'
+        id: pr_body
+        env:
+          MODE: ${{ steps.changed.outputs.mode }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
+          RUN_ID: ${{ github.run_id }}
+          EVENT_NAME: ${{ github.event_name }}
+        run: |
+          python - <<'EOF'
+          import json
+          import os
+          import subprocess
+
+          mode = os.environ["MODE"]
+          before = os.environ["BEFORE_SHA"]
+          after = os.environ["AFTER_SHA"]
+          run_id = os.environ["RUN_ID"]
+          event_name = os.environ["EVENT_NAME"]
+          repo = "${{ github.repository }}"
+
+          with open("manifest.json") as f:
+              manifest = json.load(f)
+
+          model = manifest.get("model", "(unknown)")
+          provider = manifest.get("provider", "ollama")
+          structured_output = manifest.get("structured_output", False)
+          entries = manifest.get("entries", [])
+
+          short_sha = after[:7]
+          lines = []
+          lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). It is a **draft** because the JS was produced by an LLM and needs human review and end-to-end extension testing before merging.".format(repo, run_id))
+          lines.append("")
+          lines.append("## Generation parameters")
+          lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`)".format(model, provider, structured_output))
+          if mode == "bootstrap":
+              lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).")
+          elif event_name == "workflow_dispatch":
+              lines.append("- **Trigger:** manual `workflow_dispatch` with explicit file list.")
+          else:
+              lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master.".format(short_sha, repo, after))
+          lines.append("")
+
+          ok = [e for e in entries if e["status"] == "ok"]
+          failed = [e for e in entries if e["status"] == "failed"]
+          skipped = [e for e in entries if e["status"] == "skipped"]
+
+          lines.append("## Summary")
+          lines.append("- :white_check_mark: {} translated".format(len(ok)))
+          lines.append("- :x: {} failed".format(len(failed)))
+          lines.append("- :grey_question: {} skipped".format(len(skipped)))
+          lines.append("")
+
+          for entry in ok:
+              lines.append("## `{}` -> `{}`".format(entry["python_file"], entry["js_file"]))
+              if entry.get("commentary"):
+                  lines.append("**LLM commentary:**")
+                  lines.append("")
+                  lines.append("> " + entry["commentary"].replace("\n", "\n> "))
+                  lines.append("")
+              if event_name == "push":
+                  try:
+                      diff = subprocess.check_output(
+                          ["git", "diff", "{}..{}".format(before, after), "--", entry["python_file"]],
+                          text=True,
+                      )
+                  except subprocess.CalledProcessError:
+                      diff = ""
+              else:
+                  diff = ""
+              if diff.strip():  # non-empty only when a push-event git diff succeeded
+                  lines.append("<details><summary>Python diff</summary>")
+                  lines.append("")
+                  lines.append("```diff")
+                  lines.append(diff.rstrip())
+                  lines.append("```")
+                  lines.append("</details>")
+                  lines.append("")
+
+          if failed:
+              lines.append("## Failures")
+              for entry in failed:
+                  lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "(no error message)")))
+              lines.append("")
+
+          if skipped:
+              lines.append("## Skipped")
+              for entry in skipped:
+                  lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "")))
+              lines.append("")
+
+          body = "\n".join(lines)
+          with open("pr_body.md", "w", encoding="utf-8") as f:
+              f.write(body)
+          print("Wrote pr_body.md ({} chars)".format(len(body)))
+          EOF
+
+      - name: Check there are JS changes to PR
+        if: steps.changed.outputs.mode != 'none'
+        id: have_changes
+        working-directory: zeeschuimer-checkout
+        run: |
+          if [ -z "$(git status --porcelain)" ]; then
+            echo "has_changes=false" >> "$GITHUB_OUTPUT"
+            echo "No JS changes produced by translation; not opening a PR."
+          else
+            echo "has_changes=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open or update Zeeschuimer PR
+        if: steps.changed.outputs.mode != 'none' && steps.have_changes.outputs.has_changes == 'true'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+          branch: auto/4cat-map-item-sync
+          title: "Auto-translated map_item updates from 4CAT @ ${{ github.sha }}"
+          commit-message: "chore: sync map_item from 4CAT ${{ github.sha }}"
+          body-path: pr_body.md
+          draft: true
diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
new file mode 100644
index 000000000..ffe6028a8
--- /dev/null
+++ b/helper-scripts/map_item_converter.py
@@ -0,0 +1,488 @@
+"""
+Translate 4CAT Zeeschuimer-import datasource `map_item` functions from Python
+to JavaScript and splice them into the corresponding Zeeschuimer
+`modules/<platform>.js` file.
+
+Designed to be invoked by a GitHub Action whenever a Zeeschuimer datasource's
+Python file changes on master. Can also be run locally for testing or via
+`workflow_dispatch` with `--bootstrap` to translate every datasource at once.
+
+The LLM produces only the new `map_item` function (plus any imports/helpers it
+needs and free-text commentary). This script does the file integration: it
+locates a marker block in the existing JS module and replaces its contents,
+preserving every hand-written line outside the markers.
+
+Usage:
+    DMI_OLLAMA_KEY=... python helper-scripts/map_item_converter.py \\
+        --files datasources/tiktok/search_tiktok.py \\
+        --zeeschuimer-checkout ../zeeschuimer \\
+        --output-manifest /tmp/manifest.json
+"""
+import argparse
+import ast
+import json
+import os
+import re
+import sys
+import traceback
+from pathlib import Path
+from typing import Optional
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))
+
+from common.lib.llm import LLMAdapter
+
+
+# 4CAT datasource path -> Zeeschuimer module path (relative to checkout root).
+# Verified against https://github.com/digitalmethodsinitiative/zeeschuimer/tree/master/modules
+# Note: facebook has no Zeeschuimer module today, so it's intentionally absent.
+PLATFORM_MAP = {
+    "datasources/douyin/search_douyin.py": "modules/douyin.js",
+    "datasources/gab/search_gab.py": "modules/gab.js",
+    "datasources/imgur/search_imgur.py": "modules/imgur.js",
+    "datasources/instagram/search_instagram.py": "modules/instagram.js",
+    "datasources/linkedin/search_linkedin.py": "modules/linkedin.js",
+    "datasources/ninegag/search_9gag.py": "modules/9gag.js",
+    "datasources/pinterest/search_pinterest.py": "modules/pinterest.js",
+    "datasources/threads/search_threads.py": "modules/threads.js",
+    "datasources/tiktok/search_tiktok.py": "modules/tiktok.js",
+    "datasources/tiktok_comments/search_tiktok_comments.py": "modules/tiktok-comments.js",
+    "datasources/truth/search_truth.py": "modules/truth.js",
+    "datasources/twitter-import/search_twitter.py": "modules/twitter.js",
+    "datasources/xiaohongshu/search_rednote.py": "modules/rednote.js",
+    "datasources/xiaohongshu_comments/search_rednote_comments.py": "modules/rednote-comments.js",
+}
+
+DEFAULT_MODEL = "gemma3:4b"
+
+IMPORTS_MARKER_START = "// === auto-generated imports for map_item — DO NOT EDIT BY HAND ==="
+IMPORTS_MARKER_END = "// === end auto-generated imports ==="
+BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ==="
+BLOCK_MARKER_END = "// === end auto-generated ==="
+
+LLM_SCHEMA = {
+    "title": "MapItemTranslation",
+    "type": "object",
+    "required": ["map_item_function", "imports_to_add", "helpers_to_add", "commentary"],
+    "properties": {
+        "map_item_function": {
+            "type": "string",
+            "description": (
+                "Full JavaScript source of the new map_item function. Must include "
+                "the function declaration (e.g. 'export function map_item(item) {...}'). "
+                "Do not include surrounding code from the module."
+            ),
+        },
+        "imports_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Complete import statements (one per array entry) that map_item needs "
+                "and that are not already present in the existing module. Empty array "
+                "if none needed."
+            ),
+        },
+        "helpers_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Full source of any helper functions map_item depends on (e.g. a JS "
+                "port of normalize_url_encoding). Empty array if none needed."
+            ),
+        },
+        "commentary": {
+            "type": "string",
+            "description": (
+                "Notes for the human reviewer: assumptions made, fields you were "
+                "unsure about, Python idioms that don't translate cleanly. Plain text."
+            ),
+        },
+    },
+}
+
+SYSTEM_PROMPT = (
+    "You translate 4CAT Python `map_item` functions into JavaScript for the "
+    "Zeeschuimer browser extension. You return ONLY the new map_item function, "
+    "any imports it needs, any helper functions it depends on, and commentary "
+    "for the human reviewer. You NEVER return the surrounding module file. "
+    "You preserve the field names produced by the Python function exactly. "
+    "You do not invent fields not present in the Python output."
+)
+
+
+def is_zeeschuimer_datasource(python_path: Path) -> bool:
+    """
+    Returns True if the given Python file defines a 4CAT Search subclass with
+    `is_from_zeeschuimer = True` as a class attribute.
+    """
+    try:
+        tree = ast.parse(python_path.read_text(encoding="utf-8"))
+    except (SyntaxError, OSError):
+        return False
+
+    for node in ast.walk(tree):
+        if not isinstance(node, ast.ClassDef):
+            continue
+        for stmt in node.body:
+            if not isinstance(stmt, ast.Assign):
+                continue
+            for target in stmt.targets:
+                if isinstance(target, ast.Name) and target.id == "is_from_zeeschuimer":
+                    if isinstance(stmt.value, ast.Constant) and stmt.value.value is True:
+                        return True
+    return False
+
+
+def discover_bootstrap_files(repo_root: Path) -> list[Path]:
+    """
+    Find every datasource file in PLATFORM_MAP whose class is a Zeeschuimer
+    datasource. Returns absolute paths.
+    """
+    found = []
+    for rel in PLATFORM_MAP:
+        path = repo_root / rel
+        if path.exists() and is_zeeschuimer_datasource(path):
+            found.append(path)
+    return sorted(found)
+
+
+def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str:
+    return (
+        f"# Source Python file ({python_rel})\n"  # python_rel is already repo-relative
+        "This is the file that just changed in 4CAT. The `map_item` function on the "
+        "class is the source of truth — your JavaScript translation must produce an "
+        "object with the same field names and equivalent values.\n\n"
+        f"```python\n{python_source}\n```\n\n"
+        "# Existing Zeeschuimer module\n"
+        "This module's `capture()` function returns the raw items that will be "
+        "passed to `map_item(item)` as `item`. Use it to understand the input shape "
+        "and to match the existing code style (ES modules, `export` keyword, etc.).\n\n"
+        f"```javascript\n{existing_module_source}\n```\n\n"
+        "# Task\n"
+        "Produce a JavaScript `map_item(item)` function that mirrors the Python "
+        "`map_item`. Re-implement any Python helpers it calls (e.g. `normalize_url_encoding`, "
+        "`urlparse`/`parse_qs`, `datetime` formatting) inline in JS — either inside "
+        "`map_item_function` or as separate snippets in `helpers_to_add`. If you reference "
+        "the `MappedItem` class, list its import in `imports_to_add` (it lives in `../js/lib.js`). "
+        "Use `export function map_item(item) { ... }` to match this module's style."
+    )
+
+
+def parse_freetext_response(text: str) -> dict:
+    """
+    Fallback parser for when structured output is disabled or unreliable.
+    Looks for a fenced JS block (the function) and treats remaining text as
+    commentary.
+    """
+    js_match = re.search(r"```(?:js|javascript)\s*\n(.*?)```", text, re.DOTALL)
+    map_item_function = js_match.group(1).strip() if js_match else ""
+    commentary = re.sub(r"```(?:js|javascript)\s*\n.*?```", "", text, flags=re.DOTALL).strip()
+    return {
+        "map_item_function": map_item_function,
+        "imports_to_add": [],
+        "helpers_to_add": [],
+        "commentary": commentary,
+    }
+
+
+def validate_translation(translation: dict) -> Optional[str]:
+    """
+    Returns None if the translation passes basic sanity checks, else a string
+    describing what went wrong.
+    """
+    fn = translation.get("map_item_function", "").strip()
+    if not fn:
+        return "LLM returned empty map_item_function"
+    if not re.search(r"\bmap_item\b", fn):
+        return "LLM output does not contain `map_item` identifier"
+    if not re.search(r"function\s+map_item|map_item\s*=|map_item\s*:", fn):
+        return "LLM output does not declare `map_item` as a function"
+    return None
+
+
+def splice_into_module(existing: str, translation: dict, python_rel: str) -> str:
+    """
+    Idempotently insert / replace the auto-generated marker blocks in the JS
+    module text.
+
+    Raises ValueError if exactly one of (start, end) markers is present —
+    that means the file is corrupted or partially hand-edited and we should
+    refuse to touch it.
+    """
+    main_block_body = []
+    for helper in translation.get("helpers_to_add", []):
+        helper = helper.strip()
+        if helper:
+            main_block_body.append(helper)
+    fn = translation["map_item_function"].strip()
+    main_block_body.append(fn)
+    main_block = (
+        f"{BLOCK_MARKER_START}\n"
+        f"// (regenerated from {python_rel})\n"
+        + "\n\n".join(main_block_body)
+        + f"\n{BLOCK_MARKER_END}\n"
+    )
+
+    imports = [imp.strip() for imp in translation.get("imports_to_add", []) if imp.strip()]
+    # Drop imports that already appear verbatim outside the marker block.
+    existing_outside_block = re.sub(
+        re.escape(BLOCK_MARKER_START) + r".*?" + re.escape(BLOCK_MARKER_END) + r"\n?",
+        "",
+        existing,
+        flags=re.DOTALL,
+    )
+    existing_outside_imports_block = re.sub(
+        re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?",
+        "",
+        existing_outside_block,
+        flags=re.DOTALL,
+    )
+    imports = [imp for imp in imports if imp not in existing_outside_imports_block]
+
+    imports_block = ""
+    if imports:
+        imports_block = (
+            f"{IMPORTS_MARKER_START}\n"
+            + "\n".join(imports)
+            + f"\n{IMPORTS_MARKER_END}\n"
+        )
+
+    updated = existing
+
+    # Replace or insert imports block.
+    has_imports_start = IMPORTS_MARKER_START in updated
+    has_imports_end = IMPORTS_MARKER_END in updated
+    if has_imports_start ^ has_imports_end:
+        raise ValueError(
+            "Auto-generated imports markers are partially missing in the existing "
+            "module — refusing to overwrite. Restore both markers or remove both."
+        )
+    if has_imports_start and has_imports_end:
+        updated = re.sub(
+            re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?",
+            lambda _m: imports_block,  # callable repl: inserted verbatim, no escape processing
+            updated,
+            count=1,
+            flags=re.DOTALL,
+        )
+    elif imports_block:
+        # Prepend at top of file.
+        if updated and not updated.startswith("\n"):
+            updated = imports_block + "\n" + updated
+        else:
+            updated = imports_block + updated
+
+    # Replace or append main block.
+    has_main_start = BLOCK_MARKER_START in updated
+    has_main_end = BLOCK_MARKER_END in updated
+    if has_main_start ^ has_main_end:
+        raise ValueError(
+            "Auto-generated map_item markers are partially missing in the existing "
+            "module — refusing to overwrite. Restore both markers or remove both."
+        )
+    if has_main_start and has_main_end:
+        updated = re.sub(
+            re.escape(BLOCK_MARKER_START) + r".*?" + re.escape(BLOCK_MARKER_END) + r"\n?",
+            lambda _m: main_block,  # callable repl: JS backslashes (\d, \n, \1) must survive as-is
+            updated,
+            count=1,
+            flags=re.DOTALL,
+        )
+    else:
+        if not updated.endswith("\n"):
+            updated += "\n"
+        updated += "\n" + main_block
+
+    return updated
+
+
+def translate_one(
+    llm: LLMAdapter,
+    python_path: Path,
+    repo_root: Path,
+    zeeschuimer_root: Path,
+    use_structured_output: bool,
+) -> dict:
+    """
+    Translate one Python file. Returns a manifest entry dict.
+    """
+    rel = python_path.relative_to(repo_root).as_posix()
+    entry = {
+        "python_file": rel,
+        "js_file": None,
+        "status": "failed",
+        "commentary": "",
+        "error": None,
+    }
+
+    if not is_zeeschuimer_datasource(python_path):
+        entry["status"] = "skipped"
+        entry["error"] = "not a Zeeschuimer datasource (is_from_zeeschuimer != True)"
+        return entry
+
+    js_rel = PLATFORM_MAP.get(rel)
+    if not js_rel:
+        entry["status"] = "skipped"
+        entry["error"] = f"no Zeeschuimer module mapped for {rel}"
+        return entry
+    entry["js_file"] = js_rel
+
+    js_path = zeeschuimer_root / js_rel
+    if not js_path.exists():
+        entry["status"] = "skipped"
+        entry["error"] = f"Zeeschuimer module {js_rel} does not exist in checkout"
+        return entry
+
+    python_source = python_path.read_text(encoding="utf-8")
+    existing_module = js_path.read_text(encoding="utf-8")
+    user_prompt = build_user_prompt(python_source, existing_module, rel)
+
+    try:
+        response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT)
+    except Exception as e:
+        entry["error"] = f"LLM call failed: {e}"
+        return entry
+
+    if use_structured_output:
+        # with_structured_output returns the parsed dict directly.
+        if isinstance(response, dict):
+            translation = response
+        else:
+            entry["error"] = (
+                f"Expected dict from structured output, got {type(response).__name__}"
+            )
+            return entry
+    else:
+        text = getattr(response, "content", str(response))
+        translation = parse_freetext_response(text)
+
+    bad = validate_translation(translation)
+    if bad:
+        entry["error"] = bad
+        return entry
+
+    try:
+        spliced = splice_into_module(existing_module, translation, rel)
+    except ValueError as e:
+        entry["error"] = str(e)
+        return entry
+
+    js_path.write_text(spliced, encoding="utf-8")
+    entry["status"] = "ok"
+    entry["commentary"] = translation.get("commentary", "").strip()
+    return entry
+
+
+def main():
+    cli = argparse.ArgumentParser(description=__doc__)
+    group = cli.add_mutually_exclusive_group(required=True)
+    group.add_argument("--files", nargs="+", help="Specific datasource files to translate.")
+    group.add_argument(
+        "--bootstrap",
+        action="store_true",
+        help="Translate every Zeeschuimer datasource in the repo.",
+    )
+    cli.add_argument(
+        "--zeeschuimer-checkout",
+        required=True,
+        type=Path,
+        help="Path to a local clone of the Zeeschuimer repo.",
+    )
+    cli.add_argument(
+        "--output-manifest",
+        required=True,
+        type=Path,
+        help="Where to write the JSON manifest of results.",
+    )
+    cli.add_argument(
+        "--model",
+        default=os.environ.get("LLM_MODEL", DEFAULT_MODEL),
+        help=f"Ollama model to use (default: {DEFAULT_MODEL}, or $LLM_MODEL).",
+    )
+    cli.add_argument(
+        "--no-structured-output",
+        action="store_true",
+        help="Disable JSON-schema structured output; parse the response as free text.",
+    )
+    args = cli.parse_args()
+
+    dmi_ollama_key = os.environ.get("DMI_OLLAMA_KEY")
+    if not dmi_ollama_key:
+        sys.exit("Error: DMI_OLLAMA_KEY environment variable not set.")
+
+    repo_root = Path(__file__).resolve().parent.parent
+
+    if args.bootstrap:
+        files = discover_bootstrap_files(repo_root)
+        if not files:
+            sys.exit("No Zeeschuimer datasources found to bootstrap.")
+    else:
+        files = [Path(f).resolve() for f in args.files]
+
+    llm = LLMAdapter(
+        provider="ollama",
+        model=args.model,
+        base_url="https://ollama.digitalmethods.net",
+        temperature=0.2,
+        max_tokens=8192,
+        client_kwargs={"headers": {"X-API-KEY": dmi_ollama_key}},
+    )
+
+    use_structured_output = not args.no_structured_output
+    if use_structured_output:
+        try:
+            llm.set_structure(LLM_SCHEMA)
+        except Exception as e:
+            print(
+                f"Warning: could not enable structured output ({e}); "
+                "falling back to free-text parsing.",
+                file=sys.stderr,
+            )
+            use_structured_output = False
+
+    print(f"Using model: {args.model} (provider: ollama, structured_output: {use_structured_output})")
+
+    entries = []
+    for python_path in files:
+        print(f"Translating {python_path.relative_to(repo_root).as_posix()}...")
+        try:
+            entry = translate_one(
+                llm,
+                python_path,
+                repo_root,
+                args.zeeschuimer_checkout.resolve(),
+                use_structured_output,
+            )
+        except Exception as e:
+            entry = {
+                "python_file": str(python_path),
+                "js_file": None,
+                "status": "failed",
+                "commentary": "",
+                "error": f"unexpected exception: {e}\n{traceback.format_exc()}",
+            }
+        entry["model"] = args.model
+        entries.append(entry)
+        print(f"  -> {entry['status']}" + (f" ({entry['error']})" if entry.get("error") else ""))
+
+    manifest = {
+        "model": args.model,
+        "provider": "ollama",
+        "structured_output": use_structured_output,
+        "entries": entries,
+    }
+    args.output_manifest.parent.mkdir(parents=True, exist_ok=True)
+    args.output_manifest.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
+
+    n_ok = sum(1 for e in entries if e["status"] == "ok")
+    n_failed = sum(1 for e in entries if e["status"] == "failed")
+    n_skipped = sum(1 for e in entries if e["status"] == "skipped")
+    print(f"\nDone with model `{args.model}`: {n_ok} ok, {n_failed} failed, {n_skipped} skipped.")
+    print(f"Manifest written to {args.output_manifest}")
+
+    if n_ok == 0 and n_failed > 0:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()

From 585a2d1446af63944292710c9a2ab85c6f22d364 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 5 May 2026 17:56:46 +0200
Subject: [PATCH 02/13] add streaming and timing and other stuff so i can see
 what's going on

---
 helper-scripts/map_item_converter.py | 197 ++++++++++++++++++++++++---
 1 file changed, 176 insertions(+), 21 deletions(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index ffe6028a8..75d3d2ea8 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -24,6 +24,7 @@
 import os
 import re
 import sys
+import time
 import traceback
 from pathlib import Path
 from typing import Optional
@@ -296,12 +297,91 @@ def splice_into_module(existing: str, translation: dict, python_rel: str) -> str
     return updated
 
 
+REASONING_KEYS = ("reasoning_content", "reasoning", "thinking", "thought")
+
+
+def _extract_reasoning(chunk) -> str:
+    """
+    Return the first non-empty string stored under one of REASONING_KEYS in
+    the chunk's `additional_kwargs`, or "" when there is none. Keys are tried
+    in order because the key name differs between model families.
+    """
+    # `additional_kwargs` may be missing or None; fall back to an empty dict
+    # so the lookups below are always safe.
+    extras = getattr(chunk, "additional_kwargs", None) or {}
+    candidates = (extras.get(name) for name in REASONING_KEYS)
+    return next((c for c in candidates if isinstance(c, str) and c), "")
+
+
+def call_llm_streaming(llm: LLMAdapter, user_prompt: str, system_prompt: str) -> str:
+    """
+    Stream the LLM response to stderr chunk-by-chunk and return only the
+    accumulated *visible content*. Reasoning tokens are echoed live (so a slow
+    reasoning model such as gpt-oss / deepseek-r1 doesn't look frozen) but are
+    NOT part of the returned string, since they aren't the parseable answer.
+
+    `system_prompt` is sent as a SystemMessage only when non-empty. Banner
+    lines ("--- reasoning ---" / "--- output ---") are written on each phase
+    change. Streaming goes through `llm.llm.stream(...)` directly.
+    """
+    from langchain_core.messages import HumanMessage, SystemMessage  # imported lazily, inside the function
+
+    messages = []
+    if system_prompt:
+        messages.append(SystemMessage(content=system_prompt))
+    messages.append(HumanMessage(content=user_prompt))
+
+    sys.stderr.write("\n--- LLM stream begin ---\n")
+    sys.stderr.flush()
+
+    content_chunks: list[str] = []
+    state: Optional[str] = None  # "reasoning" | "content"
+
+    for chunk in llm.llm.stream(messages):
+        reasoning = _extract_reasoning(chunk)
+        content = getattr(chunk, "content", "") or ""  # missing or None content reads as ""
+
+        if reasoning:
+            if state != "reasoning":
+                sys.stderr.write("\n--- reasoning ---\n")
+                state = "reasoning"
+            sys.stderr.write(reasoning)
+            sys.stderr.flush()
+
+        if content:
+            if state != "content":
+                # transitioning from reasoning (or nothing) to visible output
+                sys.stderr.write("\n--- output ---\n")
+                state = "content"
+            sys.stderr.write(content)
+            sys.stderr.flush()
+            content_chunks.append(content)  # only visible content is returned
+
+    sys.stderr.write("\n--- LLM stream end ---\n")
+    sys.stderr.flush()
+    return "".join(content_chunks)
+
+
+def extract_raw_from_exception(exc: BaseException) -> Optional[str]:
+    """
+    Recover raw LLM text from an exception, if it carries any. The attributes
+    `llm_output`, `observation` and `output` are probed in that order and the
+    first one holding a `str` wins; returns None when none of them does.
+    """
+    # Lazily probe each candidate attribute; missing ones read as None.
+    probed = (
+        getattr(exc, name, None) for name in ("llm_output", "observation", "output")
+    )
+    return next((value for value in probed if isinstance(value, str)), None)
+
+
 def translate_one(
     llm: LLMAdapter,
     python_path: Path,
     repo_root: Path,
     zeeschuimer_root: Path,
     use_structured_output: bool,
+    stream: bool,
 ) -> dict:
     """
     Translate one Python file. Returns a manifest entry dict.
@@ -312,6 +392,8 @@ def translate_one(
         "js_file": None,
         "status": "failed",
         "commentary": "",
+        "duration_seconds": None,
+        "raw_response": None,
         "error": None,
     }
 
@@ -337,24 +419,48 @@ def translate_one(
     existing_module = js_path.read_text(encoding="utf-8")
     user_prompt = build_user_prompt(python_source, existing_module, rel)
 
+    started = time.monotonic()
+    raw_response: Optional[str] = None
+    translation: Optional[dict] = None
+    llm_error: Optional[str] = None
+
     try:
-        response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT)
+        if stream:
+            raw_response = call_llm_streaming(llm, user_prompt, SYSTEM_PROMPT)
+            translation = parse_freetext_response(raw_response)
+        elif use_structured_output:
+            try:
+                response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT)
+            except Exception as e:
+                raw_response = extract_raw_from_exception(e)
+                llm_error = f"LLM call failed: {e}"
+                response = None
+            if response is not None:
+                if isinstance(response, dict):
+                    translation = response
+                    # Structured-output success doesn't expose the raw text.
+                else:
+                    llm_error = (
+                        f"Expected dict from structured output, got "
+                        f"{type(response).__name__}"
+                    )
+        else:
+            response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT)
+            raw_response = getattr(response, "content", str(response))
+            translation = parse_freetext_response(raw_response)
     except Exception as e:
-        entry["error"] = f"LLM call failed: {e}"
-        return entry
+        raw_response = raw_response or extract_raw_from_exception(e)
+        llm_error = f"LLM call failed: {e}"
 
-    if use_structured_output:
-        # with_structured_output returns the parsed dict directly.
-        if isinstance(response, dict):
-            translation = response
-        else:
-            entry["error"] = (
-                f"Expected dict from structured output, got {type(response).__name__}"
-            )
-            return entry
-    else:
-        text = getattr(response, "content", str(response))
-        translation = parse_freetext_response(text)
+    entry["duration_seconds"] = round(time.monotonic() - started, 2)
+    entry["raw_response"] = raw_response
+
+    if llm_error:
+        entry["error"] = llm_error
+        return entry
+    if translation is None:
+        entry["error"] = "no translation produced (no error raised, no dict returned)"
+        return entry
 
     bad = validate_translation(translation)
     if bad:
@@ -404,6 +510,25 @@ def main():
         action="store_true",
         help="Disable JSON-schema structured output; parse the response as free text.",
     )
+    cli.add_argument(
+        "--stream",
+        action="store_true",
+        help=(
+            "Stream LLM output to stderr as it is generated, so you can watch a "
+            "slow model work. Implies --no-structured-output (streaming and "
+            "structured output don't mix cleanly)."
+        ),
+    )
+    cli.add_argument(
+        "--no-fail-fast",
+        action="store_true",
+        help=(
+            "Continue translating remaining files even after one fails. By "
+            "default the script aborts on the first failure, since failures here "
+            "are typically configuration- or model-correlated and continuing "
+            "wastes LLM time."
+        ),
+    )
     args = cli.parse_args()
 
     dmi_ollama_key = os.environ.get("DMI_OLLAMA_KEY")
@@ -428,7 +553,7 @@ def main():
         client_kwargs={"headers": {"X-API-KEY": dmi_ollama_key}},
     )
 
-    use_structured_output = not args.no_structured_output
+    use_structured_output = not args.no_structured_output and not args.stream
     if use_structured_output:
         try:
             llm.set_structure(LLM_SCHEMA)
@@ -440,11 +565,19 @@ def main():
             )
             use_structured_output = False
 
-    print(f"Using model: {args.model} (provider: ollama, structured_output: {use_structured_output})")
+    fail_fast = not args.no_fail_fast
+    print(
+        f"Using model: {args.model} "
+        f"(provider: ollama, structured_output: {use_structured_output}, "
+        f"stream: {args.stream}, fail_fast: {fail_fast})"
+    )
 
     entries = []
+    overall_started = time.monotonic()
     for python_path in files:
-        print(f"Translating {python_path.relative_to(repo_root).as_posix()}...")
+        rel_for_log = python_path.relative_to(repo_root).as_posix()
+        print(f"Translating {rel_for_log}...", flush=True)
+        per_file_started = time.monotonic()
         try:
             entry = translate_one(
                 llm,
@@ -452,6 +585,7 @@ def main():
                 repo_root,
                 args.zeeschuimer_checkout.resolve(),
                 use_structured_output,
+                args.stream,
             )
         except Exception as e:
             entry = {
@@ -459,16 +593,34 @@ def main():
                 "js_file": None,
                 "status": "failed",
                 "commentary": "",
+                "duration_seconds": round(time.monotonic() - per_file_started, 2),
                 "error": f"unexpected exception: {e}\n{traceback.format_exc()}",
             }
         entry["model"] = args.model
         entries.append(entry)
-        print(f"  -> {entry['status']}" + (f" ({entry['error']})" if entry.get("error") else ""))
-
+        dur = entry.get("duration_seconds")
+        dur_str = f" in {dur}s" if dur is not None else ""
+        err_str = f" ({entry['error']})" if entry.get("error") else ""
+        print(f"  -> {entry['status']}{dur_str}{err_str}", flush=True)
+
+        if entry["status"] == "failed" and not args.no_fail_fast:
+            remaining = len(files) - len(entries)
+            if remaining > 0:
+                print(
+                    f"\nFail-fast: aborting after first failure; skipping "
+                    f"{remaining} remaining file(s). Pass --no-fail-fast to continue past failures.",
+                    flush=True,
+                )
+            break
+
+    overall_duration = round(time.monotonic() - overall_started, 2)
     manifest = {
         "model": args.model,
         "provider": "ollama",
         "structured_output": use_structured_output,
+        "stream": args.stream,
+        "fail_fast": fail_fast,
+        "total_duration_seconds": overall_duration,
         "entries": entries,
     }
     args.output_manifest.parent.mkdir(parents=True, exist_ok=True)
@@ -477,7 +629,10 @@ def main():
     n_ok = sum(1 for e in entries if e["status"] == "ok")
     n_failed = sum(1 for e in entries if e["status"] == "failed")
     n_skipped = sum(1 for e in entries if e["status"] == "skipped")
-    print(f"\nDone with model `{args.model}`: {n_ok} ok, {n_failed} failed, {n_skipped} skipped.")
+    print(
+        f"\nDone with model `{args.model}` in {overall_duration}s: "
+        f"{n_ok} ok, {n_failed} failed, {n_skipped} skipped."
+    )
     print(f"Manifest written to {args.output_manifest}")
 
     if n_ok == 0 and n_failed > 0:

From 7594c9a254ff40c266ab76075d2d4a11aa9c3415 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 5 May 2026 17:57:08 +0200
Subject: [PATCH 03/13] no streaming here

---
 .../workflows/zeeschuimer_map_item_sync.yml   | 39 +++++++++++++------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index 203df44c8..54f6a266c 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -164,6 +164,8 @@ jobs:
           model = manifest.get("model", "(unknown)")
           provider = manifest.get("provider", "ollama")
           structured_output = manifest.get("structured_output", False)
+          stream = manifest.get("stream", False)
+          total_duration = manifest.get("total_duration_seconds")
           entries = manifest.get("entries", [])
 
           short_sha = after[:7]
@@ -171,7 +173,9 @@ jobs:
           lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). It is a **draft** because the JS was produced by an LLM and needs human review and end-to-end extension testing before merging.".format(repo, run_id))
           lines.append("")
           lines.append("## Generation parameters")
-          lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`)".format(model, provider, structured_output))
+          lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream))
+          if total_duration is not None:
+              lines.append("- **Total LLM time:** {}s".format(total_duration))
           if mode == "bootstrap":
               lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).")
           elif event_name == "workflow_dispatch":
@@ -190,8 +194,19 @@ jobs:
           lines.append("- :grey_question: {} skipped".format(len(skipped)))
           lines.append("")
 
+          if ok:
+              lines.append("| Datasource | Module | Time |")
+              lines.append("|---|---|---:|")
+              for entry in ok:
+                  dur = entry.get("duration_seconds")
+                  dur_cell = "{}s".format(dur) if dur is not None else "—"
+                  lines.append("| `{}` | `{}` | {} |".format(entry["python_file"], entry["js_file"], dur_cell))
+              lines.append("")
+
           for entry in ok:
-              lines.append("## `{}` -> `{}`".format(entry["python_file"], entry["js_file"]))
+              dur = entry.get("duration_seconds")
+              header_dur = " ({}s)".format(dur) if dur is not None else ""
+              lines.append("## `{}` -> `{}`{}".format(entry["python_file"], entry["js_file"], header_dur))
               if entry.get("commentary"):
                   lines.append("**LLM commentary:**")
                   lines.append("")
@@ -207,19 +222,21 @@ jobs:
                       diff = ""
               else:
                   diff = ""
-                  if diff.strip():
-                      lines.append("<details><summary>Python diff</summary>")
-                      lines.append("")
-                      lines.append("```diff")
-                      lines.append(diff.rstrip())
-                      lines.append("```")
-                      lines.append("</details>")
-                      lines.append("")
+              if diff.strip():
+                  lines.append("<details><summary>Python diff</summary>")
+                  lines.append("")
+                  lines.append("```diff")
+                  lines.append(diff.rstrip())
+                  lines.append("```")
+                  lines.append("</details>")
+                  lines.append("")
 
           if failed:
               lines.append("## Failures")
               for entry in failed:
-                  lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "(no error message)")))
+                  dur = entry.get("duration_seconds")
+                  dur_str = " (after {}s)".format(dur) if dur is not None else ""
+                  lines.append("- `{}`{}: {}".format(entry["python_file"], dur_str, entry.get("error", "(no error message)")))
               lines.append("")
 
           if skipped:

From 3d843c821c4e93142dd49b679b842ef943a4044c Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 14:18:04 +0200
Subject: [PATCH 04/13] use qwen; lib.js is global apparently, add some helper
 functions; add a bunch of DON'T Do's; clean up code block fences; do some
 surface lint tests

---
 helper-scripts/map_item_converter.py | 317 +++++++++++++++++++++++++--
 1 file changed, 301 insertions(+), 16 deletions(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index 75d3d2ea8..3d43a53d0 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -54,7 +54,7 @@
     "datasources/xiaohongshu_comments/search_rednote_comments.py": "modules/rednote-comments.js",
 }
 
-DEFAULT_MODEL = "gemma3:4b"
+DEFAULT_MODEL = "qwen2.5-coder:14b"
 
 IMPORTS_MARKER_START = "// === auto-generated imports for map_item — DO NOT EDIT BY HAND ==="
 IMPORTS_MARKER_END = "// === end auto-generated imports ==="
@@ -78,9 +78,12 @@
             "type": "array",
             "items": {"type": "string"},
             "description": (
-                "Complete import statements (one per array entry) that map_item needs "
-                "and that are not already present in the existing module. Empty array "
-                "if none needed."
+                "Complete ES-module import statements that map_item needs. "
+                "Normally empty: Zeeschuimer's `js/lib.js` (which provides MappedItem, "
+                "MissingMappedField, normalize_url_encoding, strip_tags) is loaded as a "
+                "plain script, NOT an ES module — its declarations are already global, "
+                "so do not write `import { X } from '../js/lib.js'`. Only populate this "
+                "if you genuinely need to import from another ES module."
             ),
         },
         "helpers_to_add": {
@@ -107,10 +110,62 @@
     "any imports it needs, any helper functions it depends on, and commentary "
     "for the human reviewer. You NEVER return the surrounding module file. "
     "You preserve the field names produced by the Python function exactly. "
-    "You do not invent fields not present in the Python output."
+    "You do not invent fields not present in the Python output. "
+    "You output raw JavaScript source — never wrap it in markdown code fences "
+    "(```js, ```javascript, etc.). The fields in your structured response are "
+    "already typed as code; fences make them invalid."
 )
 
 
+# Whitelist of helpers that Zeeschuimer exposes as globals at runtime.
+# `js/lib.js` is loaded as a plain <script>, not an ES module, so its top-level
+# declarations are global and must NOT be imported. Anything not listed here
+# must be inlined or returned via `helpers_to_add`. Keep in sync with `js/lib.js`.
+# NOTE(review): the `imports_to_add` description lists only four lib.js helpers; confirm `formatUtcTimestamp` is global too.
+AVAILABLE_JS_HELPERS = [
+    {
+        "name": "MappedItem",
+        "kind": "class",
+        "usage": "new MappedItem({field: value, ...})",
+        "note": "Wraps the return value of map_item. Always instantiate with `new`.",
+    },
+    {
+        "name": "MissingMappedField",
+        "kind": "class",
+        "usage": "new MissingMappedField(value, label)",
+        "note": "Represents a field that may legitimately be missing. Always instantiate with `new`.",
+    },
+    {
+        "name": "normalize_url_encoding",
+        "kind": "function",
+        "usage": "normalize_url_encoding(url)",
+        "note": "Direct port of the Python helper of the same name.",
+    },
+    {
+        "name": "strip_tags",
+        "kind": "function",
+        "usage": "strip_tags(html, convertNewlines = true)",
+        "note": "Direct port of the Python helper of the same name.",
+    },
+    {
+        "name": "formatUtcTimestamp",
+        "kind": "function",
+        "usage": "formatUtcTimestamp(timestamp)",
+        "note": "Formats a UTC timestamp as a readable string.",
+    }
+]
+
+
+def _format_available_helpers() -> str:
+    """Render AVAILABLE_JS_HELPERS as a markdown bullet list, one helper per line."""
+    bullet = "- `{name}` ({kind}, global) — {note} Usage: `{usage}`."
+    rendered = (
+        bullet.format(name=spec["name"], kind=spec["kind"], note=spec["note"], usage=spec["usage"])
+        for spec in AVAILABLE_JS_HELPERS
+    )
+    return "\n".join(rendered)
+
+
 def is_zeeschuimer_datasource(python_path: Path) -> bool:
     """
     Returns True if the given Python file defines a 4CAT Search subclass with
@@ -148,6 +203,7 @@ def discover_bootstrap_files(repo_root: Path) -> list[Path]:
 
 
 def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str:
+    helpers_block = _format_available_helpers()
     return (
         f"# Source Python file (datasources/{python_rel})\n"
         "This is the file that just changed in 4CAT. The `map_item` function on the "
@@ -159,16 +215,72 @@ def build_user_prompt(python_source: str, existing_module_source: str, python_re
         "passed to `map_item(item)` as `item`. Use it to understand the input shape "
         "and to match the existing code style (ES modules, `export` keyword, etc.).\n\n"
         f"```javascript\n{existing_module_source}\n```\n\n"
-        "# Task\n"
-        "Produce a JavaScript `map_item(item)` function that mirrors the Python "
-        "`map_item`. Re-implement any Python helpers it calls (e.g. `normalize_url_encoding`, "
-        "`urlparse`/`parse_qs`, `datetime` formatting) inline in JS — either inside "
-        "`map_item_function` or as separate snippets in `helpers_to_add`. If you reference "
-        "the `MappedItem` class, list its import in `imports_to_add` (it lives in `../js/lib.js`). "
-        "Use `export function map_item(item) { ... }` to match this module's style."
+        "# Available Zeeschuimer JS helpers (globals)\n"
+        "Zeeschuimer loads `js/lib.js` as a plain `<script>`, NOT as an ES "
+        "module. Its top-level declarations are global — available everywhere "
+        "without any `import`. Use them by name only. The following are the "
+        "helpers you may use; everything else must be implemented as JavaScript "
+        "(inline in `map_item_function` or as separate snippets in `helpers_to_add`).\n\n"
+        f"{helpers_block}\n\n"
+        "# Imports — almost always none\n"
+        "Do NOT write `import { MappedItem } from '../js/lib.js'` or any similar "
+        "statement for the helpers above — `lib.js` is a script, not a module, "
+        "and the import will fail at runtime. The `imports_to_add` field should "
+        "normally be EMPTY; only include an import if you genuinely need to pull "
+        "from another ES module (rare for `map_item`).\n\n"
+        "Also forbidden, because they don't exist in JavaScript:\n"
+        "- Anything from `common.lib.helpers` not listed above (e.g. `convert_to_int`, `timify`)\n"
+        "- Anything from `common.lib.exceptions`, `common.lib.user_input`, `backend.lib.*`\n"
+        "- Python stdlib modules (`datetime`, `urllib.parse`, `re`, `json`, `hashlib`, etc.) — use the JavaScript native equivalents instead.\n\n"
+        "# Python → JavaScript translation rules\n"
+        "- **Class instantiation**: JavaScript requires the `new` keyword. Python `MappedItem({...})` becomes JavaScript `new MappedItem({...})`. Same for `MissingMappedField`.\n"
+        "- **datetime**: Python `datetime.utcfromtimestamp(t)` → JS `new Date(t * 1000)`; `datetime.now()` → `new Date()`; `.strftime('%Y-%m-%d %H:%M:%S')` → manual formatting via `toISOString()` / `Date` getters.\n"
+        "- **URLs**: Python `urlparse(u)` / `parse_qs(q)` → JS `new URL(u)` / `url.searchParams`. The `URL` class auto-handles encoding.\n"
+        "- **regex**: Python `re.compile(p).search(s)` → JS `s.match(p)` or `new RegExp(p).exec(s)`. Watch out for differing flag syntax.\n"
+        "- **f-strings**: Python `f\"x {y}\"` → JS template literals `` `x ${y}` ``.\n"
+        "- **dict iteration**: Python `d.get(k, default)` → JS `d[k] ?? default` or `(d[k] !== undefined ? d[k] : default)`.\n"
+        "- **list comprehensions**: Python `[f(x) for x in xs if g(x)]` → JS `xs.filter(g).map(f)`.\n\n"
+        "# Common mistakes from past runs (the script lints for these and rejects matches)\n"
+        "- Python `dict.get(k)` / `dict.get(k, default)` does NOT exist in JavaScript. Replace EVERY `.get(...)` with `[k]` or `[k] ?? default`. Pinterest- and Instagram-style code has many of these — translate every one.\n"
+        "- Literal newlines inside string literals are a JS syntax error. Python `\"\\n\".join(xs)` becomes JS `xs.join(\"\\n\")` — keep the `\\n` as an escape sequence; do NOT put an actual newline character inside the quotes.\n"
+        "- `MappedItem` and `MissingMappedField` are CLASSES — always use `new MappedItem({...})` and `new MissingMappedField(...)`, never bare calls.\n"
+        "- `js/lib.js` is loaded as a script, NOT a module. Do NOT write `import { X } from '../js/lib.js'`. The helpers there are globals.\n"
+        "- Python keywords don't exist in JS: `None` → `null`, `True`/`False` → `true`/`false`, `def` → `function`.\n"
+        "- f-strings (`f\"x {y}\"`) don't exist in JS. Use template literals (`` `x ${y}` ``).\n\n"
+        "# Before submitting, verify your output\n"
+        "1. The function contains zero `.get(` calls.\n"
+        "2. Every `MappedItem(` and `MissingMappedField(` is preceded by `new `.\n"
+        "3. No string literal contains a raw newline character — use `\\n` escapes.\n"
+        "4. `imports_to_add` is empty unless you really need an ES-module import (it should NOT contain anything for `MappedItem` etc.).\n"
+        "5. No Python keywords (`None`, `True`, `False`, `def`, f-strings).\n\n"
+        "# Output format\n"
+        "Use `export function map_item(item) { ... }` to match this module's ES-module style. "
+        "Return raw JavaScript source — do NOT wrap fields in markdown code fences. "
+        "The `imports_to_add` field is normally an empty array (the helpers above are global, not imported). "
+        "The `helpers_to_add` field should contain full helper-function source (each entry one complete function)."
     )
 
 
+# Remove the markdown code fences that keep appearing in LLM output, even when the model is told not to use them.
+_FENCE_OPEN = re.compile(r"^```(?:(?:js|javascript|typescript|ts)\b\s*|[\w+#.-]*[ \t]*\n)?", re.IGNORECASE)
+_FENCE_CLOSE = re.compile(r"\n?```\s*$")
+
+
+def strip_code_fences(s: str) -> str:
+    """
+    Strip one leading and one trailing markdown code fence. The opening fence
+    is removed together with any language tag on its line (```js, ```jsx,
+    ```json, ...) so no tag remnant is left in the code. Surrounding
+    whitespace is trimmed and falsy input is returned as-is.
+    """
+    if not s:
+        return s
+    s = s.strip()
+    s = _FENCE_OPEN.sub("", s)
+    s = _FENCE_CLOSE.sub("", s)
+    return s.strip()
+
+# If all else fails...
 def parse_freetext_response(text: str) -> dict:
     """
     Fallback parser for when structured output is disabled or unreliable.
@@ -185,6 +297,129 @@ def parse_freetext_response(text: str) -> dict:
         "commentary": commentary,
     }
 
+# Comment stripper used by the lint checks below.
+def _strip_js_comments(s: str) -> str:
+    """Remove // and /* */ comments, leaving string/template literals (e.g. "https://…") intact."""
+    # Literals are matched first so comment markers inside them are not stripped.
+    token = r"""("(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'|`(?:[^`\\]|\\.)*`)|//[^\n]*|/\*.*?\*/"""
+    return re.sub(token, lambda m: m.group(1) or "", s, flags=re.DOTALL)
+
+
+# Regex checks for known anti-patterns the LLM produces, as (pattern, message)
+# pairs. Stay conservative: false negatives are fine, false positives block
+# valid code so they're worse (hence `searchParams.get(` is exempted below).
+LINT_PATTERNS = [
+    (
+        re.compile(r"(?<!searchParams)\.get\("),
+        "Python `dict.get()` does not exist in JavaScript. Replace every `.get(k)` with `[k]` and every `.get(k, default)` with `[k] ?? default`.",
+    ),
+    (
+        re.compile(r"^\s*from\s+\S+\s+import\b", re.MULTILINE),
+        "Python-style `from X import Y` statement found. JavaScript uses `import { Y } from 'X'` syntax (and only when really needed — Zeeschuimer helpers are globals).",
+    ),
+    (
+        re.compile(r"import\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)\s+from\s+['\"]\.\.?/js/lib\.js['\"]"),
+        "Do not import from `js/lib.js` — it is loaded as a script, its declarations are global.",
+    ),
+    (
+        re.compile(r"\bNone\b"),
+        "Python `None` is not valid JavaScript. Use `null`.",
+    ),
+    (
+        re.compile(r"\bTrue\b"),
+        "Python `True` is not valid JavaScript. Use `true` (lowercase).",
+    ),
+    (
+        re.compile(r"\bFalse\b"),
+        "Python `False` is not valid JavaScript. Use `false` (lowercase).",
+    ),
+    (
+        re.compile(r"\bdef\s+\w+\s*\("),
+        "Python `def` keyword found. Use JavaScript `function name(...)` declaration syntax.",
+    ),
+    (
+        re.compile(r"\bf\"|\bf'"),
+        "Python f-string detected (`f\"...\"` or `f'...'`). Use JavaScript template literals (`` `...${x}...` ``).",
+    ),
+]
+
+
+# Tokenizer for JS string and template literals, each matched as one unit, so
+# that single- and double-quoted literals can be checked for raw newlines (a
+# JS syntax error). Template literals may span lines; they are matched only to
+# be skipped. Escapes (\\.) are consumed as pairs so an embedded `\"` etc.
+# doesn't end the literal early.
+_JS_STRING_LITERAL = re.compile(
+    r'''
+        (?P<dq>"(?:[^"\\]|\\.)*")
+      | (?P<sq>'(?:[^'\\]|\\.)*')
+      | (?P<tl>`(?:[^`\\]|\\.)*`)
+    ''',
+    re.DOTALL | re.VERBOSE,
+)
+
+
+def _has_literal_newline_in_string(source: str) -> bool:
+    """Return True when a single- or double-quoted literal in `source` holds a raw newline."""
+    # Template literals (group `tl`) may legally span lines, so they never count.
+    return any(
+        not token.group("tl") and "\n" in token.group(0)
+        for token in _JS_STRING_LITERAL.finditer(source)
+    )
+
+
+def lint_translation(translation: dict) -> list:
+    """
+    Check the generated JS for known LLM mistakes and return the findings as a
+    list of messages, each prefixed with `[<field>]` (empty list when clean).
+    `map_item_function` and every non-empty string in `helpers_to_add` are
+    scanned, with JS comments removed first so comment prose can't trigger one.
+    """
+    targets = []
+    main_source = translation.get("map_item_function") or ""
+    if main_source:
+        targets.append(("map_item_function", main_source))
+    targets.extend(
+        (f"helpers_to_add[{idx}]", helper)
+        for idx, helper in enumerate(translation.get("helpers_to_add") or [])
+        if isinstance(helper, str) and helper
+    )
+
+    findings = []
+    for label, source in targets:
+        code = _strip_js_comments(source)
+
+        # Generic anti-patterns; each distinct message is reported once per source.
+        reported = set()
+        for regex, message in LINT_PATTERNS:
+            if message in reported or not regex.search(code):
+                continue
+            reported.add(message)
+            findings.append(f"[{label}] {message}")
+
+        # Class used without `new`: a lookbehind would need variable width, so the
+        # text before each hit is inspected; only the first offender per class is reported.
+        for cls in ("MappedItem", "MissingMappedField"):
+            for hit in re.finditer(rf"\b{cls}\s*\(", code):
+                prefix = code[max(0, hit.start() - 8) : hit.start()]
+                if re.search(r"\bnew\s+$", prefix):
+                    continue
+                findings.append(
+                    f"[{label}] `{cls}` instantiated without `new` keyword "
+                    f"(at offset {hit.start()}). All class instantiations need `new`."
+                )
+                break
+
+        # A raw newline inside a quoted string is a JS syntax error the LLM sometimes emits.
+        if _has_literal_newline_in_string(code):
+            findings.append(
+                f"[{label}] Literal newline inside a string literal — JS strings "
+                f"can't span lines without escape (`\"\\n\"`) or template literals "
+                f"(`` `\\n` ``)."
+            )
+
+    return findings
+
 
 def validate_translation(translation: dict) -> Optional[str]:
     """
@@ -203,7 +438,7 @@ def validate_translation(translation: dict) -> Optional[str]:
 
 def splice_into_module(existing: str, translation: dict, python_rel: str) -> str:
     """
-    Idempotently insert / replace the auto-generated marker blocks in the JS
+    Insert / replace the auto-generated marker blocks in the JS
     module text.
 
     Raises ValueError if exactly one of (start, end) markers is present —
@@ -382,6 +617,7 @@ def translate_one(
     zeeschuimer_root: Path,
     use_structured_output: bool,
     stream: bool,
+    strict_lint: bool,
 ) -> dict:
     """
     Translate one Python file. Returns a manifest entry dict.
@@ -394,6 +630,8 @@ def translate_one(
         "commentary": "",
         "duration_seconds": None,
         "raw_response": None,
+        "lint_warnings": [],
+        "translation": None,
         "error": None,
     }
 
@@ -462,20 +700,47 @@ def translate_one(
         entry["error"] = "no translation produced (no error raised, no dict returned)"
         return entry
 
+    # Defensive: strip stray markdown code fences from the function source and
+    # each helper. Models wrap things in ```js even when instructed not to.
+    if isinstance(translation.get("map_item_function"), str):
+        translation["map_item_function"] = strip_code_fences(translation["map_item_function"])
+    helpers = translation.get("helpers_to_add")
+    if isinstance(helpers, list):
+        translation["helpers_to_add"] = [
+            strip_code_fences(h) if isinstance(h, str) else h for h in helpers
+        ]
+
     bad = validate_translation(translation)
     if bad:
+        entry["translation"] = translation
         entry["error"] = bad
         return entry
 
+    lint_issues = lint_translation(translation)
+    if lint_issues:
+        if strict_lint:
+            entry["translation"] = translation
+            entry["lint_warnings"] = lint_issues
+            entry["error"] = "Lint issues (--strict-lint):\n  - " + "\n  - ".join(lint_issues)
+            return entry
+        # Non-strict (default): record as warnings and let the file ship. The
+        # reviewer sees the issues in the PR body and fixes them by hand.
+        entry["lint_warnings"] = lint_issues
+
     try:
         spliced = splice_into_module(existing_module, translation, rel)
     except ValueError as e:
+        entry["translation"] = translation
         entry["error"] = str(e)
         return entry
 
     js_path.write_text(spliced, encoding="utf-8")
     entry["status"] = "ok"
     entry["commentary"] = translation.get("commentary", "").strip()
+    # Keep parsed translation on warning entries so the PR / reviewer can see
+    # exactly what was emitted alongside the warnings.
+    if entry["lint_warnings"]:
+        entry["translation"] = translation
     return entry
 
 
@@ -529,6 +794,16 @@ def main():
             "wastes LLM time."
         ),
     )
+    cli.add_argument(
+        "--strict-lint",
+        action="store_true",
+        help=(
+            "Treat lint findings (Python `.get()`, missing `new`, literal newlines "
+            "in strings, etc.) as failures rather than warnings. Default is "
+            "warnings — the file still ships and the PR body surfaces the "
+            "issues so the reviewer can fix them by hand."
+        ),
+    )
     args = cli.parse_args()
 
     dmi_ollama_key = os.environ.get("DMI_OLLAMA_KEY")
@@ -569,7 +844,8 @@ def main():
     print(
         f"Using model: {args.model} "
         f"(provider: ollama, structured_output: {use_structured_output}, "
-        f"stream: {args.stream}, fail_fast: {fail_fast})"
+        f"stream: {args.stream}, fail_fast: {fail_fast}, "
+        f"strict_lint: {args.strict_lint})"
     )
 
     entries = []
@@ -586,6 +862,7 @@ def main():
                 args.zeeschuimer_checkout.resolve(),
                 use_structured_output,
                 args.stream,
+                args.strict_lint,
             )
         except Exception as e:
             entry = {
@@ -601,7 +878,12 @@ def main():
         dur = entry.get("duration_seconds")
         dur_str = f" in {dur}s" if dur is not None else ""
         err_str = f" ({entry['error']})" if entry.get("error") else ""
-        print(f"  -> {entry['status']}{dur_str}{err_str}", flush=True)
+        warn_str = (
+            f" with {len(entry['lint_warnings'])} lint warning(s)"
+            if entry.get("lint_warnings")
+            else ""
+        )
+        print(f"  -> {entry['status']}{warn_str}{dur_str}{err_str}", flush=True)
 
         if entry["status"] == "failed" and not args.no_fail_fast:
             remaining = len(files) - len(entries)
@@ -620,6 +902,7 @@ def main():
         "structured_output": use_structured_output,
         "stream": args.stream,
         "fail_fast": fail_fast,
+        "strict_lint": args.strict_lint,
         "total_duration_seconds": overall_duration,
         "entries": entries,
     }
@@ -627,11 +910,13 @@ def main():
     args.output_manifest.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
 
     n_ok = sum(1 for e in entries if e["status"] == "ok")
+    n_with_warnings = sum(1 for e in entries if e["status"] == "ok" and e.get("lint_warnings"))
     n_failed = sum(1 for e in entries if e["status"] == "failed")
     n_skipped = sum(1 for e in entries if e["status"] == "skipped")
     print(
         f"\nDone with model `{args.model}` in {overall_duration}s: "
-        f"{n_ok} ok, {n_failed} failed, {n_skipped} skipped."
+        f"{n_ok} ok ({n_with_warnings} with warnings), "
+        f"{n_failed} failed, {n_skipped} skipped."
     )
     print(f"Manifest written to {args.output_manifest}")
 

From a26cec4586bd5060dd72b914d4568a7a66e9b765 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 6 May 2026 14:18:34 +0200
Subject: [PATCH 05/13] CI: use qwen, log warnings and add to PR

---
 .../workflows/zeeschuimer_map_item_sync.yml   | 31 ++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index 54f6a266c..9981cc411 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -33,7 +33,7 @@ on:
         type: string
         default: ''
       model:
-        description: 'Override LLM model on DMI Ollama (default: gemma3:4b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
+        description: 'Override LLM model on DMI Ollama (default: qwen2.5-coder:14b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
         type: string
         default: ''
 
@@ -122,7 +122,7 @@ jobs:
         if: steps.changed.outputs.mode != 'none'
         env:
           DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }}
-          LLM_MODEL: ${{ inputs.model || 'gemma3:4b' }}
+          LLM_MODEL: ${{ inputs.model || 'qwen2.5-coder:14b' }}
         run: |
           if [ "${{ steps.changed.outputs.mode }}" = "bootstrap" ]; then
             python helper-scripts/map_item_converter.py \
@@ -170,7 +170,7 @@ jobs:
 
           short_sha = after[:7]
           lines = []
-          lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). It is a **draft** because the JS was produced by an LLM and needs human review and end-to-end extension testing before merging.".format(repo, run_id))
+          lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). The JavaScript was produced by an LLM and **requires human review** before merging — including manual fixes for any lint warnings flagged below.".format(repo, run_id))
           lines.append("")
           lines.append("## Generation parameters")
           lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream))
@@ -185,28 +185,45 @@ jobs:
           lines.append("")
 
           ok = [e for e in entries if e["status"] == "ok"]
+          ok_with_warnings = [e for e in ok if e.get("lint_warnings")]
           failed = [e for e in entries if e["status"] == "failed"]
           skipped = [e for e in entries if e["status"] == "skipped"]
 
           lines.append("## Summary")
           lines.append("- :white_check_mark: {} translated".format(len(ok)))
+          if ok_with_warnings:
+              lines.append("- :warning: {} translated with lint warnings (require manual fix)".format(len(ok_with_warnings)))
           lines.append("- :x: {} failed".format(len(failed)))
           lines.append("- :grey_question: {} skipped".format(len(skipped)))
           lines.append("")
 
           if ok:
-              lines.append("| Datasource | Module | Time |")
-              lines.append("|---|---|---:|")
+              lines.append("| Datasource | Module | Time | Warnings |")
+              lines.append("|---|---|---:|---:|")
               for entry in ok:
                   dur = entry.get("duration_seconds")
                   dur_cell = "{}s".format(dur) if dur is not None else "—"
-                  lines.append("| `{}` | `{}` | {} |".format(entry["python_file"], entry["js_file"], dur_cell))
+                  warn_count = len(entry.get("lint_warnings") or [])
+                  warn_cell = ":warning: {}".format(warn_count) if warn_count else "—"
+                  lines.append("| `{}` | `{}` | {} | {} |".format(entry["python_file"], entry["js_file"], dur_cell, warn_cell))
               lines.append("")
 
+          if ok_with_warnings:
+              lines.append("## :warning: Lint warnings — fix before merging")
+              lines.append("")
+              lines.append("The following datasources translated successfully but the static lint flagged issues that need human fixes. The auto-generated code was spliced into the JS module as-is; please patch the file directly in this PR.")
+              lines.append("")
+              for entry in ok_with_warnings:
+                  lines.append("**`{}` -> `{}`**".format(entry["python_file"], entry["js_file"]))
+                  for w in entry["lint_warnings"]:
+                      lines.append("- {}".format(w))
+                  lines.append("")
+
           for entry in ok:
               dur = entry.get("duration_seconds")
               header_dur = " ({}s)".format(dur) if dur is not None else ""
-              lines.append("## `{}` -> `{}`{}".format(entry["python_file"], entry["js_file"], header_dur))
+              warn_marker = " :warning:" if entry.get("lint_warnings") else ""
+              lines.append("## `{}` -> `{}`{}{}".format(entry["python_file"], entry["js_file"], header_dur, warn_marker))
               if entry.get("commentary"):
                   lines.append("**LLM commentary:**")
                   lines.append("")

From 2e3ecfe2e253f3d5ebc19c2f2cea6b5eb5f80434 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 14:22:33 +0200
Subject: [PATCH 06/13] clear up AI generated block markers note

---
 helper-scripts/map_item_converter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index 3d43a53d0..397e11fd1 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -56,9 +56,9 @@
 
 DEFAULT_MODEL = "qwen2.5-coder:14b"
 
-IMPORTS_MARKER_START = "// === auto-generated imports for map_item — DO NOT EDIT BY HAND ==="
+IMPORTS_MARKER_START = "// === auto-generated imports for map_item — BLOCK REPLACED AUTOMATICALLY ==="
 IMPORTS_MARKER_END = "// === end auto-generated imports ==="
-BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ==="
+BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ==="
 BLOCK_MARKER_END = "// === end auto-generated ==="
 
 LLM_SCHEMA = {

From 4e3487cd3c22cb1f4b5df635ee4f9871f02161d3 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 14:32:17 +0200
Subject: [PATCH 07/13] one PR per module

---
 .../workflows/zeeschuimer_map_item_sync.yml   | 190 +++++++++++++-----
 1 file changed, 139 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index 9981cc411..73fbd8bd6 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -1,10 +1,14 @@
 # Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
-# and open a draft PR against digitalmethodsinitiative/zeeschuimer.
+# and open a draft PR per module against digitalmethodsinitiative/zeeschuimer.
 #
 # Triggers on pushes to master that touch any Zeeschuimer datasource (or the
 # helper script itself). Also exposes a `workflow_dispatch` trigger with a
 # `bootstrap` input for the initial run that translates all 15 datasources at
-# once.
+# once (single PR).
+#
+# Architecture: a `detect` job groups changed files by module and emits a
+# matrix; a `sync` job fans out one parallel run per module, each opening
+# (or updating) its own PR on a stable per-module branch.
 #
 # Required secrets (configured in repo Settings -> Secrets and variables -> Actions):
 #   DMI_OLLAMA_KEY               - API key for https://ollama.digitalmethods.net (already used by helper script)
@@ -25,11 +29,11 @@ on:
   workflow_dispatch:
     inputs:
       bootstrap:
-        description: 'Translate every Zeeschuimer datasource (initial sync). Ignored if "files" is set.'
+        description: 'Translate every Zeeschuimer datasource (initial sync, single PR). Ignored if "files" is set.'
         type: boolean
         default: false
       files:
-        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap.'
+        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap. One PR per module.'
         type: string
         default: ''
       model:
@@ -38,53 +42,115 @@ on:
         default: ''
 
 jobs:
-  sync:
-    name: Translate map_item and open Zeeschuimer PR
+  detect:
+    name: Detect modules to translate
     runs-on: ubuntu-latest
+    outputs:
+      mode: ${{ steps.plan.outputs.mode }}
+      matrix: ${{ steps.plan.outputs.matrix }}
     steps:
       - name: Checkout 4CAT
         uses: actions/checkout@v4
         with:
           fetch-depth: 2
 
-      - name: Determine changed datasource files
-        id: changed
+      - name: Plan translation matrix
+        id: plan
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          INPUTS_FILES: ${{ inputs.files }}
+          INPUTS_BOOTSTRAP: ${{ inputs.bootstrap }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
         run: |
-          # Manual run with explicit files takes precedence
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.files }}" ]; then
-            echo "mode=files" >> "$GITHUB_OUTPUT"
-            echo "files=${{ inputs.files }}" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
-          if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.bootstrap }}" = "true" ]; then
-            echo "mode=bootstrap" >> "$GITHUB_OUTPUT"
-            echo "files=" >> "$GITHUB_OUTPUT"
-            exit 0
-          fi
-          changed=$(git diff --name-only "${{ github.event.before }}" "${{ github.sha }}" -- 'datasources/*/search_*.py' || true)
-          if [ -z "$changed" ]; then
-            echo "mode=none" >> "$GITHUB_OUTPUT"
-            echo "files=" >> "$GITHUB_OUTPUT"
-          else
-            echo "mode=files" >> "$GITHUB_OUTPUT"
-            files=$(echo "$changed" | tr '\n' ' ')
-            echo "files=$files" >> "$GITHUB_OUTPUT"
-          fi
+          python - <<'EOF'
+          import json
+          import os
+          import subprocess
 
-      - name: Exit early if nothing to do
-        if: steps.changed.outputs.mode == 'none'
-        run: |
-          echo "No Zeeschuimer datasource files changed; nothing to translate."
-          exit 0
+          event_name = os.environ["EVENT_NAME"]
+          inputs_files = os.environ.get("INPUTS_FILES", "").strip()
+          inputs_bootstrap = os.environ.get("INPUTS_BOOTSTRAP", "").lower() == "true"
+          before = os.environ.get("BEFORE_SHA", "")
+          after = os.environ.get("AFTER_SHA", "")
+          out_path = os.environ["GITHUB_OUTPUT"]
+
+          def emit(mode, matrix):
+              with open(out_path, "a", encoding="utf-8") as f:
+                  f.write("mode={}\n".format(mode))
+                  f.write("matrix={}\n".format(json.dumps(matrix)))
+
+          # Bootstrap is special: one PR for all datasources.
+          # Explicit `files` input overrides bootstrap; honor that.
+          if event_name == "workflow_dispatch" and inputs_bootstrap and not inputs_files:
+              emit("bootstrap", [{"module": "bootstrap", "files": "", "bootstrap": True}])
+              print("Plan: bootstrap (single PR)")
+              raise SystemExit(0)
+
+          # Resolve the file list to translate.
+          if event_name == "workflow_dispatch" and inputs_files:
+              files = inputs_files.split()
+          else:
+              # push event: diff datasource files between before and after.
+              try:
+                  out = subprocess.check_output(
+                      ["git", "diff", "--name-only", before, after, "--",
+                       "datasources/*/search_*.py"],
+                      text=True,
+                  )
+              except subprocess.CalledProcessError:
+                  out = ""
+              files = [f for f in out.splitlines() if f.strip()]
+
+          # Group by module: datasources/<module>/search_*.py
+          modules = {}
+          for path in files:
+              parts = path.split("/")
+              if len(parts) >= 3 and parts[0] == "datasources":
+                  modules.setdefault(parts[1], []).append(path)
+
+          if not modules:
+              emit("none", [])
+              print("Plan: nothing to translate")
+              raise SystemExit(0)
+
+          matrix = [
+              {"module": mod, "files": " ".join(sorted(paths)), "bootstrap": False}
+              for mod, paths in sorted(modules.items())
+          ]
+          emit("files", matrix)
+          print("Plan: {} module(s)".format(len(matrix)))
+          for entry in matrix:
+              print("  - {}: {}".format(entry["module"], entry["files"]))
+          EOF
+
+  sync:
+    name: Sync ${{ matrix.target.module }}
+    needs: detect
+    if: needs.detect.outputs.mode != 'none'
+    runs-on: ubuntu-latest
+    # Per-module concurrency: a newer push to master supersedes any in-flight
+    # sync for the same module (LLM run gets cancelled, latest run wins).
+    # Each matrix instance gets its own group, so different modules don't block.
+    concurrency:
+      group: zeeschuimer-sync-${{ matrix.target.module }}
+      cancel-in-progress: true
+    strategy:
+      fail-fast: false
+      matrix:
+        target: ${{ fromJson(needs.detect.outputs.matrix) }}
+    steps:
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
 
       - name: Set up Python
-        if: steps.changed.outputs.mode != 'none'
         uses: actions/setup-python@v5
         with:
           python-version: "3.11"
 
       - name: Install LLM dependencies
-        if: steps.changed.outputs.mode != 'none'
         run: |
           # LLMAdapter (common/lib/llm.py) imports every provider's langchain
           # package at module load, so all of these are required even though
@@ -101,7 +167,6 @@ jobs:
             requests
 
       - name: Mint Zeeschuimer App token
-        if: steps.changed.outputs.mode != 'none'
         id: app_token
         uses: actions/create-github-app-token@v1
         with:
@@ -111,7 +176,6 @@ jobs:
           repositories: zeeschuimer
 
       - name: Checkout Zeeschuimer
-        if: steps.changed.outputs.mode != 'none'
         uses: actions/checkout@v4
         with:
           repository: digitalmethodsinitiative/zeeschuimer
@@ -119,28 +183,29 @@ jobs:
           token: ${{ steps.app_token.outputs.token }}
 
       - name: Run translation
-        if: steps.changed.outputs.mode != 'none'
         env:
           DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }}
           LLM_MODEL: ${{ inputs.model || 'qwen2.5-coder:14b' }}
+          # Via env, not inline expression expansion, so a crafted file name cannot inject shell; expanded unquoted below to word-split the list.
+          TARGET_FILES: ${{ matrix.target.files }}
         run: |
-          if [ "${{ steps.changed.outputs.mode }}" = "bootstrap" ]; then
+          if [ "${{ matrix.target.bootstrap }}" = "true" ]; then
             python helper-scripts/map_item_converter.py \
               --bootstrap \
               --zeeschuimer-checkout ./zeeschuimer-checkout \
               --output-manifest ./manifest.json
           else
             python helper-scripts/map_item_converter.py \
-              --files ${{ steps.changed.outputs.files }} \
+              --files $TARGET_FILES \
               --zeeschuimer-checkout ./zeeschuimer-checkout \
               --output-manifest ./manifest.json
           fi
 
       - name: Build PR body
-        if: steps.changed.outputs.mode != 'none'
         id: pr_body
         env:
-          MODE: ${{ steps.changed.outputs.mode }}
+          MODULE: ${{ matrix.target.module }}
+          BOOTSTRAP: ${{ matrix.target.bootstrap }}
           BEFORE_SHA: ${{ github.event.before }}
           AFTER_SHA: ${{ github.sha }}
           RUN_ID: ${{ github.run_id }}
@@ -151,7 +214,8 @@ jobs:
           import os
           import subprocess
 
-          mode = os.environ["MODE"]
+          module = os.environ["MODULE"]
+          is_bootstrap = os.environ.get("BOOTSTRAP", "").lower() == "true"
           before = os.environ["BEFORE_SHA"]
           after = os.environ["AFTER_SHA"]
           run_id = os.environ["RUN_ID"]
@@ -176,12 +240,12 @@ jobs:
           lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream))
           if total_duration is not None:
               lines.append("- **Total LLM time:** {}s".format(total_duration))
-          if mode == "bootstrap":
+          if is_bootstrap:
               lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).")
           elif event_name == "workflow_dispatch":
-              lines.append("- **Trigger:** manual `workflow_dispatch` with explicit file list.")
+              lines.append("- **Trigger:** manual `workflow_dispatch` for `{}`.".format(module))
           else:
-              lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master.".format(short_sha, repo, after))
+              lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master (module: `{}`).".format(short_sha, repo, after, module))
           lines.append("")
 
           ok = [e for e in entries if e["status"] == "ok"]
@@ -266,10 +330,32 @@ jobs:
           with open("pr_body.md", "w", encoding="utf-8") as f:
               f.write(body)
           print("Wrote pr_body.md ({} chars)".format(len(body)))
+
+          # Title is single-module in the matrix path; bootstrap is its own
+          # special-case (one PR covering every datasource that was translated).
+          ok_modules = []
+          for entry in ok:
+              parts = entry["python_file"].split("/")
+              if len(parts) >= 2 and parts[0] == "datasources":
+                  mod = parts[1]
+                  if mod not in ok_modules:
+                      ok_modules.append(mod)
+
+          if is_bootstrap:
+              title = "Auto-translated map_item updates from 4CAT (bootstrap, {} datasources)".format(len(ok_modules))
+          elif not ok_modules:
+              title = "Auto-translated map_item updates from 4CAT: {}".format(module)
+          else:
+              title = "Auto-translated map_item updates from 4CAT: {}".format(", ".join(ok_modules))
+
+          github_output = os.environ.get("GITHUB_OUTPUT")
+          if github_output:
+              with open(github_output, "a", encoding="utf-8") as f:
+                  f.write("title={}\n".format(title))
+          print("PR title: {}".format(title))
           EOF
 
       - name: Check there are JS changes to PR
-        if: steps.changed.outputs.mode != 'none'
         id: have_changes
         working-directory: zeeschuimer-checkout
         run: |
@@ -281,13 +367,15 @@ jobs:
           fi
 
       - name: Open or update Zeeschuimer PR
-        if: steps.changed.outputs.mode != 'none' && steps.have_changes.outputs.has_changes == 'true'
+        if: steps.have_changes.outputs.has_changes == 'true'
         uses: peter-evans/create-pull-request@v6
         with:
           path: zeeschuimer-checkout
           token: ${{ steps.app_token.outputs.token }}
-          branch: auto/4cat-map-item-sync
-          title: "Auto-translated map_item updates from 4CAT @ ${{ github.sha }}"
-          commit-message: "chore: sync map_item from 4CAT ${{ github.sha }}"
+          # Stable per-module branch: a fresh push that retranslates the same
+          # module updates the same PR. Different modules never share a branch.
+          branch: auto/4cat-map-item-sync-${{ matrix.target.module }}
+          title: ${{ steps.pr_body.outputs.title }}
+          commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}"
           body-path: pr_body.md
           draft: true

From 1dae693850788063ef92790cc6e805c19fa85894 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 15:59:50 +0200
Subject: [PATCH 08/13] make platforms dynamic

---
 helper-scripts/map_item_converter.py | 73 +++++++++++++++++-----------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index 397e11fd1..c69e7040a 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -34,25 +34,33 @@
 from common.lib.llm import LLMAdapter
 
 
-# 4CAT datasource path -> Zeeschuimer module path (relative to checkout root).
-# Verified against https://github.com/digitalmethodsinitiative/zeeschuimer/tree/master/modules
-# Note: facebook has no Zeeschuimer module today, so it's intentionally absent.
-PLATFORM_MAP = {
-    "datasources/douyin/search_douyin.py": "modules/douyin.js",
-    "datasources/gab/search_gab.py": "modules/gab.js",
-    "datasources/imgur/search_imgur.py": "modules/imgur.js",
-    "datasources/instagram/search_instagram.py": "modules/instagram.js",
-    "datasources/linkedin/search_linkedin.py": "modules/linkedin.js",
-    "datasources/ninegag/search_9gag.py": "modules/9gag.js",
-    "datasources/pinterest/search_pinterest.py": "modules/pinterest.js",
-    "datasources/threads/search_threads.py": "modules/threads.js",
-    "datasources/tiktok/search_tiktok.py": "modules/tiktok.js",
-    "datasources/tiktok_comments/search_tiktok_comments.py": "modules/tiktok-comments.js",
-    "datasources/truth/search_truth.py": "modules/truth.js",
-    "datasources/twitter-import/search_twitter.py": "modules/twitter.js",
-    "datasources/xiaohongshu/search_rednote.py": "modules/rednote.js",
-    "datasources/xiaohongshu_comments/search_rednote_comments.py": "modules/rednote-comments.js",
-}
+# 4CAT datasource path -> Zeeschuimer module path is derived by convention
+# (see `python_to_js_module` below) instead of a hand-maintained table.
+#
+# Datasources without a matching Zeeschuimer module (today: facebook) are skipped
+# automatically — the JS file existence check in `translate_one` handles them
+# without any explicit allow-list. New Zeeschuimer datasources added to 4CAT are
+# picked up automatically as long as Zeeschuimer ships the matching `modules/<name>.js`.
+def python_to_js_module(python_rel: str) -> Optional[str]:
+    """
+    Derive the Zeeschuimer module path for a 4CAT datasource Python file.
+
+    `datasources/<dir>/search_<name>.py` maps to `modules/<name>.js`, with
+    underscores in `<name>` turned into hyphens. `<dir>` is ignored, so e.g.
+    `xiaohongshu/search_rednote.py` -> `modules/rednote.js` and
+    `twitter-import/search_twitter.py` -> `modules/twitter.js` still work.
+    Returns None if the path doesn't follow `datasources/<dir>/search_*.py`.
+    """
+    parts = python_rel.split("/")  # expects "/"-separated (POSIX-style) paths
+    if len(parts) != 3 or parts[0] != "datasources":  # exactly datasources/<dir>/<file>
+        return None
+    filename = parts[2]
+    if not filename.startswith("search_") or not filename.endswith(".py"):
+        return None
+    base = filename[len("search_"):-len(".py")]  # drop the prefix and the extension
+    if not base:  # a bare "search_.py" leaves no module name
+        return None
+    return f"modules/{base.replace('_', '-')}.js"
 
 DEFAULT_MODEL = "qwen2.5-coder:14b"
 
@@ -189,17 +197,24 @@ def is_zeeschuimer_datasource(python_path: Path) -> bool:
     return False
 
 
-def discover_bootstrap_files(repo_root: Path) -> list[Path]:
+def discover_bootstrap_files(repo_root: Path, zeeschuimer_root: Path) -> list[Path]:
     """
-    Find every datasource file in PLATFORM_MAP whose class is a Zeeschuimer
-    datasource. Returns absolute paths.
+    Find every Python datasource that has a matching Zeeschuimer module.
+    Scans `datasources/*/search_*.py`, keeping only Zeeschuimer datasources
+    whose derived JS module exists in the checkout (so e.g. facebook, which
+    4CAT supports but Zeeschuimer does not, is silently dropped).
     """
     found = []
-    for rel in PLATFORM_MAP:
-        path = repo_root / rel
-        if path.exists() and is_zeeschuimer_datasource(path):
+    for path in sorted((repo_root / "datasources").glob("*/search_*.py")):
+        if not is_zeeschuimer_datasource(path):
+            continue
+        rel = path.relative_to(repo_root).as_posix()
+        js_rel = python_to_js_module(rel)
+        if not js_rel:
+            continue
+        if (zeeschuimer_root / js_rel).exists():
             found.append(path)
-    return sorted(found)
+    return found
 
 
 def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str:
@@ -640,10 +655,10 @@ def translate_one(
         entry["error"] = "not a Zeeschuimer datasource (is_from_zeeschuimer != True)"
         return entry
 
-    js_rel = PLATFORM_MAP.get(rel)
+    js_rel = python_to_js_module(rel)
     if not js_rel:
         entry["status"] = "skipped"
-        entry["error"] = f"no Zeeschuimer module mapped for {rel}"
+        entry["error"] = f"could not derive Zeeschuimer module path from {rel} (expected `datasources/<dir>/search_*.py` form)"
         return entry
     entry["js_file"] = js_rel
 
@@ -813,7 +828,7 @@ def main():
     repo_root = Path(__file__).resolve().parent.parent
 
     if args.bootstrap:
-        files = discover_bootstrap_files(repo_root)
+        files = discover_bootstrap_files(repo_root, args.zeeschuimer_checkout.resolve())
         if not files:
             sys.exit("No Zeeschuimer datasources found to bootstrap.")
     else:

From e80c18d82518c7c213e8fe057a5336d0c6c03820 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 16:00:16 +0200
Subject: [PATCH 09/13] add MapItemException to shared helpers

---
 helper-scripts/map_item_converter.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index c69e7040a..034790a1a 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -160,7 +160,18 @@ def python_to_js_module(python_rel: str) -> Optional[str]:
         "kind": "function",
         "usage": "formatUtcTimestamp(timestamp)",
         "note": "Formats a UTC timestamp as a readable string.",
-    }
+    },
+    {
+        "name": "MapItemException",
+        "kind": "class",
+        "usage": "throw new MapItemException(message)",
+        "note": (
+            "Mirror of 4CAT's `MapItemException`. Throw from `map_item` to "
+            "signal a known mapping failure (e.g. unrecognized item shape); "
+            "callers catch it, skip the item, and warn that the platform's "
+            "format may have shifted. Always instantiate with `new`."
+        ),
+    },
 ]
 
 

From 6581f8aa92f8be1e54419004ccd8d3597c8b4a56 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 7 May 2026 17:13:46 +0200
Subject: [PATCH 10/13] Update rules on regex and notify reviewers

---
 helper-scripts/map_item_converter.py | 60 +++++++++++++++++++++++++---
 1 file changed, 55 insertions(+), 5 deletions(-)

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index 034790a1a..9c951d7d4 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -269,16 +269,21 @@ def build_user_prompt(python_source: str, existing_module_source: str, python_re
         "# Common mistakes from past runs (the script lints for these and rejects matches)\n"
         "- Python `dict.get(k)` / `dict.get(k, default)` does NOT exist in JavaScript. Replace EVERY `.get(...)` with `[k]` or `[k] ?? default`. Pinterest- and Instagram-style code has many of these — translate every one.\n"
         "- Literal newlines inside string literals are a JS syntax error. Python `\"\\n\".join(xs)` becomes JS `xs.join(\"\\n\")` — keep the `\\n` as an escape sequence; do NOT put an actual newline character inside the quotes.\n"
-        "- `MappedItem` and `MissingMappedField` are CLASSES — always use `new MappedItem({...})` and `new MissingMappedField(...)`, never bare calls.\n"
+        "- Regex literals `/.../` cannot span multiple lines. If the Python regex source contains a literal newline (e.g. inside a character class), encode it as `\\n` in the JS regex — never paste a raw newline into the `/.../` body.\n"
+        "- `MappedItem`, `MissingMappedField`, and `MapItemException` are CLASSES — always use `new MappedItem({...})`, `new MissingMappedField(...)`, `throw new MapItemException(...)`. Never call them bare.\n"
+        "- Python `'x' in some_string` is a SUBSTRING check; the JS `in` operator does NOT do this — on a string it throws TypeError. Use `someString.includes('x')` for substring tests. The JS `in` operator is only for object property names (`'key' in obj`).\n"
+        "- Empty containers are TRUTHY in JavaScript but FALSY in Python. After `const user = node.user ?? {}`, the variable is always truthy — `if (user)` is always true. Either guard on the original nullable BEFORE defaulting (`if (node.user) {...}`) or check `Object.keys(user).length` / `arr.length`. Same trap for `[]`.\n"
         "- `js/lib.js` is loaded as a script, NOT a module. Do NOT write `import { X } from '../js/lib.js'`. The helpers there are globals.\n"
         "- Python keywords don't exist in JS: `None` → `null`, `True`/`False` → `true`/`false`, `def` → `function`.\n"
         "- f-strings (`f\"x {y}\"`) don't exist in JS. Use template literals (`` `x ${y}` ``).\n\n"
         "# Before submitting, verify your output\n"
         "1. The function contains zero `.get(` calls.\n"
-        "2. Every `MappedItem(` and `MissingMappedField(` is preceded by `new `.\n"
-        "3. No string literal contains a raw newline character — use `\\n` escapes.\n"
+        "2. Every `MappedItem(`, `MissingMappedField(`, and `MapItemException(` is preceded by `new `.\n"
+        "3. No string literal or regex literal contains a raw newline character — use `\\n` escapes.\n"
         "4. `imports_to_add` is empty unless you really need an ES-module import (it should NOT contain anything for `MappedItem` etc.).\n"
-        "5. No Python keywords (`None`, `True`, `False`, `def`, f-strings).\n\n"
+        "5. No Python keywords (`None`, `True`, `False`, `def`, f-strings).\n"
+        "6. No `'literal' in someStringExpression` — those are substring checks; rewrite as `.includes(...)`.\n"
+        "7. No `if (x)` guards where `x` was defaulted to `{}` or `[]` — those are always-true in JS. Guard on the pre-default value or check `.length` / `Object.keys(...).length`.\n\n"
         "# Output format\n"
         "Use `export function map_item(item) { ... }` to match this module's ES-module style. "
         "Return raw JavaScript source — do NOT wrap fields in markdown code fences. "
@@ -367,6 +372,19 @@ def _strip_js_comments(s: str) -> str:
         re.compile(r"\bf\"|\bf'"),
         "Python f-string detected (`f\"...\"` or `f'...'`). Use JavaScript template literals (`` `...${x}...` ``).",
     ),
+    (
+        # Python-style substring test: 'lit' in expr.someStringMethod(...).
+        # The JS `in` operator only works on objects (checking property names);
+        # on a string it throws TypeError. Catch the obvious cases where the
+        # right-hand side ends in a method that's known to return a string.
+        re.compile(
+            r"""['"][^'"]*['"]\s+in\s+[\w.\[\]]+\.(?:"""
+            r"""toLowerCase|toUpperCase|toString|trim|trimStart|trimEnd|"""
+            r"""slice|substring|substr|concat|charAt|normalize|repeat|"""
+            r"""padStart|padEnd|replace|replaceAll)\s*\("""
+        ),
+        "Python-style substring check (`'x' in someString`) detected. The JS `in` operator only checks object property names and throws TypeError on a string. Use `someString.includes('x')` instead.",
+    ),
 ]
 
 
@@ -394,6 +412,26 @@ def _has_literal_newline_in_string(source: str) -> bool:
     return False
 
 
+# Heuristic regex-use openers: a regex literal passed straight to a method
+# (`.match(/`, `.replace(/`, ...) or any `new RegExp(` call, literal or string
+# argument. Literals are only matched after `(` because `/` is otherwise hard to
+# tell from division. False negatives are fine; false positives block valid code.
+_REGEX_LITERAL_OPENER = re.compile(
+    r"\.(?:match|replace|replaceAll|split|search|matchAll|test|exec)\s*\(\s*/"
+    r"|\bnew\s+RegExp\s*\("
+)
+
+
+def _uses_regex(source: str) -> bool:
+    """
+    Return True when `source` uses a regex in a recognized context: a regex
+    literal as first call argument (`.match(/.../)`) or any `new RegExp(...)`.
+    Current models translate regex unreliably, so this is a blanket "needs
+    human review" flag, not a bug detector; the reviewer verifies behavior.
+    """
+    return bool(_REGEX_LITERAL_OPENER.search(source))
+
+
 def lint_translation(translation: dict) -> list:
     """
     Scan the generated JS for known bugs. Returns a list of error strings
@@ -421,7 +459,7 @@ def lint_translation(translation: dict) -> list:
 
         # Class instantiation without `new` (variable-width lookbehind, so
         # check the chars before each match manually).
-        for cls in ("MappedItem", "MissingMappedField"):
+        for cls in ("MappedItem", "MissingMappedField", "MapItemException"):
             pattern = re.compile(rf"\b{cls}\s*\(")
             reported = False
             for m in pattern.finditer(clean):
@@ -444,6 +482,18 @@ def lint_translation(translation: dict) -> list:
                 f"(`` `\\n` ``)."
             )
 
+        # Regex translation is unreliable on the current model — flag any
+        # regex use for manual reviewer verification rather than trying to
+        # detect specific failure modes (literal newlines, dropped escapes,
+        # flag-syntax differences, character-class drift). The reviewer is
+        # the source of truth here until we can upgrade the model.
+        if _uses_regex(clean):
+            issues.append(
+                f"[{label}] Regex detected. The current LLM translates regex "
+                f"unreliably (escapes, character classes, flags) — please verify "
+                f"the regex behavior against the Python original by hand."
+            )
+
     return issues
 
 

From ccf66aeb64227d9428da7b14f1ee3451c712df68 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 12 May 2026 11:20:05 +0200
Subject: [PATCH 11/13] =?UTF-8?q?=F0=9F=A4=A8=20fix=20merge...=20again=3F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/zeeschuimer_map_item_sync.yml | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index d01d3f16d..73fbd8bd6 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -1,4 +1,3 @@
-<<<<<<< zeeschuimer_auto_map
 # Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
 # and open a draft PR per module against digitalmethodsinitiative/zeeschuimer.
 #
@@ -17,15 +16,10 @@
 #                                  digitalmethodsinitiative/zeeschuimer with permissions
 #                                  contents:write + pull-requests:write (and nothing else)
 #   ZEESCHUIMER_APP_PRIVATE_KEY  - full PEM private key for that App (including BEGIN/END lines)
-=======
-# Bootstrap the Zeeschuimer map_item sync workflow
-# This is necessary to test workflow in PR (so far as I can tell)
->>>>>>> master
 
 name: Sync Zeeschuimer map_item from 4CAT
 
 on:
-<<<<<<< zeeschuimer_auto_map
   push:
     branches: [master]
     paths:
@@ -385,13 +379,3 @@ jobs:
           commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}"
           body-path: pr_body.md
           draft: true
-=======
-  workflow_dispatch:
-
-jobs:
-  sync-map-item:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Placeholder
-        run: echo "Workflow scaffold is valid."
->>>>>>> master

From b67c397fad74f36e7725e65d3192896eb958674a Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Wed, 27 May 2026 18:59:53 +0200
Subject: [PATCH 12/13] add extensions symlink to .gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8850a7bcc..76dd51700 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,8 @@ webtool/venv/
 *.ipynb
 venv/
 __pycache__/
+.claude/
+extensions
 
 # do not ignore interface images
 !webtool/static/img/*.png

From b4235f62fbe65f849126f29127196734926fb0e0 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Thu, 28 May 2026 14:16:26 +0200
Subject: [PATCH 13/13] streamline translation helpers for prompt and linting

---
 helper-scripts/map_item_converter.py | 117 ++++------
 helper-scripts/map_item_rules.py     | 323 +++++++++++++++++++++++++++
 2 files changed, 365 insertions(+), 75 deletions(-)
 create mode 100644 helper-scripts/map_item_rules.py

diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
index 9c951d7d4..6acda73e1 100644
--- a/helper-scripts/map_item_converter.py
+++ b/helper-scripts/map_item_converter.py
@@ -33,6 +33,10 @@
 
 from common.lib.llm import LLMAdapter
 
+# Sibling module — lives next to this script in helper-scripts/. Python adds the
+# script's directory to sys.path automatically when the file is run directly.
+from map_item_rules import RULES, get_regex_lint_rules
+
 
 # 4CAT datasource path -> Zeeschuimer module path is derived by convention:
 # the Python file is `datasources/<dir>/search_<name>.py`; the JS module is
@@ -185,6 +189,32 @@ def _format_available_helpers() -> str:
     return "\n".join(lines)
 
 
+def _format_past_errors(rules) -> str:
+    """
+    Build the bulleted "things to get right" section of the prompt.
+
+    Every rule yields a `- **id** — prompt_rule` bullet followed by optional
+    `Wrong:` / `Right:` examples: one-line snippets are wrapped in backticks,
+    multi-line snippets go on their own lines under the label, indented deeper.
+    """
+    rendered = []
+    for rule in rules:
+        rendered.append(f"- **{rule.id}** — {rule.prompt_rule}")
+        examples = {"Wrong": rule.bad, "Right": rule.good}
+        for label, snippet in examples.items():
+            if snippet and "\n" in snippet:
+                body = [f"        {row}" for row in snippet.split("\n")]
+                rendered += [f"    {label}:", *body]
+            elif snippet:
+                rendered.append(f"    {label}: `{snippet}`")
+    return "\n".join(rendered)
+
+
+def _format_verification_checklist(rules) -> str:
+    checks = (rule.verify for rule in rules if rule.verify)  # rules without a checklist item are skipped
+    return "\n".join(f"{n}. {text}" for n, text in enumerate(checks, start=1))
+
+
 def is_zeeschuimer_datasource(python_path: Path) -> bool:
     """
     Returns True if the given Python file defines a 4CAT Search subclass with
@@ -230,6 +260,8 @@ def discover_bootstrap_files(repo_root: Path, zeeschuimer_root: Path) -> list[Pa
 
 def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str:
     helpers_block = _format_available_helpers()
+    past_errors_block = _format_past_errors(RULES)
+    verification_block = _format_verification_checklist(RULES)
     return (
         f"# Source Python file (datasources/{python_rel})\n"
         "This is the file that just changed in 4CAT. The `map_item` function on the "
@@ -258,32 +290,12 @@ def build_user_prompt(python_source: str, existing_module_source: str, python_re
         "- Anything from `common.lib.helpers` not listed above (e.g. `convert_to_int`, `timify`)\n"
         "- Anything from `common.lib.exceptions`, `common.lib.user_input`, `backend.lib.*`\n"
         "- Python stdlib modules (`datetime`, `urllib.parse`, `re`, `json`, `hashlib`, etc.) — use the JavaScript native equivalents instead.\n\n"
-        "# Python → JavaScript translation rules\n"
-        "- **Class instantiation**: JavaScript requires the `new` keyword. Python `MappedItem({...})` becomes JavaScript `new MappedItem({...})`. Same for `MissingMappedField`.\n"
-        "- **datetime**: Python `datetime.utcfromtimestamp(t)` → JS `new Date(t * 1000)`; `datetime.now()` → `new Date()`; `.strftime('%Y-%m-%d %H:%M:%S')` → manual formatting via `toISOString()` / `Date` getters.\n"
-        "- **URLs**: Python `urlparse(u)` / `parse_qs(q)` → JS `new URL(u)` / `url.searchParams`. The `URL` class auto-handles encoding.\n"
-        "- **regex**: Python `re.compile(p).search(s)` → JS `s.match(p)` or `new RegExp(p).exec(s)`. Watch out for differing flag syntax.\n"
-        "- **f-strings**: Python `f\"x {y}\"` → JS template literals `` `x ${y}` ``.\n"
-        "- **dict iteration**: Python `d.get(k, default)` → JS `d[k] ?? default` or `(d[k] !== undefined ? d[k] : default)`.\n"
-        "- **list comprehensions**: Python `[f(x) for x in xs if g(x)]` → JS `xs.filter(g).map(f)`.\n\n"
-        "# Common mistakes from past runs (the script lints for these and rejects matches)\n"
-        "- Python `dict.get(k)` / `dict.get(k, default)` does NOT exist in JavaScript. Replace EVERY `.get(...)` with `[k]` or `[k] ?? default`. Pinterest- and Instagram-style code has many of these — translate every one.\n"
-        "- Literal newlines inside string literals are a JS syntax error. Python `\"\\n\".join(xs)` becomes JS `xs.join(\"\\n\")` — keep the `\\n` as an escape sequence; do NOT put an actual newline character inside the quotes.\n"
-        "- Regex literals `/.../` cannot span multiple lines. If the Python regex source contains a literal newline (e.g. inside a character class), encode it as `\\n` in the JS regex — never paste a raw newline into the `/.../` body.\n"
-        "- `MappedItem`, `MissingMappedField`, and `MapItemException` are CLASSES — always use `new MappedItem({...})`, `new MissingMappedField(...)`, `throw new MapItemException(...)`. Never call them bare.\n"
-        "- Python `'x' in some_string` is a SUBSTRING check; the JS `in` operator does NOT do this — on a string it throws TypeError. Use `someString.includes('x')` for substring tests. The JS `in` operator is only for object property names (`'key' in obj`).\n"
-        "- Empty containers are TRUTHY in JavaScript but FALSY in Python. After `const user = node.user ?? {}`, the variable is always truthy — `if (user)` is always true. Either guard on the original nullable BEFORE defaulting (`if (node.user) {...}`) or check `Object.keys(user).length` / `arr.length`. Same trap for `[]`.\n"
-        "- `js/lib.js` is loaded as a script, NOT a module. Do NOT write `import { X } from '../js/lib.js'`. The helpers there are globals.\n"
-        "- Python keywords don't exist in JS: `None` → `null`, `True`/`False` → `true`/`false`, `def` → `function`.\n"
-        "- f-strings (`f\"x {y}\"`) don't exist in JS. Use template literals (`` `x ${y}` ``).\n\n"
+        "# Things to get right — past errors from this generator\n"
+        "Each item below has been observed in previous LLM output. The script lints "
+        "for many of them and surfaces matches as warnings on the PR. Translate accordingly.\n\n"
+        f"{past_errors_block}\n\n"
         "# Before submitting, verify your output\n"
-        "1. The function contains zero `.get(` calls.\n"
-        "2. Every `MappedItem(`, `MissingMappedField(`, and `MapItemException(` is preceded by `new `.\n"
-        "3. No string literal or regex literal contains a raw newline character — use `\\n` escapes.\n"
-        "4. `imports_to_add` is empty unless you really need an ES-module import (it should NOT contain anything for `MappedItem` etc.).\n"
-        "5. No Python keywords (`None`, `True`, `False`, `def`, f-strings).\n"
-        "6. No `'literal' in someStringExpression` — those are substring checks; rewrite as `.includes(...)`.\n"
-        "7. No `if (x)` guards where `x` was defaulted to `{}` or `[]` — those are always-true in JS. Guard on the pre-default value or check `.length` / `Object.keys(...).length`.\n\n"
+        f"{verification_block}\n\n"
         "# Output format\n"
         "Use `export function map_item(item) { ... }` to match this module's ES-module style. "
         "Return raw JavaScript source — do NOT wrap fields in markdown code fences. "
@@ -336,56 +348,11 @@ def _strip_js_comments(s: str) -> str:
     return s
 
 
-# Regex checks for known anti-patterns the LLM produces.
-# Each entry is (pattern, message). Stay conservative: false negatives are
-# fine, false positives block valid code so they're worse.
-LINT_PATTERNS = [
-    (
-        re.compile(r"\.get\("),
-        "Python `dict.get()` does not exist in JavaScript. Replace every `.get(k)` with `[k]` and every `.get(k, default)` with `[k] ?? default`.",
-    ),
-    (
-        re.compile(r"^\s*from\s+\S+\s+import\b", re.MULTILINE),
-        "Python-style `from X import Y` statement found. JavaScript uses `import { Y } from 'X'` syntax (and only when really needed — Zeeschuimer helpers are globals).",
-    ),
-    (
-        re.compile(r"import\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)\s+from\s+['\"]\.\.?/js/lib\.js['\"]"),
-        "Do not import from `js/lib.js` — it is loaded as a script, its declarations are global.",
-    ),
-    (
-        re.compile(r"\bNone\b"),
-        "Python `None` is not valid JavaScript. Use `null`.",
-    ),
-    (
-        re.compile(r"\bTrue\b"),
-        "Python `True` is not valid JavaScript. Use `true` (lowercase).",
-    ),
-    (
-        re.compile(r"\bFalse\b"),
-        "Python `False` is not valid JavaScript. Use `false` (lowercase).",
-    ),
-    (
-        re.compile(r"\bdef\s+\w+\s*\("),
-        "Python `def` keyword found. Use JavaScript `function name(...)` declaration syntax.",
-    ),
-    (
-        re.compile(r"\bf\"|\bf'"),
-        "Python f-string detected (`f\"...\"` or `f'...'`). Use JavaScript template literals (`` `...${x}...` ``).",
-    ),
-    (
-        # Python-style substring test: 'lit' in expr.someStringMethod(...).
-        # The JS `in` operator only works on objects (checking property names);
-        # on a string it throws TypeError. Catch the obvious cases where the
-        # right-hand side ends in a method that's known to return a string.
-        re.compile(
-            r"""['"][^'"]*['"]\s+in\s+[\w.\[\]]+\.(?:"""
-            r"""toLowerCase|toUpperCase|toString|trim|trimStart|trimEnd|"""
-            r"""slice|substring|substr|concat|charAt|normalize|repeat|"""
-            r"""padStart|padEnd|replace|replaceAll)\s*\("""
-        ),
-        "Python-style substring check (`'x' in someString`) detected. The JS `in` operator only checks object property names and throws TypeError on a string. Use `someString.includes('x')` instead.",
-    ),
-]
+# Regex checks for known anti-patterns. Sourced from the rule registry so
+# that prompt guidance and lint stay in sync. Bespoke checks below (class
+# instantiation, literal newlines, regex use) are not in the registry's
+# regex list — they're tied to records by `id`.
+LINT_PATTERNS = get_regex_lint_rules()
 
 
 # Lexer that matches JS string and template literals as whole units. Used to
diff --git a/helper-scripts/map_item_rules.py b/helper-scripts/map_item_rules.py
new file mode 100644
index 000000000..76fe4da56
--- /dev/null
+++ b/helper-scripts/map_item_rules.py
@@ -0,0 +1,323 @@
+"""
+Registry of known Python → JavaScript translation pitfalls for the
+Zeeschuimer auto-generator.
+
+Each `TranslationError` record drives three things in
+`map_item_converter.py`:
+
+- The "things to get right" section of the LLM prompt.
+- The "before submitting" verification checklist.
+- The regex-based lint pass over LLM output.
+
+Cross-repo workflow:
+
+- `translation-errors.md` (in the Zeeschuimer repo) is the freeform
+  observation log. Reviewers add entries there as new bugs surface.
+- This file is the structured input for the prompt and linter. When an
+  observation in the md is worth teaching the generator about, mirror it
+  here using the same `id` as the md heading slug. Not every md entry
+  needs a record — this is a curated subset.
+
+Three lint checks are too complex for a single regex and live as bespoke
+code in `map_item_converter.lint_translation`:
+
+- `class_needs_new` — variable-width lookbehind for `new `.
+- `literal_newline_in_string` — JS string lexer.
+- `regex_in_use` — heuristic regex-use detection.
+
+Those records have `lint_pattern=None`; the bespoke check is the lint.
+"""
+import re
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass(frozen=True)
+class TranslationError:
+    id: str  # slug; mirrors the matching translation-errors.md heading
+    prompt_rule: str  # guidance text rendered as one bullet in the LLM prompt
+    bad: Optional[str] = None  # optional "Wrong" example shown under the bullet
+    good: Optional[str] = None  # optional "Right" example shown under the bullet
+    verify: Optional[str] = None  # optional "before submitting" checklist item
+    lint_pattern: Optional[re.Pattern] = None  # regex for the lint pass; None = bespoke or no check
+
+
+RULES: list[TranslationError] = [
+
+    # ---- Python syntax that does not exist in JavaScript ----
+
+    TranslationError(
+        id="python_keywords",
+        prompt_rule=(
+            "Python keywords don't exist in JavaScript: `None` → `null`, "
+            "`True` / `False` → `true` / `false` (lowercase), `def name(...)` → `function name(...)`."
+        ),
+        bad="return None if not item.is_admin else True",
+        good="return item.is_admin ? true : null",
+        verify="No Python keywords (`None`, `True`, `False`, `def`) appear.",
+        lint_pattern=re.compile(r"\b(?:None|True|False)\b|\bdef\s+\w+\s*\("),
+    ),
+    TranslationError(
+        id="python_fstring",
+        prompt_rule=(
+            "Python f-strings (`f\"...\"` / `f'...'`) don't exist in JavaScript. Use "
+            "template literals with backticks and `${...}` instead."
+        ),
+        bad='throw new Error(f"item {item.id} not found")',
+        good="throw new Error(`item ${item.id} not found`)",
+        lint_pattern=re.compile(r"\bf\"|\bf'"),
+    ),
+    TranslationError(
+        id="unquoted_interpolation",
+        prompt_rule=(
+            "Even without an `f` prefix, `\"text {var}\"` / `'text {var}'` are literal "
+            "text in JavaScript — no interpolation happens. Whenever the original Python "
+            "used an f-string, the JS must use a template literal (backticks)."
+        ),
+        bad="throw new MapItemException('different user {user.id} and owner {owner.id}')",
+        good="throw new MapItemException(`different user ${user.id} and owner ${owner.id}`)",
+        verify="No `{var}` patterns remain inside single- or double-quoted strings.",
+        lint_pattern=re.compile(r"""['"][^'"\n]*\{[a-zA-Z_$][\w$.]*\}[^'"\n]*['"]"""),
+    ),
+    TranslationError(
+        id="python_from_import",
+        prompt_rule=(
+            "Python `from X import Y` doesn't exist in JavaScript. JavaScript uses "
+            "`import { Y } from 'X'` — and only when really needed; Zeeschuimer helpers "
+            "are globals, so `imports_to_add` is usually empty."
+        ),
+        bad="from common.lib.helpers import strip_tags",
+        good="// (no import — strip_tags is a global from js/lib.js)",
+        lint_pattern=re.compile(r"^\s*from\s+\S+\s+import\b", re.MULTILINE),
+    ),
+
+    # ---- dict.get is not a thing in JS ----
+
+    TranslationError(
+        id="dict_get",
+        prompt_rule=(
+            "Python `dict.get(k)` / `dict.get(k, default)` does not exist in JavaScript. "
+            "Replace every `.get(k)` with `[k]` and every `.get(k, default)` with `[k] ?? default`."
+        ),
+        bad="user.get('name', 'anonymous')",
+        good="user['name'] ?? 'anonymous'",
+        verify="The function contains zero `.get(` calls.",
+        lint_pattern=re.compile(r"\.get\("),
+    ),
+
+    # ---- `in` operator: substring check vs key existence ----
+
+    TranslationError(
+        id="in_operator_on_strings",
+        prompt_rule=(
+            "Python `'x' in some_string` is a substring check. JavaScript's `in` operator "
+            "only works on objects (checking property names) — on a string it throws "
+            "TypeError. Use `someString.includes('x')` instead."
+        ),
+        bad="if ('polaris' in item.__typename.toLowerCase()) { ... }",
+        good="if (item.__typename.toLowerCase().includes('polaris')) { ... }",
+        verify="No `'literal' in someStringExpression` — use `.includes(...)`.",
+        # Conservative: only flag when the RHS ends in a known string method, since
+        # `'key' in someObj` is legitimate JS for property checks.
+        lint_pattern=re.compile(
+            r"""['"][^'"]*['"]\s+in\s+[\w.\[\]]+\.(?:"""
+            r"""toLowerCase|toUpperCase|toString|trim|trimStart|trimEnd|"""
+            r"""slice|substring|substr|concat|charAt|normalize|repeat|"""
+            r"""padStart|padEnd|replace|replaceAll)\s*\("""
+        ),
+    ),
+    TranslationError(
+        id="key_existence_vs_value_truthy",
+        prompt_rule=(
+            "Python `if node.get('X'):` is a *truthy check on the value* (false if the key "
+            "is missing OR if the value is `None`/empty). The naive translation "
+            "`if ('X' in node)` is a *key existence check* — true even when `node.X` is "
+            "`null`. Subsequent property accesses then throw. Use `if (node.X)` or "
+            "`if (node.X != null)`."
+        ),
+        bad="const usertags = 'usertags' in node ? node.usertags.in.map(...) : '';",
+        good="const usertags = node.usertags ? node.usertags.in.map(...) : '';",
+        lint_pattern=re.compile(r"'[^']+'\s+in\s+[a-zA-Z_$][\w$]*\s*\?"),
+    ),
+
+    # ---- Empty container is truthy in JS ----
+
+    TranslationError(
+        id="empty_container_truthy",
+        prompt_rule=(
+            "Empty `{}` and `[]` are TRUTHY in JavaScript but FALSY in Python. After "
+            "`const user = node.user ?? {}`, `if (user)` is always true. Either guard on "
+            "the original nullable BEFORE defaulting, or check `Object.keys(user).length` "
+            "/ `arr.length`."
+        ),
+        bad="const user = node.user ?? {};\nif (user) { /* always true */ }",
+        good="const user = node.user;\nif (user) { /* meaningful */ }",
+        verify="No `if (x)` guards where `x` was defaulted to `{}` or `[]` (always true in JS).",
+    ),
+
+    # ---- Object identity ----
+
+    TranslationError(
+        id="class_needs_new",
+        prompt_rule=(
+            "`MappedItem`, `MissingMappedField`, and `MapItemException` are CLASSES — "
+            "always `new MappedItem({...})`, `new MissingMappedField(...)`, "
+            "`throw new MapItemException(...)`. Calling them bare returns `undefined` "
+            "and silently breaks downstream."
+        ),
+        bad="return MappedItem({author: 'foo'})",
+        good="return new MappedItem({author: 'foo'})",
+        verify="Every `MappedItem(`, `MissingMappedField(`, and `MapItemException(` is preceded by `new`.",
+        # Bespoke check in `lint_translation` (variable-width lookbehind).
+        lint_pattern=None,
+    ),
+    TranslationError(
+        id="object_reference_equality",
+        prompt_rule=(
+            "`!==` / `===` on objects compares references, not values. "
+            "`caption !== new MissingMappedField('')` is always true because `new` "
+            "creates a fresh object each call. Use `instanceof MissingMappedField` for "
+            "type checks, or truthy-check the value directly."
+        ),
+        bad="caption !== new MissingMappedField('') ? caption.match(...) : ''",
+        good="caption instanceof MissingMappedField ? '' : caption.match(...)",
+        lint_pattern=re.compile(r"(?:!==|===)\s+new\s+[A-Z]"),
+    ),
+
+    # ---- Method calls on possibly-null receivers ----
+
+    TranslationError(
+        id="method_chain_on_nullable",
+        prompt_rule=(
+            "Calling a method on `null` / `undefined` throws TypeError. In Python the "
+            "equivalent AttributeError is sometimes caught by 4CAT — but the JS "
+            "`map_item` doesn't catch. Use optional chaining (`?.`) whenever the "
+            "receiver could be null/undefined."
+        ),
+        bad="caption.match(/#(\\w+)/g).join(',')",
+        good="caption?.match(/#(\\w+)/g)?.join(',') ?? ''",
+        # No reliable static check — leave to reviewer.
+        lint_pattern=None,
+    ),
+
+    # ---- Datetime: use the global helper ----
+
+    TranslationError(
+        id="datetime_helper_preferred",
+        prompt_rule=(
+            "For Python `datetime.utcfromtimestamp(t).strftime('%Y-%m-%d %H:%M:%S')`, "
+            "use the global `formatUtcTimestamp(t)` helper from `js/lib.js` — NOT "
+            "`new Date(t * 1000).toISOString()`. `.toISOString()` produces "
+            "`2026-05-13T21:27:31.000Z` (T separator, milliseconds, Z), which doesn't "
+            "match the Python output `2026-05-13 21:27:31`."
+        ),
+        bad="collected_at: new Date(node.taken_at * 1000).toISOString()",
+        good="collected_at: formatUtcTimestamp(node.taken_at)",
+        lint_pattern=re.compile(r"new\s+Date\([^)]+\)\.toISOString\(\)"),
+    ),
+
+    # ---- Regex translation traps ----
+
+    TranslationError(
+        id="regex_findall_capture_groups",
+        prompt_rule=(
+            "Python `re.findall(r'#(\\w+)', s)` returns CAPTURE GROUP contents "
+            "(`['lotr']`). JavaScript `s.match(/#(\\w+)/g)` returns FULL MATCHES "
+            "(`['#lotr']`) — capture groups are ignored with `/g`. For capture-group "
+            "behavior use `[...s.matchAll(/.../g)].map(m => m[1])`, or post-process the "
+            "full matches to strip the literal prefix."
+        ),
+        bad="caption.match(/#(\\w+)/g)?.join(',')",
+        good="[...caption.matchAll(/#(\\w+)/g)].map(m => m[1]).join(',')",
+        lint_pattern=re.compile(r"\.match\(\s*/[^/]*\([^/]*\)[^/]*/g\s*\)"),
+    ),
+    TranslationError(
+        id="regex_in_use",
+        prompt_rule=(
+            "Regex translation between Python and JavaScript is fragile: flag syntax "
+            "differs (`re.IGNORECASE` → `/.../i`), Python `re.compile(p).search(s)` "
+            "becomes JS `s.match(p)` or `new RegExp(p).exec(s)`, and regex literals "
+            "cannot span lines — encode any literal newline as `\\n`. Translate "
+            "carefully and verify behavior end-to-end."
+        ),
+        # Bespoke check in `lint_translation` flags any regex use for human review.
+        lint_pattern=None,
+    ),
+
+    # ---- String/regex literal syntax ----
+
+    TranslationError(
+        id="literal_newline_in_string",
+        prompt_rule=(
+            "JavaScript single- or double-quoted strings cannot contain a literal "
+            "newline — syntax error. Python `\"\\n\".join(xs)` becomes JS "
+            "`xs.join(\"\\n\")` — keep `\\n` as an escape sequence; never put a real "
+            "newline inside the quotes. Template literals (backticks) may span lines."
+        ),
+        bad='lines.join("\n")  // raw newline = syntax error',
+        good='lines.join("\\n")',
+        verify="No string or regex literal contains a raw newline character — use `\\n`.",
+        # Bespoke check in `lint_translation` (JS string lexer).
+        lint_pattern=None,
+    ),
+
+    # ---- Imports: don't, unless you really must ----
+
+    TranslationError(
+        id="lib_js_import",
+        prompt_rule=(
+            "`js/lib.js` is loaded as a plain `<script>`, NOT an ES module. Its "
+            "declarations (`MappedItem`, `MissingMappedField`, `MapItemException`, "
+            "`strip_tags`, `normalize_url_encoding`, `formatUtcTimestamp`) are GLOBALS. "
+            "Never write `import { ... } from '../js/lib.js'` — that import fails at "
+            "runtime."
+        ),
+        bad="import { MappedItem } from '../js/lib.js';",
+        good="// (no import — MappedItem is global)",
+        verify="`imports_to_add` is empty unless you really need an ES-module import (NOT for `MappedItem` etc.).",
+        lint_pattern=re.compile(
+            r"""import\s*(?:\{[^}]*\}|\*\s+as\s+\w+|\w+)\s+from\s+['"]\.\.?/js/lib\.js['"]"""
+        ),
+    ),
+    TranslationError(
+        id="bare_relative_path_import",
+        prompt_rule=(
+            "Every entry in `imports_to_add` must be a complete `import { ... } from '...'` "
+            "statement. Never emit a bare relative path (like `'../js/lib.js'`) as an "
+            "entry — JavaScript parses that as `..` `.` `/js/lib.js` and rejects the file."
+        ),
+        bad="imports_to_add: ['../js/lib.js']",
+        good="imports_to_add: []  // helpers are globals; no import needed",
+        # Surfaces in `imports_to_add`, not in the function body — not lint-able by
+        # the regex pass over `map_item_function`.
+        lint_pattern=None,
+    ),
+
+    # ---- JSON serialization difference ----
+
+    TranslationError(
+        id="undefined_dropped_from_json",
+        prompt_rule=(
+            "`JSON.stringify` omits keys whose value is `undefined`. Python's "
+            "`json.dumps` serializes `None` as `null`, keeping the key. When the Python "
+            "`map_item` explicitly returns `None` (or `\"\"`) for a missing field, the "
+            "JS must explicitly assign `null` (or `\"\"`) — typically with `value ?? null` "
+            "or `value ?? \"\"`, matching whichever Python uses for that field."
+        ),
+        bad="location_city: node.location.city  // undefined → key disappears from output",
+        good="location_city: node.location.city ?? null  // matches Python's `None`",
+        # Hard to lint statically (depends on per-field Python behavior).
+        lint_pattern=None,
+    ),
+]
+
+
+def get_regex_lint_rules() -> list[tuple[re.Pattern, str]]:
+    """
+    Pair each regex-lintable rule's compiled pattern with its prompt text.
+
+    Records without a `lint_pattern` are skipped; bespoke checks (class instantiation,
+    literal newlines, regex use) live in `map_item_converter.lint_translation`.
+    """
+    lintable = (rule for rule in RULES if rule.lint_pattern is not None)
+    return [(rule.lint_pattern, rule.prompt_rule) for rule in lintable]