From b16c7c66192a25b09b072d5fa4cc860c619ecb1c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 12:10:26 +0200 Subject: [PATCH 01/13] add mapper helper and crazy CI --- .../workflows/zeeschuimer_map_item_sync.yml | 259 ++++++++++ helper-scripts/map_item_converter.py | 488 ++++++++++++++++++ 2 files changed, 747 insertions(+) create mode 100644 .github/workflows/zeeschuimer_map_item_sync.yml create mode 100644 helper-scripts/map_item_converter.py diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml new file mode 100644 index 000000000..203df44c8 --- /dev/null +++ b/.github/workflows/zeeschuimer_map_item_sync.yml @@ -0,0 +1,259 @@ +# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS +# and open a draft PR against digitalmethodsinitiative/zeeschuimer. +# +# Triggers on pushes to master that touch any Zeeschuimer datasource (or the +# helper script itself). Also exposes a `workflow_dispatch` trigger with a +# `bootstrap` input for the initial run that translates all 15 datasources at +# once. +# +# Required secrets (configured in repo Settings -> Secrets and variables -> Actions): +# DMI_OLLAMA_KEY - API key for https://ollama.digitalmethods.net (already used by helper script) +# ZEESCHUIMER_APP_ID - numeric App ID of the GitHub App installed on +# digitalmethodsinitiative/zeeschuimer with permissions +# contents:write + pull-requests:write (and nothing else) +# ZEESCHUIMER_APP_PRIVATE_KEY - full PEM private key for that App (including BEGIN/END lines) + +name: Sync Zeeschuimer map_item from 4CAT + +on: + push: + branches: [master] + paths: + - 'datasources/**/search_*.py' + - 'helper-scripts/map_item_converter.py' + - '.github/workflows/zeeschuimer_map_item_sync.yml' + workflow_dispatch: + inputs: + bootstrap: + description: 'Translate every Zeeschuimer datasource (initial sync). Ignored if "files" is set.' 
+ type: boolean + default: false + files: + description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap.' + type: string + default: '' + model: + description: 'Override LLM model on DMI Ollama (default: gemma3:4b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b' + type: string + default: '' + +jobs: + sync: + name: Translate map_item and open Zeeschuimer PR + runs-on: ubuntu-latest + steps: + - name: Checkout 4CAT + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Determine changed datasource files + id: changed + run: | + # Manual run with explicit files takes precedence + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${{ inputs.files }}" ]; then + echo "mode=files" >> "$GITHUB_OUTPUT" + echo "files=${{ inputs.files }}" >> "$GITHUB_OUTPUT" + exit 0 + fi + if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.bootstrap }}" = "true" ]; then + echo "mode=bootstrap" >> "$GITHUB_OUTPUT" + echo "files=" >> "$GITHUB_OUTPUT" + exit 0 + fi + changed=$(git diff --name-only "${{ github.event.before }}" "${{ github.sha }}" -- 'datasources/*/search_*.py' || true) + if [ -z "$changed" ]; then + echo "mode=none" >> "$GITHUB_OUTPUT" + echo "files=" >> "$GITHUB_OUTPUT" + else + echo "mode=files" >> "$GITHUB_OUTPUT" + files=$(echo "$changed" | tr '\n' ' ') + echo "files=$files" >> "$GITHUB_OUTPUT" + fi + + - name: Exit early if nothing to do + if: steps.changed.outputs.mode == 'none' + run: | + echo "No Zeeschuimer datasource files changed; nothing to translate." 
+ exit 0 + + - name: Set up Python + if: steps.changed.outputs.mode != 'none' + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install LLM dependencies + if: steps.changed.outputs.mode != 'none' + run: | + # LLMAdapter (common/lib/llm.py) imports every provider's langchain + # package at module load, so all of these are required even though + # we only use the Ollama provider at runtime. + pip install \ + langchain-core \ + langchain-ollama \ + langchain-openai \ + langchain-anthropic \ + langchain-google-genai \ + langchain-mistralai \ + langchain-deepseek \ + pydantic \ + requests + + - name: Mint Zeeschuimer App token + if: steps.changed.outputs.mode != 'none' + id: app_token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.ZEESCHUIMER_APP_ID }} + private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }} + owner: digitalmethodsinitiative + repositories: zeeschuimer + + - name: Checkout Zeeschuimer + if: steps.changed.outputs.mode != 'none' + uses: actions/checkout@v4 + with: + repository: digitalmethodsinitiative/zeeschuimer + path: zeeschuimer-checkout + token: ${{ steps.app_token.outputs.token }} + + - name: Run translation + if: steps.changed.outputs.mode != 'none' + env: + DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }} + LLM_MODEL: ${{ inputs.model || 'gemma3:4b' }} + run: | + if [ "${{ steps.changed.outputs.mode }}" = "bootstrap" ]; then + python helper-scripts/map_item_converter.py \ + --bootstrap \ + --zeeschuimer-checkout ./zeeschuimer-checkout \ + --output-manifest ./manifest.json + else + python helper-scripts/map_item_converter.py \ + --files ${{ steps.changed.outputs.files }} \ + --zeeschuimer-checkout ./zeeschuimer-checkout \ + --output-manifest ./manifest.json + fi + + - name: Build PR body + if: steps.changed.outputs.mode != 'none' + id: pr_body + env: + MODE: ${{ steps.changed.outputs.mode }} + BEFORE_SHA: ${{ github.event.before }} + AFTER_SHA: ${{ github.sha }} + RUN_ID: ${{ github.run_id }} + 
EVENT_NAME: ${{ github.event_name }} + run: | + python - <<'EOF' + import json + import os + import subprocess + + mode = os.environ["MODE"] + before = os.environ["BEFORE_SHA"] + after = os.environ["AFTER_SHA"] + run_id = os.environ["RUN_ID"] + event_name = os.environ["EVENT_NAME"] + repo = "${{ github.repository }}" + + with open("manifest.json") as f: + manifest = json.load(f) + + model = manifest.get("model", "(unknown)") + provider = manifest.get("provider", "ollama") + structured_output = manifest.get("structured_output", False) + entries = manifest.get("entries", []) + + short_sha = after[:7] + lines = [] + lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). It is a **draft** because the JS was produced by an LLM and needs human review and end-to-end extension testing before merging.".format(repo, run_id)) + lines.append("") + lines.append("## Generation parameters") + lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`)".format(model, provider, structured_output)) + if mode == "bootstrap": + lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).") + elif event_name == "workflow_dispatch": + lines.append("- **Trigger:** manual `workflow_dispatch` with explicit file list.") + else: + lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master.".format(short_sha, repo, after)) + lines.append("") + + ok = [e for e in entries if e["status"] == "ok"] + failed = [e for e in entries if e["status"] == "failed"] + skipped = [e for e in entries if e["status"] == "skipped"] + + lines.append("## Summary") + lines.append("- :white_check_mark: {} translated".format(len(ok))) + lines.append("- :x: {} failed".format(len(failed))) + lines.append("- :grey_question: {} skipped".format(len(skipped))) + lines.append("") + + for entry in ok: + lines.append("## `{}` -> 
`{}`".format(entry["python_file"], entry["js_file"])) + if entry.get("commentary"): + lines.append("**LLM commentary:**") + lines.append("") + lines.append("> " + entry["commentary"].replace("\n", "\n> ")) + lines.append("") + if event_name == "push": + try: + diff = subprocess.check_output( + ["git", "diff", "{}..{}".format(before, after), "--", entry["python_file"]], + text=True, + ) + except subprocess.CalledProcessError: + diff = "" + else: + diff = "" + if diff.strip(): + lines.append("
<details><summary>Python diff</summary>") + lines.append("") + lines.append("```diff") + lines.append(diff.rstrip()) + lines.append("```") + lines.append("</details>
") + lines.append("") + + if failed: + lines.append("## Failures") + for entry in failed: + lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "(no error message)"))) + lines.append("") + + if skipped: + lines.append("## Skipped") + for entry in skipped: + lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", ""))) + lines.append("") + + body = "\n".join(lines) + with open("pr_body.md", "w", encoding="utf-8") as f: + f.write(body) + print("Wrote pr_body.md ({} chars)".format(len(body))) + EOF + + - name: Check there are JS changes to PR + if: steps.changed.outputs.mode != 'none' + id: have_changes + working-directory: zeeschuimer-checkout + run: | + if [ -z "$(git status --porcelain)" ]; then + echo "has_changes=false" >> "$GITHUB_OUTPUT" + echo "No JS changes produced by translation; not opening a PR." + else + echo "has_changes=true" >> "$GITHUB_OUTPUT" + fi + + - name: Open or update Zeeschuimer PR + if: steps.changed.outputs.mode != 'none' && steps.have_changes.outputs.has_changes == 'true' + uses: peter-evans/create-pull-request@v6 + with: + path: zeeschuimer-checkout + token: ${{ steps.app_token.outputs.token }} + branch: auto/4cat-map-item-sync + title: "Auto-translated map_item updates from 4CAT @ ${{ github.sha }}" + commit-message: "chore: sync map_item from 4CAT ${{ github.sha }}" + body-path: pr_body.md + draft: true diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py new file mode 100644 index 000000000..ffe6028a8 --- /dev/null +++ b/helper-scripts/map_item_converter.py @@ -0,0 +1,488 @@ +""" +Translate 4CAT Zeeschuimer-import datasource `map_item` functions from Python +to JavaScript and splice them into the corresponding Zeeschuimer +`modules/.js` file. + +Designed to be invoked by a GitHub Action whenever a Zeeschuimer datasource's +Python file changes on master. 
Can also be run locally for testing or via +`workflow_dispatch` with `--bootstrap` to translate every datasource at once. + +The LLM produces only the new `map_item` function (plus any imports/helpers it +needs and free-text commentary). This script does the file integration: it +locates a marker block in the existing JS module and replaces its contents, +preserving every hand-written line outside the markers. + +Usage: + DMI_OLLAMA_KEY=... python helper-scripts/map_item_converter.py \\ + --files datasources/tiktok/search_tiktok.py \\ + --zeeschuimer-checkout ../zeeschuimer \\ + --output-manifest /tmp/manifest.json +""" +import argparse +import ast +import json +import os +import re +import sys +import traceback +from pathlib import Path +from typing import Optional + +sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), "..")) + +from common.lib.llm import LLMAdapter + + +# 4CAT datasource path -> Zeeschuimer module path (relative to checkout root). +# Verified against https://github.com/digitalmethodsinitiative/zeeschuimer/tree/master/modules +# Note: facebook has no Zeeschuimer module today, so it's intentionally absent. 
+PLATFORM_MAP = { + "datasources/douyin/search_douyin.py": "modules/douyin.js", + "datasources/gab/search_gab.py": "modules/gab.js", + "datasources/imgur/search_imgur.py": "modules/imgur.js", + "datasources/instagram/search_instagram.py": "modules/instagram.js", + "datasources/linkedin/search_linkedin.py": "modules/linkedin.js", + "datasources/ninegag/search_9gag.py": "modules/9gag.js", + "datasources/pinterest/search_pinterest.py": "modules/pinterest.js", + "datasources/threads/search_threads.py": "modules/threads.js", + "datasources/tiktok/search_tiktok.py": "modules/tiktok.js", + "datasources/tiktok_comments/search_tiktok_comments.py": "modules/tiktok-comments.js", + "datasources/truth/search_truth.py": "modules/truth.js", + "datasources/twitter-import/search_twitter.py": "modules/twitter.js", + "datasources/xiaohongshu/search_rednote.py": "modules/rednote.js", + "datasources/xiaohongshu_comments/search_rednote_comments.py": "modules/rednote-comments.js", +} + +DEFAULT_MODEL = "gemma3:4b" + +IMPORTS_MARKER_START = "// === auto-generated imports for map_item — DO NOT EDIT BY HAND ===" +IMPORTS_MARKER_END = "// === end auto-generated imports ===" +BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — DO NOT EDIT BY HAND ===" +BLOCK_MARKER_END = "// === end auto-generated ===" + +LLM_SCHEMA = { + "title": "MapItemTranslation", + "type": "object", + "required": ["map_item_function", "imports_to_add", "helpers_to_add", "commentary"], + "properties": { + "map_item_function": { + "type": "string", + "description": ( + "Full JavaScript source of the new map_item function. Must include " + "the function declaration (e.g. 'export function map_item(item) {...}'). " + "Do not include surrounding code from the module." + ), + }, + "imports_to_add": { + "type": "array", + "items": {"type": "string"}, + "description": ( + "Complete import statements (one per array entry) that map_item needs " + "and that are not already present in the existing module. 
Empty array " + "if none needed." + ), + }, + "helpers_to_add": { + "type": "array", + "items": {"type": "string"}, + "description": ( + "Full source of any helper functions map_item depends on (e.g. a JS " + "port of normalize_url_encoding). Empty array if none needed." + ), + }, + "commentary": { + "type": "string", + "description": ( + "Notes for the human reviewer: assumptions made, fields you were " + "unsure about, Python idioms that don't translate cleanly. Plain text." + ), + }, + }, +} + +SYSTEM_PROMPT = ( + "You translate 4CAT Python `map_item` functions into JavaScript for the " + "Zeeschuimer browser extension. You return ONLY the new map_item function, " + "any imports it needs, any helper functions it depends on, and commentary " + "for the human reviewer. You NEVER return the surrounding module file. " + "You preserve the field names produced by the Python function exactly. " + "You do not invent fields not present in the Python output." +) + + +def is_zeeschuimer_datasource(python_path: Path) -> bool: + """ + Returns True if the given Python file defines a 4CAT Search subclass with + `is_from_zeeschuimer = True` as a class attribute. + """ + try: + tree = ast.parse(python_path.read_text(encoding="utf-8")) + except (SyntaxError, OSError): + return False + + for node in ast.walk(tree): + if not isinstance(node, ast.ClassDef): + continue + for stmt in node.body: + if not isinstance(stmt, ast.Assign): + continue + for target in stmt.targets: + if isinstance(target, ast.Name) and target.id == "is_from_zeeschuimer": + if isinstance(stmt.value, ast.Constant) and stmt.value.value is True: + return True + return False + + +def discover_bootstrap_files(repo_root: Path) -> list[Path]: + """ + Find every datasource file in PLATFORM_MAP whose class is a Zeeschuimer + datasource. Returns absolute paths. 
+ """ + found = [] + for rel in PLATFORM_MAP: + path = repo_root / rel + if path.exists() and is_zeeschuimer_datasource(path): + found.append(path) + return sorted(found) + + +def build_user_prompt(python_source: str, existing_module_source: str, python_rel: str) -> str: + return ( + f"# Source Python file (datasources/{python_rel})\n" + "This is the file that just changed in 4CAT. The `map_item` function on the " + "class is the source of truth — your JavaScript translation must produce an " + "object with the same field names and equivalent values.\n\n" + f"```python\n{python_source}\n```\n\n" + "# Existing Zeeschuimer module\n" + "This module's `capture()` function returns the raw items that will be " + "passed to `map_item(item)` as `item`. Use it to understand the input shape " + "and to match the existing code style (ES modules, `export` keyword, etc.).\n\n" + f"```javascript\n{existing_module_source}\n```\n\n" + "# Task\n" + "Produce a JavaScript `map_item(item)` function that mirrors the Python " + "`map_item`. Re-implement any Python helpers it calls (e.g. `normalize_url_encoding`, " + "`urlparse`/`parse_qs`, `datetime` formatting) inline in JS — either inside " + "`map_item_function` or as separate snippets in `helpers_to_add`. If you reference " + "the `MappedItem` class, list its import in `imports_to_add` (it lives in `../js/lib.js`). " + "Use `export function map_item(item) { ... }` to match this module's style." + ) + + +def parse_freetext_response(text: str) -> dict: + """ + Fallback parser for when structured output is disabled or unreliable. + Looks for a fenced JS block (the function) and treats remaining text as + commentary. 
+ """ + js_match = re.search(r"```(?:js|javascript)\s*\n(.*?)```", text, re.DOTALL) + map_item_function = js_match.group(1).strip() if js_match else "" + commentary = re.sub(r"```(?:js|javascript)\s*\n.*?```", "", text, flags=re.DOTALL).strip() + return { + "map_item_function": map_item_function, + "imports_to_add": [], + "helpers_to_add": [], + "commentary": commentary, + } + + +def validate_translation(translation: dict) -> Optional[str]: + """ + Returns None if the translation passes basic sanity checks, else a string + describing what went wrong. + """ + fn = translation.get("map_item_function", "").strip() + if not fn: + return "LLM returned empty map_item_function" + if not re.search(r"\bmap_item\b", fn): + return "LLM output does not contain `map_item` identifier" + if not re.search(r"function\s+map_item|map_item\s*=|map_item\s*:", fn): + return "LLM output does not declare `map_item` as a function" + return None + + +def splice_into_module(existing: str, translation: dict, python_rel: str) -> str: + """ + Idempotently insert / replace the auto-generated marker blocks in the JS + module text. + + Raises ValueError if exactly one of (start, end) markers is present — + that means the file is corrupted or partially hand-edited and we should + refuse to touch it. + """ + main_block_body = [] + for helper in translation.get("helpers_to_add", []): + helper = helper.strip() + if helper: + main_block_body.append(helper) + fn = translation["map_item_function"].strip() + main_block_body.append(fn) + main_block = ( + f"{BLOCK_MARKER_START}\n" + f"// (regenerated from {python_rel})\n" + + "\n\n".join(main_block_body) + + f"\n{BLOCK_MARKER_END}\n" + ) + + imports = [imp.strip() for imp in translation.get("imports_to_add", []) if imp.strip()] + # Drop imports that already appear verbatim outside the marker block. + existing_outside_block = re.sub( + re.escape(BLOCK_MARKER_START) + r".*?" 
+ re.escape(BLOCK_MARKER_END) + r"\n?", + "", + existing, + flags=re.DOTALL, + ) + existing_outside_imports_block = re.sub( + re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?", + "", + existing_outside_block, + flags=re.DOTALL, + ) + imports = [imp for imp in imports if imp not in existing_outside_imports_block] + + imports_block = "" + if imports: + imports_block = ( + f"{IMPORTS_MARKER_START}\n" + + "\n".join(imports) + + f"\n{IMPORTS_MARKER_END}\n" + ) + + updated = existing + + # Replace or insert imports block. + has_imports_start = IMPORTS_MARKER_START in updated + has_imports_end = IMPORTS_MARKER_END in updated + if has_imports_start ^ has_imports_end: + raise ValueError( + "Auto-generated imports markers are partially missing in the existing " + "module — refusing to overwrite. Restore both markers or remove both." + ) + if has_imports_start and has_imports_end: + updated = re.sub( + re.escape(IMPORTS_MARKER_START) + r".*?" + re.escape(IMPORTS_MARKER_END) + r"\n?", + imports_block, + updated, + count=1, + flags=re.DOTALL, + ) + elif imports_block: + # Prepend at top of file. + if updated and not updated.startswith("\n"): + updated = imports_block + "\n" + updated + else: + updated = imports_block + updated + + # Replace or append main block. + has_main_start = BLOCK_MARKER_START in updated + has_main_end = BLOCK_MARKER_END in updated + if has_main_start ^ has_main_end: + raise ValueError( + "Auto-generated map_item markers are partially missing in the existing " + "module — refusing to overwrite. Restore both markers or remove both." + ) + if has_main_start and has_main_end: + updated = re.sub( + re.escape(BLOCK_MARKER_START) + r".*?" 
+ re.escape(BLOCK_MARKER_END) + r"\n?", + main_block, + updated, + count=1, + flags=re.DOTALL, + ) + else: + if not updated.endswith("\n"): + updated += "\n" + updated += "\n" + main_block + + return updated + + +def translate_one( + llm: LLMAdapter, + python_path: Path, + repo_root: Path, + zeeschuimer_root: Path, + use_structured_output: bool, +) -> dict: + """ + Translate one Python file. Returns a manifest entry dict. + """ + rel = python_path.relative_to(repo_root).as_posix() + entry = { + "python_file": rel, + "js_file": None, + "status": "failed", + "commentary": "", + "error": None, + } + + if not is_zeeschuimer_datasource(python_path): + entry["status"] = "skipped" + entry["error"] = "not a Zeeschuimer datasource (is_from_zeeschuimer != True)" + return entry + + js_rel = PLATFORM_MAP.get(rel) + if not js_rel: + entry["status"] = "skipped" + entry["error"] = f"no Zeeschuimer module mapped for {rel}" + return entry + entry["js_file"] = js_rel + + js_path = zeeschuimer_root / js_rel + if not js_path.exists(): + entry["status"] = "skipped" + entry["error"] = f"Zeeschuimer module {js_rel} does not exist in checkout" + return entry + + python_source = python_path.read_text(encoding="utf-8") + existing_module = js_path.read_text(encoding="utf-8") + user_prompt = build_user_prompt(python_source, existing_module, rel) + + try: + response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT) + except Exception as e: + entry["error"] = f"LLM call failed: {e}" + return entry + + if use_structured_output: + # with_structured_output returns the parsed dict directly. 
+ if isinstance(response, dict): + translation = response + else: + entry["error"] = ( + f"Expected dict from structured output, got {type(response).__name__}" + ) + return entry + else: + text = getattr(response, "content", str(response)) + translation = parse_freetext_response(text) + + bad = validate_translation(translation) + if bad: + entry["error"] = bad + return entry + + try: + spliced = splice_into_module(existing_module, translation, rel) + except ValueError as e: + entry["error"] = str(e) + return entry + + js_path.write_text(spliced, encoding="utf-8") + entry["status"] = "ok" + entry["commentary"] = translation.get("commentary", "").strip() + return entry + + +def main(): + cli = argparse.ArgumentParser(description=__doc__) + group = cli.add_mutually_exclusive_group(required=True) + group.add_argument("--files", nargs="+", help="Specific datasource files to translate.") + group.add_argument( + "--bootstrap", + action="store_true", + help="Translate every Zeeschuimer datasource in the repo.", + ) + cli.add_argument( + "--zeeschuimer-checkout", + required=True, + type=Path, + help="Path to a local clone of the Zeeschuimer repo.", + ) + cli.add_argument( + "--output-manifest", + required=True, + type=Path, + help="Where to write the JSON manifest of results.", + ) + cli.add_argument( + "--model", + default=os.environ.get("LLM_MODEL", DEFAULT_MODEL), + help=f"Ollama model to use (default: {DEFAULT_MODEL}, or $LLM_MODEL).", + ) + cli.add_argument( + "--no-structured-output", + action="store_true", + help="Disable JSON-schema structured output; parse the response as free text.", + ) + args = cli.parse_args() + + dmi_ollama_key = os.environ.get("DMI_OLLAMA_KEY") + if not dmi_ollama_key: + sys.exit("Error: DMI_OLLAMA_KEY environment variable not set.") + + repo_root = Path(__file__).resolve().parent.parent + + if args.bootstrap: + files = discover_bootstrap_files(repo_root) + if not files: + sys.exit("No Zeeschuimer datasources found to bootstrap.") + else: + 
files = [Path(f).resolve() for f in args.files] + + llm = LLMAdapter( + provider="ollama", + model=args.model, + base_url="https://ollama.digitalmethods.net", + temperature=0.2, + max_tokens=8192, + client_kwargs={"headers": {"X-API-KEY": dmi_ollama_key}}, + ) + + use_structured_output = not args.no_structured_output + if use_structured_output: + try: + llm.set_structure(LLM_SCHEMA) + except Exception as e: + print( + f"Warning: could not enable structured output ({e}); " + "falling back to free-text parsing.", + file=sys.stderr, + ) + use_structured_output = False + + print(f"Using model: {args.model} (provider: ollama, structured_output: {use_structured_output})") + + entries = [] + for python_path in files: + print(f"Translating {python_path.relative_to(repo_root).as_posix()}...") + try: + entry = translate_one( + llm, + python_path, + repo_root, + args.zeeschuimer_checkout.resolve(), + use_structured_output, + ) + except Exception as e: + entry = { + "python_file": str(python_path), + "js_file": None, + "status": "failed", + "commentary": "", + "error": f"unexpected exception: {e}\n{traceback.format_exc()}", + } + entry["model"] = args.model + entries.append(entry) + print(f" -> {entry['status']}" + (f" ({entry['error']})" if entry.get("error") else "")) + + manifest = { + "model": args.model, + "provider": "ollama", + "structured_output": use_structured_output, + "entries": entries, + } + args.output_manifest.parent.mkdir(parents=True, exist_ok=True) + args.output_manifest.write_text(json.dumps(manifest, indent=2), encoding="utf-8") + + n_ok = sum(1 for e in entries if e["status"] == "ok") + n_failed = sum(1 for e in entries if e["status"] == "failed") + n_skipped = sum(1 for e in entries if e["status"] == "skipped") + print(f"\nDone with model `{args.model}`: {n_ok} ok, {n_failed} failed, {n_skipped} skipped.") + print(f"Manifest written to {args.output_manifest}") + + if n_ok == 0 and n_failed > 0: + sys.exit(1) + + +if __name__ == "__main__": + main() From 
585a2d1446af63944292710c9a2ab85c6f22d364 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 17:56:46 +0200 Subject: [PATCH 02/13] add streaming and timing and other stuff so i can see what's going on --- helper-scripts/map_item_converter.py | 197 ++++++++++++++++++++++++--- 1 file changed, 176 insertions(+), 21 deletions(-) diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py index ffe6028a8..75d3d2ea8 100644 --- a/helper-scripts/map_item_converter.py +++ b/helper-scripts/map_item_converter.py @@ -24,6 +24,7 @@ import os import re import sys +import time import traceback from pathlib import Path from typing import Optional @@ -296,12 +297,91 @@ def splice_into_module(existing: str, translation: dict, python_rel: str) -> str return updated +REASONING_KEYS = ("reasoning_content", "reasoning", "thinking", "thought") + + +def _extract_reasoning(chunk) -> str: + """ + Pull reasoning/thinking tokens off a LangChain chunk. Different model + families surface them under different keys, so try several. + """ + kwargs = getattr(chunk, "additional_kwargs", None) or {} + for key in REASONING_KEYS: + val = kwargs.get(key) + if isinstance(val, str) and val: + return val + return "" + + +def call_llm_streaming(llm: LLMAdapter, user_prompt: str, system_prompt: str) -> str: + """ + Stream the LLM response to stderr chunk-by-chunk and return the accumulated + *visible content* (reasoning tokens are surfaced live but NOT included in + the returned string — they aren't part of the parseable answer). + + For reasoning models like gpt-oss / deepseek-r1, the model spends a long + time emitting reasoning tokens before producing the visible answer; without + surfacing them the stream looks frozen. We mark the reasoning vs content + transitions so the user can tell what phase the model is in. 
+ """ + from langchain_core.messages import HumanMessage, SystemMessage + + messages = [] + if system_prompt: + messages.append(SystemMessage(content=system_prompt)) + messages.append(HumanMessage(content=user_prompt)) + + sys.stderr.write("\n--- LLM stream begin ---\n") + sys.stderr.flush() + + content_chunks: list[str] = [] + state: Optional[str] = None # "reasoning" | "content" + + for chunk in llm.llm.stream(messages): + reasoning = _extract_reasoning(chunk) + content = getattr(chunk, "content", "") or "" + + if reasoning: + if state != "reasoning": + sys.stderr.write("\n--- reasoning ---\n") + state = "reasoning" + sys.stderr.write(reasoning) + sys.stderr.flush() + + if content: + if state != "content": + # transitioning from reasoning (or nothing) to visible output + sys.stderr.write("\n--- output ---\n") + state = "content" + sys.stderr.write(content) + sys.stderr.flush() + content_chunks.append(content) + + sys.stderr.write("\n--- LLM stream end ---\n") + sys.stderr.flush() + return "".join(content_chunks) + + +def extract_raw_from_exception(exc: BaseException) -> Optional[str]: + """ + Pull whatever raw LLM output we can find off a LangChain exception. Tries + several attribute names since they vary by LangChain version. Returns None + if nothing recoverable. + """ + for attr in ("llm_output", "observation", "output"): + val = getattr(exc, attr, None) + if isinstance(val, str): + return val + return None + + def translate_one( llm: LLMAdapter, python_path: Path, repo_root: Path, zeeschuimer_root: Path, use_structured_output: bool, + stream: bool, ) -> dict: """ Translate one Python file. Returns a manifest entry dict. 
@@ -312,6 +392,8 @@ def translate_one( "js_file": None, "status": "failed", "commentary": "", + "duration_seconds": None, + "raw_response": None, "error": None, } @@ -337,24 +419,48 @@ def translate_one( existing_module = js_path.read_text(encoding="utf-8") user_prompt = build_user_prompt(python_source, existing_module, rel) + started = time.monotonic() + raw_response: Optional[str] = None + translation: Optional[dict] = None + llm_error: Optional[str] = None + try: - response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT) + if stream: + raw_response = call_llm_streaming(llm, user_prompt, SYSTEM_PROMPT) + translation = parse_freetext_response(raw_response) + elif use_structured_output: + try: + response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT) + except Exception as e: + raw_response = extract_raw_from_exception(e) + llm_error = f"LLM call failed: {e}" + response = None + if response is not None: + if isinstance(response, dict): + translation = response + # Structured-output success doesn't expose the raw text. + else: + llm_error = ( + f"Expected dict from structured output, got " + f"{type(response).__name__}" + ) + else: + response = llm.generate_text(user_prompt, system_prompt=SYSTEM_PROMPT) + raw_response = getattr(response, "content", str(response)) + translation = parse_freetext_response(raw_response) except Exception as e: - entry["error"] = f"LLM call failed: {e}" - return entry + raw_response = raw_response or extract_raw_from_exception(e) + llm_error = f"LLM call failed: {e}" - if use_structured_output: - # with_structured_output returns the parsed dict directly. 
- if isinstance(response, dict): - translation = response - else: - entry["error"] = ( - f"Expected dict from structured output, got {type(response).__name__}" - ) - return entry - else: - text = getattr(response, "content", str(response)) - translation = parse_freetext_response(text) + entry["duration_seconds"] = round(time.monotonic() - started, 2) + entry["raw_response"] = raw_response + + if llm_error: + entry["error"] = llm_error + return entry + if translation is None: + entry["error"] = "no translation produced (no error raised, no dict returned)" + return entry bad = validate_translation(translation) if bad: @@ -404,6 +510,25 @@ def main(): action="store_true", help="Disable JSON-schema structured output; parse the response as free text.", ) + cli.add_argument( + "--stream", + action="store_true", + help=( + "Stream LLM output to stderr as it is generated, so you can watch a " + "slow model work. Implies --no-structured-output (streaming and " + "structured output don't mix cleanly)." + ), + ) + cli.add_argument( + "--no-fail-fast", + action="store_true", + help=( + "Continue translating remaining files even after one fails. By " + "default the script aborts on the first failure, since failures here " + "are typically configuration- or model-correlated and continuing " + "wastes LLM time." 
+ ), + ) args = cli.parse_args() dmi_ollama_key = os.environ.get("DMI_OLLAMA_KEY") @@ -428,7 +553,7 @@ def main(): client_kwargs={"headers": {"X-API-KEY": dmi_ollama_key}}, ) - use_structured_output = not args.no_structured_output + use_structured_output = not args.no_structured_output and not args.stream if use_structured_output: try: llm.set_structure(LLM_SCHEMA) @@ -440,11 +565,19 @@ def main(): ) use_structured_output = False - print(f"Using model: {args.model} (provider: ollama, structured_output: {use_structured_output})") + fail_fast = not args.no_fail_fast + print( + f"Using model: {args.model} " + f"(provider: ollama, structured_output: {use_structured_output}, " + f"stream: {args.stream}, fail_fast: {fail_fast})" + ) entries = [] + overall_started = time.monotonic() for python_path in files: - print(f"Translating {python_path.relative_to(repo_root).as_posix()}...") + rel_for_log = python_path.relative_to(repo_root).as_posix() + print(f"Translating {rel_for_log}...", flush=True) + per_file_started = time.monotonic() try: entry = translate_one( llm, @@ -452,6 +585,7 @@ def main(): repo_root, args.zeeschuimer_checkout.resolve(), use_structured_output, + args.stream, ) except Exception as e: entry = { @@ -459,16 +593,34 @@ def main(): "js_file": None, "status": "failed", "commentary": "", + "duration_seconds": round(time.monotonic() - per_file_started, 2), "error": f"unexpected exception: {e}\n{traceback.format_exc()}", } entry["model"] = args.model entries.append(entry) - print(f" -> {entry['status']}" + (f" ({entry['error']})" if entry.get("error") else "")) - + dur = entry.get("duration_seconds") + dur_str = f" in {dur}s" if dur is not None else "" + err_str = f" ({entry['error']})" if entry.get("error") else "" + print(f" -> {entry['status']}{dur_str}{err_str}", flush=True) + + if entry["status"] == "failed" and not args.no_fail_fast: + remaining = len(files) - len(entries) + if remaining > 0: + print( + f"\nFail-fast: aborting after first failure; 
skipping " + f"{remaining} remaining file(s). Pass --no-fail-fast to continue past failures.", + flush=True, + ) + break + + overall_duration = round(time.monotonic() - overall_started, 2) manifest = { "model": args.model, "provider": "ollama", "structured_output": use_structured_output, + "stream": args.stream, + "fail_fast": fail_fast, + "total_duration_seconds": overall_duration, "entries": entries, } args.output_manifest.parent.mkdir(parents=True, exist_ok=True) @@ -477,7 +629,10 @@ def main(): n_ok = sum(1 for e in entries if e["status"] == "ok") n_failed = sum(1 for e in entries if e["status"] == "failed") n_skipped = sum(1 for e in entries if e["status"] == "skipped") - print(f"\nDone with model `{args.model}`: {n_ok} ok, {n_failed} failed, {n_skipped} skipped.") + print( + f"\nDone with model `{args.model}` in {overall_duration}s: " + f"{n_ok} ok, {n_failed} failed, {n_skipped} skipped." + ) print(f"Manifest written to {args.output_manifest}") if n_ok == 0 and n_failed > 0: From 7594c9a254ff40c266ab76075d2d4a11aa9c3415 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 May 2026 17:57:08 +0200 Subject: [PATCH 03/13] no streaming here --- .../workflows/zeeschuimer_map_item_sync.yml | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml index 203df44c8..54f6a266c 100644 --- a/.github/workflows/zeeschuimer_map_item_sync.yml +++ b/.github/workflows/zeeschuimer_map_item_sync.yml @@ -164,6 +164,8 @@ jobs: model = manifest.get("model", "(unknown)") provider = manifest.get("provider", "ollama") structured_output = manifest.get("structured_output", False) + stream = manifest.get("stream", False) + total_duration = manifest.get("total_duration_seconds") entries = manifest.get("entries", []) short_sha = after[:7] @@ -171,7 +173,9 @@ jobs: lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync 
workflow](https://github.com/{}/actions/runs/{}). It is a **draft** because the JS was produced by an LLM and needs human review and end-to-end extension testing before merging.".format(repo, run_id)) lines.append("") lines.append("## Generation parameters") - lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`)".format(model, provider, structured_output)) + lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream)) + if total_duration is not None: + lines.append("- **Total LLM time:** {}s".format(total_duration)) if mode == "bootstrap": lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).") elif event_name == "workflow_dispatch": @@ -190,8 +194,19 @@ jobs: lines.append("- :grey_question: {} skipped".format(len(skipped))) lines.append("") + if ok: + lines.append("| Datasource | Module | Time |") + lines.append("|---|---|---:|") + for entry in ok: + dur = entry.get("duration_seconds") + dur_cell = "{}s".format(dur) if dur is not None else "—" + lines.append("| `{}` | `{}` | {} |".format(entry["python_file"], entry["js_file"], dur_cell)) + lines.append("") + for entry in ok: - lines.append("## `{}` -> `{}`".format(entry["python_file"], entry["js_file"])) + dur = entry.get("duration_seconds") + header_dur = " ({}s)".format(dur) if dur is not None else "" + lines.append("## `{}` -> `{}`{}".format(entry["python_file"], entry["js_file"], header_dur)) if entry.get("commentary"): lines.append("**LLM commentary:**") lines.append("") @@ -207,19 +222,21 @@ jobs: diff = "" else: diff = "" - if diff.strip(): - lines.append("
Python diff") - lines.append("") - lines.append("```diff") - lines.append(diff.rstrip()) - lines.append("```") - lines.append("
") - lines.append("") + if diff.strip(): + lines.append("
Python diff") + lines.append("") + lines.append("```diff") + lines.append(diff.rstrip()) + lines.append("```") + lines.append("
") + lines.append("") if failed: lines.append("## Failures") for entry in failed: - lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "(no error message)"))) + dur = entry.get("duration_seconds") + dur_str = " (after {}s)".format(dur) if dur is not None else "" + lines.append("- `{}`{}: {}".format(entry["python_file"], dur_str, entry.get("error", "(no error message)"))) lines.append("") if skipped: From 3d843c821c4e93142dd49b679b842ef943a4044c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 6 May 2026 14:18:04 +0200 Subject: [PATCH 04/13] use qwen; lib.js is global apparently, add some helper functions; add a bunch of DON'T Do's; clean up code block fences; do some surface lint tests --- helper-scripts/map_item_converter.py | 317 +++++++++++++++++++++++++-- 1 file changed, 301 insertions(+), 16 deletions(-) diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py index 75d3d2ea8..3d43a53d0 100644 --- a/helper-scripts/map_item_converter.py +++ b/helper-scripts/map_item_converter.py @@ -54,7 +54,7 @@ "datasources/xiaohongshu_comments/search_rednote_comments.py": "modules/rednote-comments.js", } -DEFAULT_MODEL = "gemma3:4b" +DEFAULT_MODEL = "qwen2.5-coder:14b" IMPORTS_MARKER_START = "// === auto-generated imports for map_item — DO NOT EDIT BY HAND ===" IMPORTS_MARKER_END = "// === end auto-generated imports ===" @@ -78,9 +78,12 @@ "type": "array", "items": {"type": "string"}, "description": ( - "Complete import statements (one per array entry) that map_item needs " - "and that are not already present in the existing module. Empty array " - "if none needed." + "Complete ES-module import statements that map_item needs. " + "Normally empty: Zeeschuimer's `js/lib.js` (which provides MappedItem, " + "MissingMappedField, normalize_url_encoding, strip_tags) is loaded as a " + "plain script, NOT an ES module — its declarations are already global, " + "so do not write `import { X } from '../js/lib.js'`. 
Only populate this " + "if you genuinely need to import from another ES module." ), }, "helpers_to_add": { @@ -107,10 +110,62 @@ "any imports it needs, any helper functions it depends on, and commentary " "for the human reviewer. You NEVER return the surrounding module file. " "You preserve the field names produced by the Python function exactly. " - "You do not invent fields not present in the Python output." + "You do not invent fields not present in the Python output. " + "You output raw JavaScript source — never wrap it in markdown code fences " + "(```js, ```javascript, etc.). The fields in your structured response are " + "already typed as code; fences make them invalid." ) +# Whitelist of helpers that Zeeschuimer makes available as globals at runtime. +# `js/lib.js` is loaded as a plain