diff --git a/.github/workflows/zeeschuimer_map_item_sync.yml b/.github/workflows/zeeschuimer_map_item_sync.yml
index 54e3d8221..73fbd8bd6 100644
--- a/.github/workflows/zeeschuimer_map_item_sync.yml
+++ b/.github/workflows/zeeschuimer_map_item_sync.yml
@@ -1,14 +1,405 @@
-# Bootstrap the Zeeschuimer map_item sync workflow
-# This is necessary to test workflow in PR (so far as I can tell)
+# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
+# and open a draft PR per module against digitalmethodsinitiative/zeeschuimer.
+#
+# Triggers on pushes to master that touch any Zeeschuimer datasource (or the
+# helper script itself). Also exposes a `workflow_dispatch` trigger with a
+# `bootstrap` input for the initial run that translates all 15 datasources at
+# once (single PR).
+#
+# Architecture: a `detect` job groups changed files by module and emits a
+# matrix; a `sync` job fans out one parallel run per module, each opening
+# (or updating) its own PR on a stable per-module branch.
+#
+# Required secrets (configured in repo Settings -> Secrets and variables -> Actions):
+#   DMI_OLLAMA_KEY - API key for https://ollama.digitalmethods.net (already used by helper script)
+#   ZEESCHUIMER_APP_ID - numeric App ID of the GitHub App installed on
+#                        digitalmethodsinitiative/zeeschuimer with permissions
+#                        contents:write + pull-requests:write (and nothing else)
+#   ZEESCHUIMER_APP_PRIVATE_KEY - full PEM private key for that App (including BEGIN/END lines)

 name: Sync Zeeschuimer map_item from 4CAT

 on:
+  push:
+    branches: [master]
+    paths:
+      - 'datasources/**/search_*.py'
+      - 'helper-scripts/map_item_converter.py'
+      - '.github/workflows/zeeschuimer_map_item_sync.yml'
   workflow_dispatch:
+    inputs:
+      bootstrap:
+        description: 'Translate every Zeeschuimer datasource (initial sync, single PR). Ignored if "files" is set.'
+        type: boolean
+        default: false
+      files:
+        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap. One PR per module.'
+        type: string
+        default: ''
+      model:
+        description: 'Override LLM model on DMI Ollama (default: qwen2.5-coder:14b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
+        type: string
+        default: ''
+
+permissions:
+  contents: read

 jobs:
-  sync-map-item:
+  detect:
+    name: Detect modules to translate
     runs-on: ubuntu-latest
+    outputs:
+      mode: ${{ steps.plan.outputs.mode }}
+      matrix: ${{ steps.plan.outputs.matrix }}
     steps:
-      - name: Placeholder
-        run: echo "Workflow scaffold is valid."
\ No newline at end of file
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          # Full history: `github.event.before` can be many commits behind
+          # the pushed head, and a shallow clone would make `git diff` fail.
+          fetch-depth: 0
+
+      - name: Plan translation matrix
+        id: plan
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          INPUTS_FILES: ${{ inputs.files }}
+          INPUTS_BOOTSTRAP: ${{ inputs.bootstrap }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
+        run: |
+          python - <<'EOF'
+          import json
+          import os
+          import subprocess
+
+          event_name = os.environ["EVENT_NAME"]
+          inputs_files = os.environ.get("INPUTS_FILES", "").strip()
+          inputs_bootstrap = os.environ.get("INPUTS_BOOTSTRAP", "").lower() == "true"
+          before = os.environ.get("BEFORE_SHA", "")
+          after = os.environ.get("AFTER_SHA", "")
+          out_path = os.environ["GITHUB_OUTPUT"]
+
+          def emit(mode, matrix):
+              with open(out_path, "a", encoding="utf-8") as f:
+                  f.write("mode={}\n".format(mode))
+                  f.write("matrix={}\n".format(json.dumps(matrix)))
+
+          # Bootstrap is special: one PR for all datasources.
+          # Explicit `files` input overrides bootstrap; honor that.
+          if event_name == "workflow_dispatch" and inputs_bootstrap and not inputs_files:
+              emit("bootstrap", [{"module": "bootstrap", "files": "", "bootstrap": True}])
+              print("Plan: bootstrap (single PR)")
+              raise SystemExit(0)
+
+          # Resolve the file list to translate.
+          if event_name == "workflow_dispatch" and inputs_files:
+              files = inputs_files.split()
+          elif event_name == "push":
+              # push event: diff datasource files between before and after.
+              # `before` is all zeros when the push created the ref; compare
+              # against the parent of the pushed commit in that case.
+              if not before.strip("0"):
+                  before = after + "~1"
+              try:
+                  out = subprocess.check_output(
+                      ["git", "diff", "--name-only", before, after, "--",
+                       "datasources/*/search_*.py"],
+                      text=True,
+                  )
+              except subprocess.CalledProcessError as exc:
+                  # Surface the failure as an annotation instead of silently
+                  # reporting "nothing to translate".
+                  print("::warning::git diff {} {} failed (exit code {}); no modules detected".format(before, after, exc.returncode))
+                  out = ""
+              files = [f for f in out.splitlines() if f.strip()]
+          else:
+              # Manual dispatch without `files` or `bootstrap`: nothing to do.
+              files = []
+
+          # Group by module: datasources/<module>/search_*.py
+          modules = {}
+          for path in files:
+              parts = path.split("/")
+              if len(parts) >= 3 and parts[0] == "datasources":
+                  modules.setdefault(parts[1], []).append(path)
+
+          if not modules:
+              emit("none", [])
+              print("Plan: nothing to translate")
+              raise SystemExit(0)
+
+          matrix = [
+              {"module": mod, "files": " ".join(sorted(paths)), "bootstrap": False}
+              for mod, paths in sorted(modules.items())
+          ]
+          emit("files", matrix)
+          print("Plan: {} module(s)".format(len(matrix)))
+          for entry in matrix:
+              print(" - {}: {}".format(entry["module"], entry["files"]))
+          EOF
+
+  sync:
+    name: Sync ${{ matrix.target.module }}
+    needs: detect
+    if: needs.detect.outputs.mode != 'none'
+    runs-on: ubuntu-latest
+    # Per-module concurrency: a newer push to master supersedes any in-flight
+    # sync for the same module (LLM run gets cancelled, latest run wins).
+    # Each matrix instance gets its own group, so different modules don't block.
+    concurrency:
+      group: zeeschuimer-sync-${{ matrix.target.module }}
+      cancel-in-progress: true
+    strategy:
+      fail-fast: false
+      matrix:
+        target: ${{ fromJson(needs.detect.outputs.matrix) }}
+    steps:
+      - name: Checkout 4CAT
+        uses: actions/checkout@v4
+        with:
+          # Full history so the PR body step can diff `before..after` even
+          # when a push contains several commits.
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install LLM dependencies
+        run: |
+          # LLMAdapter (common/lib/llm.py) imports every provider's langchain
+          # package at module load, so all of these are required even though
+          # we only use the Ollama provider at runtime.
+          pip install \
+            langchain-core \
+            langchain-ollama \
+            langchain-openai \
+            langchain-anthropic \
+            langchain-google-genai \
+            langchain-mistralai \
+            langchain-deepseek \
+            pydantic \
+            requests
+
+      - name: Mint Zeeschuimer App token
+        id: app_token
+        uses: actions/create-github-app-token@v1
+        with:
+          app-id: ${{ secrets.ZEESCHUIMER_APP_ID }}
+          private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }}
+          owner: digitalmethodsinitiative
+          repositories: zeeschuimer
+
+      - name: Checkout Zeeschuimer
+        uses: actions/checkout@v4
+        with:
+          repository: digitalmethodsinitiative/zeeschuimer
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+
+      - name: Run translation
+        env:
+          DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }}
+          LLM_MODEL: ${{ inputs.model || 'qwen2.5-coder:14b' }}
+          # Matrix values are passed through the environment rather than being
+          # expanded into the script text, so unusual characters in a file
+          # name cannot be interpreted as shell syntax.
+          TARGET_BOOTSTRAP: ${{ matrix.target.bootstrap }}
+          TARGET_FILES: ${{ matrix.target.files }}
+        run: |
+          if [ "$TARGET_BOOTSTRAP" = "true" ]; then
+            python helper-scripts/map_item_converter.py \
+              --bootstrap \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          else
+            # One argument per path; quoting also prevents glob expansion.
+            read -r -a target_files <<< "$TARGET_FILES"
+            python helper-scripts/map_item_converter.py \
+              --files "${target_files[@]}" \
+              --zeeschuimer-checkout ./zeeschuimer-checkout \
+              --output-manifest ./manifest.json
+          fi
+
+      - name: Build PR body
+        id: pr_body
+        env:
+          MODULE: ${{ matrix.target.module }}
+          BOOTSTRAP: ${{ matrix.target.bootstrap }}
+          BEFORE_SHA: ${{ github.event.before }}
+          AFTER_SHA: ${{ github.sha }}
+          RUN_ID: ${{ github.run_id }}
+          EVENT_NAME: ${{ github.event_name }}
+        run: |
+          python - <<'EOF'
+          import json
+          import os
+          import subprocess
+
+          module = os.environ["MODULE"]
+          is_bootstrap = os.environ.get("BOOTSTRAP", "").lower() == "true"
+          before = os.environ["BEFORE_SHA"]
+          after = os.environ["AFTER_SHA"]
+          run_id = os.environ["RUN_ID"]
+          event_name = os.environ["EVENT_NAME"]
+          repo = os.environ["GITHUB_REPOSITORY"]
+
+          with open("manifest.json") as f:
+              manifest = json.load(f)
+
+          model = manifest.get("model", "(unknown)")
+          provider = manifest.get("provider", "ollama")
+          structured_output = manifest.get("structured_output", False)
+          stream = manifest.get("stream", False)
+          total_duration = manifest.get("total_duration_seconds")
+          entries = manifest.get("entries", [])
+
+          short_sha = after[:7]
+          lines = []
+          lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). The JavaScript was produced by an LLM and **requires human review** before merging — including manual fixes for any lint warnings flagged below.".format(repo, run_id))
+          lines.append("")
+          lines.append("## Generation parameters")
+          lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream))
+          if total_duration is not None:
+              lines.append("- **Total LLM time:** {}s".format(total_duration))
+          if is_bootstrap:
+              lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).")
+          elif event_name == "workflow_dispatch":
+              lines.append("- **Trigger:** manual `workflow_dispatch` for `{}`.".format(module))
+          else:
+              lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master (module: `{}`).".format(short_sha, repo, after, module))
+          lines.append("")
+
+          ok = [e for e in entries if e["status"] == "ok"]
+          ok_with_warnings = [e for e in ok if e.get("lint_warnings")]
+          failed = [e for e in entries if e["status"] == "failed"]
+          skipped = [e for e in entries if e["status"] == "skipped"]
+
+          lines.append("## Summary")
+          lines.append("- :white_check_mark: {} translated".format(len(ok)))
+          if ok_with_warnings:
+              lines.append("- :warning: {} translated with lint warnings (require manual fix)".format(len(ok_with_warnings)))
+          lines.append("- :x: {} failed".format(len(failed)))
+          lines.append("- :grey_question: {} skipped".format(len(skipped)))
+          lines.append("")
+
+          if ok:
+              lines.append("| Datasource | Module | Time | Warnings |")
+              lines.append("|---|---|---:|---:|")
+              for entry in ok:
+                  dur = entry.get("duration_seconds")
+                  dur_cell = "{}s".format(dur) if dur is not None else "—"
+                  warn_count = len(entry.get("lint_warnings") or [])
+                  warn_cell = ":warning: {}".format(warn_count) if warn_count else "—"
+                  lines.append("| `{}` | `{}` | {} | {} |".format(entry["python_file"], entry["js_file"], dur_cell, warn_cell))
+              lines.append("")
+
+          if ok_with_warnings:
+              lines.append("## :warning: Lint warnings — fix before merging")
+              lines.append("")
+              lines.append("The following datasources translated successfully but the static lint flagged issues that need human fixes. The auto-generated code was spliced into the JS module as-is; please patch the file directly in this PR.")
+              lines.append("")
+              for entry in ok_with_warnings:
+                  lines.append("**`{}` -> `{}`**".format(entry["python_file"], entry["js_file"]))
+                  for w in entry["lint_warnings"]:
+                      lines.append("- {}".format(w))
+                  lines.append("")
+
+          for entry in ok:
+              dur = entry.get("duration_seconds")
+              header_dur = " ({}s)".format(dur) if dur is not None else ""
+              warn_marker = " :warning:" if entry.get("lint_warnings") else ""
+              lines.append("## `{}` -> `{}`{}{}".format(entry["python_file"], entry["js_file"], header_dur, warn_marker))
+              if entry.get("commentary"):
+                  lines.append("**LLM commentary:**")
+                  lines.append("")
+                  lines.append("> " + entry["commentary"].replace("\n", "\n> "))
+                  lines.append("")
+              if event_name == "push":
+                  try:
+                      diff = subprocess.check_output(
+                          ["git", "diff", "{}..{}".format(before, after), "--", entry["python_file"]],
+                          text=True,
+                      )
+                  except subprocess.CalledProcessError:
+                      diff = ""
+              else:
+                  diff = ""
+              if diff.strip():
+                  lines.append("<details><summary>Python diff</summary>")
+                  lines.append("")
+                  lines.append("```diff")
+                  lines.append(diff.rstrip())
+                  lines.append("```")
+                  lines.append("</details>")
+                  lines.append("")
+
+          if failed:
+              lines.append("## Failures")
+              for entry in failed:
+                  dur = entry.get("duration_seconds")
+                  dur_str = " (after {}s)".format(dur) if dur is not None else ""
+                  lines.append("- `{}`{}: {}".format(entry["python_file"], dur_str, entry.get("error", "(no error message)")))
+              lines.append("")
+
+          if skipped:
+              lines.append("## Skipped")
+              for entry in skipped:
+                  lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "")))
+              lines.append("")
+
+          body = "\n".join(lines)
+          with open("pr_body.md", "w", encoding="utf-8") as f:
+              f.write(body)
+          print("Wrote pr_body.md ({} chars)".format(len(body)))
+
+          # Title is single-module in the matrix path; bootstrap is its own
+          # special-case (one PR covering all 15 datasources).
+          ok_modules = []
+          for entry in ok:
+              parts = entry["python_file"].split("/")
+              if len(parts) >= 2 and parts[0] == "datasources":
+                  mod = parts[1]
+                  if mod not in ok_modules:
+                      ok_modules.append(mod)
+
+          if is_bootstrap:
+              title = "Auto-translated map_item updates from 4CAT (bootstrap, {} datasources)".format(len(ok_modules))
+          elif not ok_modules:
+              title = "Auto-translated map_item updates from 4CAT: {}".format(module)
+          else:
+              title = "Auto-translated map_item updates from 4CAT: {}".format(", ".join(ok_modules))
+
+          github_output = os.environ.get("GITHUB_OUTPUT")
+          if github_output:
+              with open(github_output, "a", encoding="utf-8") as f:
+                  f.write("title={}\n".format(title))
+          print("PR title: {}".format(title))
+          EOF
+
+      - name: Check there are JS changes to PR
+        id: have_changes
+        working-directory: zeeschuimer-checkout
+        run: |
+          if [ -z "$(git status --porcelain)" ]; then
+            echo "has_changes=false" >> "$GITHUB_OUTPUT"
+            echo "No JS changes produced by translation; not opening a PR."
+          else
+            echo "has_changes=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Open or update Zeeschuimer PR
+        if: steps.have_changes.outputs.has_changes == 'true'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          path: zeeschuimer-checkout
+          token: ${{ steps.app_token.outputs.token }}
+          # Stable per-module branch: a fresh push that retranslates the same
+          # module updates the same PR. Different modules never share a branch.
+          branch: auto/4cat-map-item-sync-${{ matrix.target.module }}
+          title: ${{ steps.pr_body.outputs.title }}
+          commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}"
+          body-path: pr_body.md
+          draft: true
diff --git a/.gitignore b/.gitignore
index 8850a7bcc..76dd51700 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,8 @@ webtool/venv/
 *.ipynb
 venv/
 __pycache__/
+.claude/
+extensions

 # do not ignore interface images
 !webtool/static/img/*.png
diff --git a/helper-scripts/map_item_converter.py b/helper-scripts/map_item_converter.py
new file mode 100644
index 000000000..6acda73e1
--- /dev/null
+++ b/helper-scripts/map_item_converter.py
@@ -0,0 +1,971 @@
+"""
+Translate 4CAT Zeeschuimer-import datasource `map_item` functions from Python
+to JavaScript and splice them into the corresponding Zeeschuimer
+`modules/<name>.js` file.
+
+Designed to be invoked by a GitHub Action whenever a Zeeschuimer datasource's
+Python file changes on master. Can also be run locally for testing or via
+`workflow_dispatch` with `--bootstrap` to translate every datasource at once.
+
+The LLM produces only the new `map_item` function (plus any imports/helpers it
+needs and free-text commentary). This script does the file integration: it
+locates a marker block in the existing JS module and replaces its contents,
+preserving every hand-written line outside the markers.
+
+Usage:
+    DMI_OLLAMA_KEY=... 
python helper-scripts/map_item_converter.py \\
+        --files datasources/tiktok/search_tiktok.py \\
+        --zeeschuimer-checkout ../zeeschuimer \\
+        --output-manifest /tmp/manifest.json
+"""
+import argparse
+import ast
+import json
+import os
+import re
+import sys
+import time
+import traceback
+from pathlib import Path
+from typing import Optional
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.path.dirname(__file__)), ".."))
+
+from common.lib.llm import LLMAdapter
+
+# Sibling module — lives next to this script in helper-scripts/. Python adds the
+# script's directory to sys.path automatically when the file is run directly.
+from map_item_rules import RULES, get_regex_lint_rules
+
+
+# 4CAT datasource path -> Zeeschuimer module path is derived by convention:
+# the Python file is `datasources/<module>/search_<name>.py`; the JS module is
+# `modules/<name>.js`. The convention only depends on the Python
+# *filename*, not the directory, so cases where they differ still work
+# (e.g. `xiaohongshu/search_rednote.py` -> `modules/rednote.js`,
+# `twitter-import/search_twitter.py` -> `modules/twitter.js`).
+#
+# Datasources without a matching Zeeschuimer module (today: facebook) are
+# skipped automatically — the JS file existence check in `translate_one`
+# handles them without any explicit allow-list. New Zeeschuimer datasources
+# added to 4CAT are picked up automatically as long as Zeeschuimer ships the
+# matching `modules/<name>.js` file.
+def python_to_js_module(python_rel: str) -> Optional[str]:
+    """
+    Derive the Zeeschuimer module path for a 4CAT datasource Python file (Windows separators and a leading "./" are tolerated).
+    Returns None if the path doesn't follow `datasources/<module>/search_*.py`.
+    """
+    parts = [p for p in python_rel.replace("\\", "/").split("/") if p not in ("", ".")]
+    if len(parts) != 3 or parts[0] != "datasources":
+        return None
+    filename = parts[2]
+    if not filename.startswith("search_") or not filename.endswith(".py"):
+        return None
+    base = filename[len("search_"):-len(".py")]
+    if not base:
+        return None
+    return f"modules/{base.replace('_', '-')}.js"
+
+DEFAULT_MODEL = "qwen2.5-coder:14b"
+
+IMPORTS_MARKER_START = "// === auto-generated imports for map_item — BLOCK REPLACED AUTOMATICALLY ==="
+IMPORTS_MARKER_END = "// === end auto-generated imports ==="
+BLOCK_MARKER_START = "// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ==="
+BLOCK_MARKER_END = "// === end auto-generated ==="
+
+LLM_SCHEMA = {
+    "title": "MapItemTranslation",
+    "type": "object",
+    "required": ["map_item_function", "imports_to_add", "helpers_to_add", "commentary"],
+    "properties": {
+        "map_item_function": {
+            "type": "string",
+            "description": (
+                "Full JavaScript source of the new map_item function. Must include "
+                "the function declaration (e.g. 'export function map_item(item) {...}'). "
+                "Do not include surrounding code from the module."
+            ),
+        },
+        "imports_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Complete ES-module import statements that map_item needs. "
+                "Normally empty: Zeeschuimer's `js/lib.js` (which provides MappedItem, "
+                "MissingMappedField, normalize_url_encoding, strip_tags) is loaded as a "
+                "plain script, NOT an ES module — its declarations are already global, "
+                "so do not write `import { X } from '../js/lib.js'`. Only populate this "
+                "if you genuinely need to import from another ES module."
+            ),
+        },
+        "helpers_to_add": {
+            "type": "array",
+            "items": {"type": "string"},
+            "description": (
+                "Full source of any helper functions map_item depends on (e.g. a JS "
+                "port of normalize_url_encoding). Empty array if none needed."
+            ),
+        },
+        "commentary": {
+            "type": "string",
+            "description": (
+                "Notes for the human reviewer: assumptions made, fields you were "
+                "unsure about, Python idioms that don't translate cleanly. Plain text."
+            ),
+        },
+    },
+}
+
+SYSTEM_PROMPT = (
+    "You translate 4CAT Python `map_item` functions into JavaScript for the "
+    "Zeeschuimer browser extension. You return ONLY the new map_item function, "
+    "any imports it needs, any helper functions it depends on, and commentary "
+    "for the human reviewer. You NEVER return the surrounding module file. "
+    "You preserve the field names produced by the Python function exactly. "
+    "You do not invent fields not present in the Python output. "
+    "You output raw JavaScript source — never wrap it in markdown code fences "
+    "(```js, ```javascript, etc.). The fields in your structured response are "
+    "already typed as code; fences make them invalid."
+)
+
+
+# Whitelist of helpers that Zeeschuimer makes available as globals at runtime.
+# `js/lib.js` is loaded as a plain