Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
377 changes: 372 additions & 5 deletions .github/workflows/zeeschuimer_map_item_sync.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,381 @@
# Bootstrap the Zeeschuimer map_item sync workflow
# This is necessary to test workflow in PR (so far as I can tell)
# Auto-translate Zeeschuimer datasource `map_item` functions from Python to JS
# and open a draft PR per module against digitalmethodsinitiative/zeeschuimer.
#
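# Triggers on pushes to master that touch any Zeeschuimer datasource, the
# helper script, or this workflow file. Only changed datasource files are
# translated, so a push that touches just the script or the workflow results
# in a no-op run. Also exposes a `workflow_dispatch` trigger with a
# `bootstrap` input for the initial run that translates all 15 datasources at
# once (single PR), a `files` input to translate specific datasource files
# (one PR per module), and a `model` input to override the LLM model.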
#
# Architecture: a `detect` job groups changed files by module and emits a
# matrix; a `sync` job fans out one parallel run per module, each opening
# (or updating) its own PR on a stable per-module branch.
#
# Required secrets (configured in repo Settings -> Secrets and variables -> Actions):
# DMI_OLLAMA_KEY - API key for https://ollama.digitalmethods.net (already used by helper script)
# ZEESCHUIMER_APP_ID - numeric App ID of the GitHub App installed on
# digitalmethodsinitiative/zeeschuimer with permissions
# contents:write + pull-requests:write (and nothing else)
# ZEESCHUIMER_APP_PRIVATE_KEY - full PEM private key for that App (including BEGIN/END lines)

name: Sync Zeeschuimer map_item from 4CAT

on:
  # Automatic run: any push to master that touches a datasource search
  # module, the converter script, or this workflow definition.
  push:
    branches:
      - master
    paths:
      - "datasources/**/search_*.py"
      - "helper-scripts/map_item_converter.py"
      - ".github/workflows/zeeschuimer_map_item_sync.yml"
  # Manual run from the Actions tab. `files` takes precedence over
  # `bootstrap`; `model` is optional and falls back to the default model.
  workflow_dispatch:
    inputs:
      bootstrap:
        description: 'Translate every Zeeschuimer datasource (initial sync, single PR). Ignored if "files" is set.'
        type: boolean
        default: false
      files:
        description: 'Space-separated list of datasource files to translate (e.g. "datasources/tiktok/search_tiktok.py"). Overrides bootstrap. One PR per module.'
        type: string
        default: ""
      model:
        description: 'Override LLM model on DMI Ollama (default: qwen2.5-coder:14b). Examples: qwen2.5-coder:7b, deepseek-coder-v2:16b, gemma3:27b'
        type: string
        default: ""

jobs:
sync-map-item:
detect:
  # Decides what the `sync` job has to do and publishes it as two outputs:
  #   mode   - "bootstrap", "files" or "none"
  #   matrix - JSON list of {module, files, bootstrap} entries, one per PR
  name: Detect modules to translate
  runs-on: ubuntu-latest
  outputs:
    mode: ${{ steps.plan.outputs.mode }}
    matrix: ${{ steps.plan.outputs.matrix }}
  steps:
    - name: Checkout 4CAT
      uses: actions/checkout@v4
      with:
        # Shallow clone; the plan step fetches the push base commit on demand
        # when it is not part of these two commits.
        fetch-depth: 2

    - name: Plan translation matrix
      id: plan
      env:
        EVENT_NAME: ${{ github.event_name }}
        INPUTS_FILES: ${{ inputs.files }}
        INPUTS_BOOTSTRAP: ${{ inputs.bootstrap }}
        BEFORE_SHA: ${{ github.event.before }}
        AFTER_SHA: ${{ github.sha }}
      run: |
        python - <<'EOF'
        import json
        import os
        import subprocess

        event_name = os.environ["EVENT_NAME"]
        inputs_files = os.environ.get("INPUTS_FILES", "").strip()
        inputs_bootstrap = os.environ.get("INPUTS_BOOTSTRAP", "").lower() == "true"
        before = os.environ.get("BEFORE_SHA", "").strip()
        after = os.environ.get("AFTER_SHA", "").strip() or "HEAD"
        out_path = os.environ["GITHUB_OUTPUT"]

        def emit(mode, matrix):
            """Append the `mode` and `matrix` step outputs to GITHUB_OUTPUT."""
            with open(out_path, "a", encoding="utf-8") as f:
                f.write("mode={}\n".format(mode))
                f.write("matrix={}\n".format(json.dumps(matrix)))

        def have_commit(rev):
            """Return True when `rev` resolves to a commit in the local clone."""
            return subprocess.call(
                ["git", "cat-file", "-e", rev + "^{commit}"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            ) == 0

        def resolve_diff_base():
            """Return a commit to diff `after` against, or None if there is none.

            The clone is shallow, so the push base is missing whenever a push
            carries more than one commit (or was a force push). Fetch it on
            demand; if that fails, or the base is the all-zero SHA, fall back
            to the parent of `after` and say so in the log.
            """
            usable = bool(before) and set(before) != {"0"}
            if usable and not have_commit(before):
                subprocess.call(["git", "fetch", "--no-tags", "--depth=1", "origin", before])
            if usable and have_commit(before):
                return before
            parent = after + "~1"
            if have_commit(parent):
                print("::warning::Push base {!r} is unavailable; diffing against {} instead.".format(before, parent))
                return parent
            return None

        # Bootstrap is special: one PR for all datasources.
        # Explicit `files` input overrides bootstrap; honor that.
        if event_name == "workflow_dispatch" and inputs_bootstrap and not inputs_files:
            emit("bootstrap", [{"module": "bootstrap", "files": "", "bootstrap": True}])
            print("Plan: bootstrap (single PR)")
            raise SystemExit(0)

        # Resolve the file list to translate.
        files = []
        if event_name == "workflow_dispatch" and inputs_files:
            files = inputs_files.split()
        elif event_name != "push":
            # Manual run without `files` or `bootstrap`: nothing was requested.
            print("No files requested and bootstrap is off; nothing to diff.")
        else:
            # push event: diff datasource files between before and after.
            base = resolve_diff_base()
            if base is None:
                print("::warning::No usable base commit to diff against; nothing will be translated.")
            else:
                try:
                    out = subprocess.check_output(
                        # --diff-filter=d drops deleted files: there is no
                        # Python source left to translate for those.
                        ["git", "diff", "--name-only", "--diff-filter=d", base, after, "--",
                         "datasources/*/search_*.py"],
                        text=True,
                    )
                    files = [f for f in out.splitlines() if f.strip()]
                except subprocess.CalledProcessError as exc:
                    # Keep the run green, but make the skipped sync visible.
                    print("::warning::git diff failed (exit {}); nothing will be translated.".format(exc.returncode))

        # Group by module: datasources/<module>/search_*.py
        modules = {}
        for path in files:
            parts = path.split("/")
            if len(parts) >= 3 and parts[0] == "datasources":
                modules.setdefault(parts[1], []).append(path)
            else:
                print("::warning::Ignoring path outside datasources/<module>/: {}".format(path))

        if not modules:
            emit("none", [])
            print("Plan: nothing to translate")
            raise SystemExit(0)

        # One matrix entry (and therefore one PR) per module.
        matrix = [
            {"module": mod, "files": " ".join(sorted(paths)), "bootstrap": False}
            for mod, paths in sorted(modules.items())
        ]
        emit("files", matrix)
        print("Plan: {} module(s)".format(len(matrix)))
        for entry in matrix:
            print("  - {}: {}".format(entry["module"], entry["files"]))
        EOF

sync:
  # Runs once per entry of the matrix planned by `detect`: translates the
  # entry's datasource files, builds a PR description from the converter's
  # manifest, and opens or updates a draft PR on the Zeeschuimer repository.
  name: Sync ${{ matrix.target.module }}
  needs: detect
  if: needs.detect.outputs.mode != 'none'
  runs-on: ubuntu-latest
  # Per-module concurrency: a newer push to master supersedes any in-flight
  # sync for the same module (LLM run gets cancelled, latest run wins).
  # Each matrix instance gets its own group, so different modules don't block.
  concurrency:
    group: zeeschuimer-sync-${{ matrix.target.module }}
    cancel-in-progress: true
  strategy:
    fail-fast: false
    matrix:
      target: ${{ fromJson(needs.detect.outputs.matrix) }}
  steps:
    - name: Checkout 4CAT
      uses: actions/checkout@v4
      with:
        fetch-depth: 2

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"

    - name: Install LLM dependencies
      run: |
        # LLMAdapter (common/lib/llm.py) imports every provider's langchain
        # package at module load, so all of these are required even though
        # we only use the Ollama provider at runtime.
        pip install \
          langchain-core \
          langchain-ollama \
          langchain-openai \
          langchain-anthropic \
          langchain-google-genai \
          langchain-mistralai \
          langchain-deepseek \
          pydantic \
          requests

    # NOTE(review): GitHub App installation tokens expire after one hour. If a
    # translation run (especially bootstrap) can take longer than that, mint a
    # fresh token right before the PR step instead of reusing this one.
    - name: Mint Zeeschuimer App token
      id: app_token
      uses: actions/create-github-app-token@v1
      with:
        app-id: ${{ secrets.ZEESCHUIMER_APP_ID }}
        private-key: ${{ secrets.ZEESCHUIMER_APP_PRIVATE_KEY }}
        owner: digitalmethodsinitiative
        repositories: zeeschuimer

    - name: Checkout Zeeschuimer
      uses: actions/checkout@v4
      with:
        repository: digitalmethodsinitiative/zeeschuimer
        path: zeeschuimer-checkout
        token: ${{ steps.app_token.outputs.token }}

    - name: Run translation
      env:
        DMI_OLLAMA_KEY: ${{ secrets.DMI_OLLAMA_KEY }}
        LLM_MODEL: ${{ inputs.model || 'qwen2.5-coder:14b' }}
        # Matrix values can originate from the free-text `files` input, so
        # they are handed over as environment variables rather than being
        # substituted into the script text.
        TARGET_BOOTSTRAP: ${{ matrix.target.bootstrap }}
        TARGET_FILES: ${{ matrix.target.files }}
      run: |
        if [ "$TARGET_BOOTSTRAP" = "true" ]; then
          python helper-scripts/map_item_converter.py \
            --bootstrap \
            --zeeschuimer-checkout ./zeeschuimer-checkout \
            --output-manifest ./manifest.json
        else
          # Split the space-separated list into separate arguments without
          # letting the shell glob-expand or re-interpret any of them.
          read -r -a target_files <<< "$TARGET_FILES"
          python helper-scripts/map_item_converter.py \
            --files "${target_files[@]}" \
            --zeeschuimer-checkout ./zeeschuimer-checkout \
            --output-manifest ./manifest.json
        fi

    - name: Build PR body
      id: pr_body
      env:
        MODULE: ${{ matrix.target.module }}
        BOOTSTRAP: ${{ matrix.target.bootstrap }}
        BEFORE_SHA: ${{ github.event.before }}
        AFTER_SHA: ${{ github.sha }}
        RUN_ID: ${{ github.run_id }}
        EVENT_NAME: ${{ github.event_name }}
      run: |
        python - <<'EOF'
        import json
        import os
        import subprocess

        module = os.environ["MODULE"]
        is_bootstrap = os.environ.get("BOOTSTRAP", "").lower() == "true"
        before = os.environ["BEFORE_SHA"]
        after = os.environ["AFTER_SHA"]
        run_id = os.environ["RUN_ID"]
        event_name = os.environ["EVENT_NAME"]
        # Default variable set by the runner; avoids templating into the script.
        repo = os.environ["GITHUB_REPOSITORY"]

        # Written by helper-scripts/map_item_converter.py (--output-manifest).
        with open("manifest.json") as f:
            manifest = json.load(f)

        model = manifest.get("model", "(unknown)")
        provider = manifest.get("provider", "ollama")
        structured_output = manifest.get("structured_output", False)
        stream = manifest.get("stream", False)
        total_duration = manifest.get("total_duration_seconds")
        entries = manifest.get("entries", [])

        def have_commit(rev):
            """Return True when `rev` resolves to a commit in the local clone."""
            return subprocess.call(
                ["git", "cat-file", "-e", rev + "^{commit}"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            ) == 0

        # The clone is shallow, so the push base may be missing (multi-commit
        # or force push). Fetch it once, best effort; without it the Python
        # diffs are simply left out of the PR body.
        can_diff = False
        if event_name == "push" and before and set(before) != {"0"}:
            if not have_commit(before):
                subprocess.call(["git", "fetch", "--no-tags", "--depth=1", "origin", before])
            can_diff = have_commit(before)

        short_sha = after[:7]
        lines = []
        lines.append("> :robot: This PR was auto-generated by the [4CAT map_item sync workflow](https://github.com/{}/actions/runs/{}). The JavaScript was produced by an LLM and **requires human review** before merging — including manual fixes for any lint warnings flagged below.".format(repo, run_id))
        lines.append("")
        lines.append("## Generation parameters")
        lines.append("- **Model:** `{}` (provider: `{}`, structured output: `{}`, stream: `{}`)".format(model, provider, structured_output, stream))
        if total_duration is not None:
            lines.append("- **Total LLM time:** {}s".format(total_duration))
        if is_bootstrap:
            lines.append("- **Trigger:** manual `workflow_dispatch` with `bootstrap=true` (initial sync of all Zeeschuimer datasources).")
        elif event_name == "workflow_dispatch":
            lines.append("- **Trigger:** manual `workflow_dispatch` for `{}`.".format(module))
        else:
            lines.append("- **Trigger:** push of [`{}`](https://github.com/{}/commit/{}) to 4CAT master (module: `{}`).".format(short_sha, repo, after, module))
        lines.append("")

        # Bucket manifest entries by outcome.
        ok = [e for e in entries if e["status"] == "ok"]
        ok_with_warnings = [e for e in ok if e.get("lint_warnings")]
        failed = [e for e in entries if e["status"] == "failed"]
        skipped = [e for e in entries if e["status"] == "skipped"]

        lines.append("## Summary")
        lines.append("- :white_check_mark: {} translated".format(len(ok)))
        if ok_with_warnings:
            lines.append("- :warning: {} translated with lint warnings (require manual fix)".format(len(ok_with_warnings)))
        lines.append("- :x: {} failed".format(len(failed)))
        lines.append("- :grey_question: {} skipped".format(len(skipped)))
        lines.append("")

        if ok:
            lines.append("| Datasource | Module | Time | Warnings |")
            lines.append("|---|---|---:|---:|")
            for entry in ok:
                dur = entry.get("duration_seconds")
                dur_cell = "{}s".format(dur) if dur is not None else "—"
                warn_count = len(entry.get("lint_warnings") or [])
                warn_cell = ":warning: {}".format(warn_count) if warn_count else "—"
                lines.append("| `{}` | `{}` | {} | {} |".format(entry["python_file"], entry["js_file"], dur_cell, warn_cell))
            lines.append("")

        if ok_with_warnings:
            lines.append("## :warning: Lint warnings — fix before merging")
            lines.append("")
            lines.append("The following datasources translated successfully but the static lint flagged issues that need human fixes. The auto-generated code was spliced into the JS module as-is; please patch the file directly in this PR.")
            lines.append("")
            for entry in ok_with_warnings:
                lines.append("**`{}` -> `{}`**".format(entry["python_file"], entry["js_file"]))
                for w in entry["lint_warnings"]:
                    lines.append("- {}".format(w))
                lines.append("")

        # One section per translated datasource: commentary plus, on push
        # events, the Python diff that triggered the translation.
        for entry in ok:
            dur = entry.get("duration_seconds")
            header_dur = " ({}s)".format(dur) if dur is not None else ""
            warn_marker = " :warning:" if entry.get("lint_warnings") else ""
            lines.append("## `{}` -> `{}`{}{}".format(entry["python_file"], entry["js_file"], header_dur, warn_marker))
            if entry.get("commentary"):
                lines.append("**LLM commentary:**")
                lines.append("")
                lines.append("> " + entry["commentary"].replace("\n", "\n> "))
                lines.append("")
            if can_diff:
                try:
                    diff = subprocess.check_output(
                        ["git", "diff", "{}..{}".format(before, after), "--", entry["python_file"]],
                        text=True,
                    )
                except subprocess.CalledProcessError:
                    # The diff is a nicety; never fail the PR over it.
                    diff = ""
            else:
                diff = ""
            if diff.strip():
                lines.append("<details><summary>Python diff</summary>")
                lines.append("")
                lines.append("```diff")
                lines.append(diff.rstrip())
                lines.append("```")
                lines.append("</details>")
                lines.append("")

        if failed:
            lines.append("## Failures")
            for entry in failed:
                dur = entry.get("duration_seconds")
                dur_str = " (after {}s)".format(dur) if dur is not None else ""
                lines.append("- `{}`{}: {}".format(entry["python_file"], dur_str, entry.get("error", "(no error message)")))
            lines.append("")

        if skipped:
            lines.append("## Skipped")
            for entry in skipped:
                lines.append("- `{}`: {}".format(entry["python_file"], entry.get("error", "")))
            lines.append("")

        body = "\n".join(lines)
        with open("pr_body.md", "w", encoding="utf-8") as f:
            f.write(body)
        print("Wrote pr_body.md ({} chars)".format(len(body)))

        # Title is single-module in the matrix path; bootstrap is its own
        # special-case (one PR covering all 15 datasources).
        ok_modules = []
        for entry in ok:
            parts = entry["python_file"].split("/")
            if len(parts) >= 2 and parts[0] == "datasources":
                mod = parts[1]
                if mod not in ok_modules:
                    ok_modules.append(mod)

        if is_bootstrap:
            title = "Auto-translated map_item updates from 4CAT (bootstrap, {} datasources)".format(len(ok_modules))
        elif not ok_modules:
            title = "Auto-translated map_item updates from 4CAT: {}".format(module)
        else:
            title = "Auto-translated map_item updates from 4CAT: {}".format(", ".join(ok_modules))

        # Expose the title to the PR step as a step output.
        github_output = os.environ.get("GITHUB_OUTPUT")
        if github_output:
            with open(github_output, "a", encoding="utf-8") as f:
                f.write("title={}\n".format(title))
        print("PR title: {}".format(title))
        EOF

    - name: Check there are JS changes to PR
      id: have_changes
      working-directory: zeeschuimer-checkout
      run: |
        # An empty porcelain status means the translation left the
        # Zeeschuimer working tree untouched.
        if [ -z "$(git status --porcelain)" ]; then
          echo "has_changes=false" >> "$GITHUB_OUTPUT"
          echo "No JS changes produced by translation; not opening a PR."
        else
          echo "has_changes=true" >> "$GITHUB_OUTPUT"
        fi

    - name: Open or update Zeeschuimer PR
      if: steps.have_changes.outputs.has_changes == 'true'
      uses: peter-evans/create-pull-request@v6
      with:
        path: zeeschuimer-checkout
        token: ${{ steps.app_token.outputs.token }}
        # Stable per-module branch: a fresh push that retranslates the same
        # module updates the same PR. Different modules never share a branch.
        branch: auto/4cat-map-item-sync-${{ matrix.target.module }}
        title: ${{ steps.pr_body.outputs.title }}
        commit-message: "chore: sync map_item for ${{ matrix.target.module }} from 4CAT ${{ github.sha }}"
        body-path: pr_body.md
        draft: true
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ webtool/venv/
*.ipynb
venv/
__pycache__/
.claude/
extensions

# do not ignore interface images
!webtool/static/img/*.png
Expand Down
Loading
Loading