nashsu · RickSanchez88E · Apr 29, 2026 · May 1, 2026 · May 1, 2026 · May 2, 2026
diff --git a/.DS_Store b/.DS_Store
diff --git a/.github/workflows/deploy-microservice.yml b/.github/workflows/deploy-microservice.yml
@@ -0,0 +1,129 @@
+# Builds the autocli binary plus the chrome and daily container images.
+name: deploy-microservice
+
+on:
+  push:
+    branches:
+      - feat/daily-microservice
+      - main
+    # Path filter for push events. Anchored so the pull_request trigger
+    # further down reuses exactly the same list.
+    paths: &trigger_paths
+      - 'deploy/**'
+      - 'crates/**'
+      - 'scripts/sync_autocli_jobs.py'
+      - 'scripts/job_priority_scorer.py'
+      - 'scripts/job_priority_config.py'
+      - 'scripts/sponsor_filter.py'
+      - 'supabase/migrations/**'
+      - 'rust-toolchain.toml'
+      - '.github/workflows/deploy-microservice.yml'
+  pull_request:
+    paths: *trigger_paths
+  workflow_dispatch:
+
+env:
+  # Both flags are read by the image jobs through string comparisons
+  # (env.IS_MAIN == 'false', env.IS_PUSH == 'true').
+  IS_MAIN: ${{ github.ref == 'refs/heads/main' }}
+  IS_PUSH: ${{ github.event_name == 'push' }}
+
+jobs:
+  build-autocli-binary:
+    runs-on: ubuntu-latest
+    # Build inside Debian Bookworm to match the daily runtime image's GLIBC
+    # (python:3.12-slim-bookworm = GLIBC 2.36). Building on ubuntu-latest host
+    # gave GLIBC 2.39 binaries that wouldn't load in the runtime image with
+    # "GLIBC_2.39 not found".
+    container:
+      image: rust:1.94-slim-bookworm
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install build deps
+        run: |
+          apt-get update -qq
+          apt-get install -y -qq --no-install-recommends pkg-config libssl-dev
+      - uses: Swatinem/rust-cache@v2
+      - run: cargo build --release -p autocli
+      - name: Verify binary GLIBC requirement is bookworm-compatible
+        run: |
+          # readelf is shipped in binutils; install it in case the slim image lacks it.
+          apt-get install -y -qq --no-install-recommends binutils
+          # Bookworm ships GLIBC 2.36; the binary must not need anything newer.
+          MAX_GLIBC=2.36
+          # Highest GLIBC_x.y symbol version the binary references. readelf's
+          # stderr is left visible so a missing/unreadable binary is diagnosable.
+          REQ=$(readelf -V target/release/autocli | grep -oE 'GLIBC_[0-9]+(\.[0-9]+)*' | sort -V | tail -n 1) || true
+          if [ -z "$REQ" ]; then
+            # Nothing found means nothing was verified; never pass silently.
+            echo "FAIL: could not determine GLIBC requirement of target/release/autocli" >&2
+            exit 1
+          fi
+          echo "max GLIBC requirement: $REQ"
+          NEEDED=${REQ#GLIBC_}
+          # dpkg compares dotted components numerically (2.4 < 2.36 < 2.39).
+          if dpkg --compare-versions "$NEEDED" gt "$MAX_GLIBC"; then
+            echo "FAIL: binary needs $REQ but runtime image is GLIBC $MAX_GLIBC" >&2
+            exit 1
+          fi
+      - uses: actions/upload-artifact@v4
+        with:
+          name: autocli-bin
+          path: target/release/autocli
+          retention-days: 7
+          # Fail here, not in the downstream image job, if the binary is missing.
+          if-no-files-found: error
+
+  build-chrome-image:
+    # Builds deploy/chrome/Dockerfile for linux/amd64. The image is pushed
+    # only when the triggering event is a push (see `push:` on the last step).
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ github.actor }}
+      # Tag rules: `main` when IS_MAIN is true, `branch-<slug>` when it is
+      # false, and a `sha-` prefixed short-SHA tag with no enable condition.
+      # `type=ref,event=branch` goes through metadata-action's slugifier, so
+      # `feat/daily-microservice` turns into `branch-feat-daily-microservice`
+      # (Docker-tag-safe).
+      - uses: docker/metadata-action@v5
+        id: meta
+        with:
+          flavor: latest=false
+          images: ghcr.io/ricksanchez88e/autocli-chrome
+          tags: |
+            type=raw,value=main,enable=${{ env.IS_MAIN }}
+            type=ref,event=branch,prefix=branch-,enable=${{ env.IS_MAIN == 'false' }}
+            type=sha,prefix=sha-,format=short
+      - uses: docker/build-push-action@v6
+        with:
+          file: deploy/chrome/Dockerfile
+          context: .
+          platforms: linux/amd64
+          push: ${{ env.IS_PUSH == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
+
+  build-daily-image:
+    # Builds deploy/daily/Dockerfile for linux/amd64 after the autocli binary
+    # job has finished; the image is pushed only on push events.
+    needs: [build-autocli-binary]
+    runs-on: ubuntu-latest
+    permissions:
+      packages: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+      # Fetch the `autocli-bin` artifact into deploy/daily/bin and mark it
+      # executable before the image build runs.
+      - uses: actions/download-artifact@v4
+        with:
+          path: deploy/daily/bin
+          name: autocli-bin
+      - run: chmod +x deploy/daily/bin/autocli
+      - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          password: ${{ secrets.GITHUB_TOKEN }}
+          username: ${{ github.actor }}
+      # Tag rules: `main` when IS_MAIN is true, a `branch-` prefixed ref tag
+      # when it is false, and a `sha-` prefixed short-SHA tag unconditionally.
+      - uses: docker/metadata-action@v5
+        id: meta
+        with:
+          flavor: latest=false
+          images: ghcr.io/ricksanchez88e/autocli-daily
+          tags: |
+            type=raw,value=main,enable=${{ env.IS_MAIN }}
+            type=ref,event=branch,prefix=branch-,enable=${{ env.IS_MAIN == 'false' }}
+            type=sha,prefix=sha-,format=short
+      - uses: docker/build-push-action@v6
+        with:
+          file: deploy/daily/Dockerfile
+          context: .
+          platforms: linux/amd64
+          push: ${{ env.IS_PUSH == 'true' }}
+          tags: ${{ steps.meta.outputs.tags }}
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,20 @@
 /opencli
 test-results*.md
 twitter-downloads/
+
+# Local project files
+CHANGELOG.md
+/output/
+/test/
+.env
+
+# Python
+__pycache__/
+
+# macOS + local tooling noise
+.DS_Store
+.playwright-mcp/
+.serena/
+
+# Phase 0 local build output (CI downloads as artifact; never commit the binary)
+deploy/daily/bin/autocli
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,136 @@
+# AGENTS.md
+## Core Rule
+
+Use Serena first for code intelligence on non-trivial coding tasks, and use bounded subagents for complex engineering work.
+
+Do not claim Serena or subagents were used unless they actually were. If a required tool is unavailable, say so and continue with the smallest safe fallback.
+
+## Serena Workflow
+
+At the start of any non-trivial coding task (see definitions below), unfamiliar-code task, bug investigation, shared-symbol change, or cross-file change, follow the numbered steps listed after the definitions.
+
+Definitions:
+- **Non-trivial**: Any change that affects ≥1 function with external dependencies, touches ≥3 files, or requires architectural reasoning
+- **Trivial**: Typo fixes, one-line config changes, single-file docs edits without code path impact
+
+Do not run the full Serena workflow for trivial tasks unless the code path is unfamiliar or risky.
+
+1. Check Serena availability.
+2. Run `serena.get_current_config`.
+3. If the active Serena project does not match the repository root, run `serena.activate_project`.
+4. Run `serena.check_onboarding_performed`.
+5. If onboarding is missing, run `serena.onboarding`.
+6. Read only relevant Serena memories.
+
+If Serena is unavailable, say:
+
+> Serena MCP is unavailable; falling back to built-in search/read tools.
+
+Then continue with targeted `rg`, file reads, and normal verification.
+
+Do not run the full Serena workflow for typo fixes, simple docs edits, or one-line config changes unless the code path is unfamiliar or risky.
+
+## Serena Navigation
+
+Prefer Serena before broad file reads:
+
+1. `serena.get_symbols_overview` for unfamiliar files.
+2. `serena.find_symbol` for functions, classes, handlers, schemas, adapters, providers, components, exported APIs, and config objects.
+3. `serena.find_referencing_symbols` before changing shared/public symbols.
+4. `serena.find_implementations` for interfaces, adapters, providers, and polymorphic dispatch.
+5. `serena.get_diagnostics_for_file` after meaningful edits.
+
+Use raw `rg`, grep, or full-file reads only when:
+
+- the target is not code,
+- the symbol name is unknown,
+- Serena cannot resolve the result,
+- Serena has already narrowed the search area,
+- or the task is trivial enough that Serena overhead exceeds value.
+
+Do not read entire large files first.
+
+## Editing Rules
+
+Before editing:
+
+- Map the real call path.
+- Check references for shared/exported symbols.
+- Pick the smallest safe patch.
+- Avoid unrelated files.
+- Prefer symbol-level edits for whole functions/classes/methods.
+- Add or update tests when behavior changes.
+
+After editing:
+
+1. Run the smallest relevant verification first (see Verification Tiers below).
+2. Then run broader checks if the change is cross-file or high-risk.
+3. Summarize changed files, reason, and verification result.
+
+Verification Tiers:
+- **Tier 1 (local)**: Single unit test or type check for the edited function/method
+- **Tier 2 (module)**: All tests in the affected package/directory
+- **Tier 3 (integration)**: Cross-module or end-to-end verification for cross-file/high-risk changes
+
+## Subagent Policy
+
+Use subagents for:
+
+- cross-file or cross-module changes,
+- unknown root cause,
+- refactors,
+- security/auth changes,
+- data-loss or migration risk,
+- queue/worker/scraper/infra changes,
+- PR or adversarial review,
+- bugs where investigation, review, and fix can be separated.
+
+Do not use subagents for:
+
+- direct Q&A,
+- typo fixes,
+- one-file trivial edits,
+- simple config changes,
+- tasks where overhead exceeds value.
+
+If subagents are unavailable, say so and continue in the parent agent using the same sequence manually: explore read-only, review risks, patch only if needed, then verify.
+
+## Subagent Roles
+
+- `explorer`: read-only. Map execution paths, symbols, references, data flow, likely owners, and risky files.
+- `reviewer`: read-only. Look for correctness bugs, regressions, race conditions, idempotency issues, auth/security problems, migration/data-loss risks, missing tests, and rollback gaps.
+- `fixer`: may edit only after the code path is understood. Keep the patch small, avoid unrelated files, use Serena reference checks, and verify targeted changes.
+
+Subagents may recommend actions, but must not broaden scope, introduce new architecture, or modify unrelated modules without parent approval.
+
+## Subagent Flow
+
+For complex tasks:
+
+1. Spawn `explorer` first.
+2. Spawn `reviewer` in parallel only when risk review helps.
+3. Wait for read-only findings.
+4. Summarize the evidence.
+5. Spawn `fixer` only if a patch is needed.
+6. Run verification.
+7. For high-risk changes, run one final reviewer pass.
+
+Default limit:
+
+- `explorer`: at most 1 before editing
+- `reviewer`: at most 1, either in parallel with `explorer` or after it
+- `fixer`: at most 1, only after read-only findings are complete
+- Do not create more subagents unless the user explicitly asks or a P0/P1 risk remains unresolved.
+- Maximum total: 3 subagents per task (2 read-only + 1 fixer)
+
+Subagents must return:
+
+- scope inspected,
+- Serena tools used,
+- key symbols/files,
+- findings,
+- risks,
+- recommended next action,
+- confidence level.
+
+Parent Codex owns the final decision.
diff --git a/CHANGELOG_pipeline.md b/CHANGELOG_pipeline.md
@@ -0,0 +1,21 @@
+# JD Structured Extraction Pipeline — Changelog
+
+## [0.1.0] — 2026-05-03
+
+### Added
+
+- **Pipeline orchestrator** (`jd_pipeline.py`): CLI tool (`--input`, `--dry-run`, `--limit`) that reads `output/final.json`, preprocesses JDs, extracts structured JSON via local LLM, and upserts results into Supabase.
+- **LLM client** (`jd_pipeline_llm.py`): Async batch client for llama.cpp `/chat/completions` with grammar-constrained generation (`json_schema`), 3-attempt retry (standard → repair with validation feedback → minimal), dynamic timeout, semaphore-limited concurrency, and latency tracking.
+- **Database client** (`jd_pipeline_db.py`): Atomic `claim_job` / `upsert_job_structured` / `mark_dead_letter` / `reap_stale_processing` RPCs, extraction_runs bookkeeping, `.env` auto-loading.
+- **Config** (`jd_pipeline_config.py`): Version constants, schema definitions (`JD_SCHEMA` + `MINIMAL_SCHEMA`), LLM/Supabase connection params, token limits, context-size tiers.
+- **Preprocessor** (`jd_pipeline_preprocess.py`): LinkedIn boilerplate removal, NFKC normalization, control-char strip, SHA-256 hashing.
+- **Supabase migrations** (6 files + RPC grants): `jobs` columns for structured extraction, `extraction_runs` table, `dead_letter_records` with stage/error tracking, atomic RPC functions with run-id guards.
+- **Per-run reporting**: Console summary with failed-jobs detail (URL, stage, error class, message) + JSON report file in `output/`.
+
+### Fixed
+
+- `dead_letter_records.reason` and `source_schema`/`source_job_id` made nullable — prevents write failures when fields are absent.
+- `skills.maxItems` raised from 30 to 50 to accommodate verbose model output.
+- System prompt improved: explicit rules for skills (technical only, max 25), summary (1–3 sentences), experience_level, employment_type.
+- Stale processing reaper threshold adjustable; 172 stuck jobs reaped successfully.
+- Duplicate counter increments removed — `PipelineStats.record_*()` methods now single source of truth.