diff --git a/.envs.example/.production/.django b/.envs.example/.production/.django new file mode 100644 index 000000000..ed62df35b --- /dev/null +++ b/.envs.example/.production/.django @@ -0,0 +1,63 @@ +# ============================================================================= +# OpenContracts — Django service env (production) +# Copy to .envs/.production/.django and fill in every blank below. +# Generate strong secrets with: ./scripts/easypanel/generate-env.sh +# ============================================================================= + +# --- Django core ---------------------------------------------------------- +DJANGO_SETTINGS_MODULE=config.settings.production +DJANGO_DEBUG=False +DJANGO_SECRET_KEY= +DJANGO_ADMIN_URL=admin// +DJANGO_ALLOWED_HOSTS=,django + +# Initial superuser (created via `createsuperuser` or signal) +DJANGO_SUPERUSER_USERNAME=admin +DJANGO_SUPERUSER_EMAIL= +DJANGO_SUPERUSER_PASSWORD= + +# Worker timeout for daphne (seconds). 60 is safe for most production loads. +DJANGO_WORKER_TIMEOUT=60 + +# --- Storage -------------------------------------------------------------- +# LOCAL = files on the django container's disk (fine for a single-host setup). +# Set to AWS or GCP for object storage; see docs/deployment/. +STORAGE_BACKEND=LOCAL + +# --- Redis / Celery ------------------------------------------------------- +REDIS_URL=redis://redis:6379/0 + +# Flower (Celery monitor) — protect with strong creds; port 5555 is exposed +# by the bundled traefik. Set both to a long random string. +CELERY_FLOWER_USER= +CELERY_FLOWER_PASSWORD= + +# --- Auth -------------------------------------------------------------------- +# Auth0 is OFF by default; OC falls back to JWT/session auth. +USE_AUTH0=false + +# --- LLM credentials ------------------------------------------------------ +# Required for embeddings + agent answers. The Bolivian-laws RAG service +# uses these for both per-area specialists and the orchestrator. 
+OPENAI_API_KEY= +OPENAI_MODEL=gpt-4o +ANTHROPIC_API_KEY= + +# --- Pipeline microservices (in-network URLs) ---------------------------- +EMBEDDINGS_MICROSERVICE_URL=http://vector-embedder:8000 +VECTOR_EMBEDDER_API_KEY= +DOCLING_PARSER_SERVICE_URL=http://docling-parser:8000/parse/ +DOCXODUS_PARSER_SERVICE_URL=http://docxodus-parser:8080/parse +DOCXODUS_PARSER_TIMEOUT=120 + +# --- Bolivian Laws RAG (overrides; sane defaults baked into settings) ---- +# Override only if a target site moves or you mirror it elsewhere. +# BOLIVIAN_LAWS_GACETA_BASE_URL=https://gacetaoficialdebolivia.gob.bo/ +# BOLIVIAN_LAWS_GACETA_LISTING_PATHS=/ +# BOLIVIAN_LAWS_TSJ_BASE_URL=https://tsj.bo/ +# BOLIVIAN_LAWS_TSJ_LISTING_PATHS=/jurisprudencia/ +# BOLIVIAN_LAWS_TCP_BASE_URL=https://tcpbolivia.bo/ +# BOLIVIAN_LAWS_TCP_LISTING_PATHS=/jurisprudencia/ +BOLIVIAN_LAWS_SCRAPER_USER_AGENT=OpenContractsBolivianLawsBot/1.0 (+contact: ) +BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS=30 +BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS=1.0 diff --git a/.envs.example/.production/.frontend b/.envs.example/.production/.frontend new file mode 100644 index 000000000..380cd61e5 --- /dev/null +++ b/.envs.example/.production/.frontend @@ -0,0 +1,16 @@ +# ============================================================================= +# OpenContracts — Frontend env (production build) +# Copy to .envs/.production/.frontend and fill in your domain. +# These are baked into the Vite bundle at container build time. +# ============================================================================= + +REACT_APP_API_ROOT_URL=https:// +REACT_APP_APPLICATION_DOMAIN= + +# Auth0 off — match the django service's USE_AUTH0 setting. +REACT_APP_USE_AUTH0=false + +# Optional: pre-fill an Auth0 client if you switch USE_AUTH0=true. 
+# REACT_APP_AUTH0_DOMAIN= +# REACT_APP_AUTH0_CLIENT_ID= +# REACT_APP_AUTH0_AUDIENCE= diff --git a/.envs.example/.production/.postgres b/.envs.example/.production/.postgres new file mode 100644 index 000000000..ac609681f --- /dev/null +++ b/.envs.example/.production/.postgres @@ -0,0 +1,12 @@ +# ============================================================================= +# OpenContracts — Postgres env (production) +# Copy to .envs/.production/.postgres and fill in the password. +# POSTGRES_PASSWORD MUST match the password embedded in DATABASE_URL. +# ============================================================================= + +POSTGRES_HOST=postgres +POSTGRES_PORT=5432 +POSTGRES_DB=opencontractserver +POSTGRES_USER=opencontractserver +POSTGRES_PASSWORD= +DATABASE_URL=postgres://opencontractserver:@postgres:5432/opencontractserver diff --git a/CHANGELOG.md b/CHANGELOG.md index 7637a5d7f..8e3560a7d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- **EasyPanel deployment kit**: + - **`easypanel.yml`** — dedicated Compose file parameterised entirely by environment variables (no `.envs/.production/*` files), with no bundled Traefik service: EasyPanel's built-in proxy handles TLS and domain routing. Missing required secrets fail-fast thanks to `${VAR:?error}` syntax. + - **`scripts/easypanel/print-env.sh`** — prints a ready-to-paste `KEY=value` block for the EasyPanel app's Environment tab, with all random secrets (`DJANGO_SECRET_KEY`, admin URL slug, Postgres password, Flower creds, vector-embedder API key) pre-generated. + - **`scripts/easypanel/deploy.sh`** — optional one-command bootstrap for folks SSH-ing into the host (wraps generate-env + configure-traefik + docker compose build/migrate/up + smoke test of the Bolivian-laws scrape). Used with the legacy `production.yml`. 
+ - Building blocks for the legacy flow: commit-able env templates under `.envs.example/.production/`, `scripts/easypanel/generate-env.sh`, `scripts/easypanel/configure-traefik.sh`. + - Docs (`docs/deployment/easypanel.md`) rewritten around the GitHub-native flow: paste env vars → wire domain → deploy. +- **Bolivian Laws RAG service** (`opencontractserver/bolivian_laws/`): multi-agent RAG over Bolivian legal sources, organised by legal area to keep embeddings cost-aware and retrieval precise. + - One Corpus per `LegalArea` (constitucional, penal, civil, administrativo, laboral, tributario, familia, comercial, agrario, ambiental, otros), seeded idempotently from `AREA_PROFILES` (`opencontractserver/bolivian_laws/constants.py`). + - Tracking model `BolivianLegalDocument` with global SHA-256 dedupe and source attribution (`gaceta`, `tsj`, `tcp`, `manual`). + - Bulk ingestion via management command `python manage.py ingest_bolivian_laws --path ... --area ...` with optional LLM-based `--auto-classify`, dry-run and async (Celery) modes. + - Specialist agents per area + orchestrator agent (pydantic_ai) that routes questions to one or more specialists and synthesises answers (`opencontractserver/bolivian_laws/services/agents.py`). + - GraphQL mutation `askBolivianLaw(question, areas?)` returns the synthesised answer plus area-tagged source citations (`config/graphql/bolivian_laws_mutations.py`). + - Settings: `BOLIVIAN_LAWS_DEFAULT_EMBEDDER`, `BOLIVIAN_LAWS_CLASSIFIER_MODEL`, `BOLIVIAN_LAWS_ORCHESTRATOR_MODEL`, `BOLIVIAN_LAWS_SPECIALIST_MODEL` (`config/settings/base.py`). + - Documentation in `docs/features/bolivian_laws_rag.md`. +- **Bolivian Laws automatic scrapers** (`opencontractserver/bolivian_laws/scrapers/`): three pluggable scrapers that fetch legal PDFs from the Gaceta Oficial (`gacetaoficialdebolivia.gob.bo`), Tribunal Supremo de Justicia (`tsj.bo`) and Tribunal Constitucional Plurinacional (`tcpbolivia.bo`). 
+ - `BaseScraper` with injectable `httpx.Client`, configurable User-Agent, rate limiting, and defensive per-listing error handling so a single broken page cannot abort a batch. + - Per-source classes (`GacetaOficialScraper`, `TribunalSupremoJusticiaScraper`, `TribunalConstitucionalScraper`) extract best-effort metadata: external ID (e.g. `LEY-1178`, `AS-123/2023`, `SCP-0250/2012`), publication date, and a suggested `LegalArea` via keyword heuristics (sala name for TSJ, SAFCO/tributario/etc. for Gaceta, always `constitucional` for TCP). + - Celery tasks `scrape_and_ingest_source(source_key)` and `scrape_and_ingest_all()` (`opencontractserver/bolivian_laws/tasks.py`). A SHA-256 pre-check before `ingest_pdf` makes re-runs cheap; download failures are logged and counted per-source without aborting the batch. + - New Beat schedule entry `bolivian-laws-scrape-all` running once daily (`config/settings/base.py`). + - Management command `python manage.py scrape_bolivian_laws [--source gaceta|tsj|tcp | --all] [--since-days N] [--max-entries N] [--sync]` for on-demand runs. + - Settings: `BOLIVIAN_LAWS_{GACETA,TSJ,TCP}_BASE_URL` / `_LISTING_PATHS`, `BOLIVIAN_LAWS_SCRAPER_USER_AGENT`, `BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS`, `BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS`. + - Added `beautifulsoup4>=4.12,<5` to `requirements/base.txt` for HTML parsing. + - Tests use `httpx.MockTransport` with inline HTML fixtures; no real HTTP traffic. + ### Fixed - **GraphQL security hardening cleanup** (Issue #1198): diff --git a/config/graphql/bolivian_laws_mutations.py b/config/graphql/bolivian_laws_mutations.py new file mode 100644 index 000000000..e4524ae64 --- /dev/null +++ b/config/graphql/bolivian_laws_mutations.py @@ -0,0 +1,138 @@ +"""GraphQL mutation for the Bolivian Laws RAG service. + +Single mutation: ``askBolivianLaw(question, areas?)``. + +- If ``areas`` is empty / null: routes through the orchestrator agent + which decides which specialist(s) to consult. 
+- If ``areas`` is given: skips orchestration and consults the listed + specialists directly in parallel (cheaper, deterministic). +""" + +from __future__ import annotations + +import logging + +import graphene +from asgiref.sync import async_to_sync +from graphql_jwt.decorators import login_required + +from opencontractserver.bolivian_laws.constants import LegalArea +from opencontractserver.bolivian_laws.services.agents import ( + ask_orchestrator, + ask_specialists, +) + +logger = logging.getLogger(__name__) + + +class BolivianLawSourceType(graphene.ObjectType): + """Source citation returned by the orchestrator/specialists.""" + + area = graphene.String(required=True) + document_id = graphene.Int(required=False) + snippet = graphene.String(required=True) + similarity_score = graphene.Float(required=True) + + +class AskBolivianLawMutation(graphene.Mutation): + """Query the Bolivian Laws RAG service. + + Returns a synthesised answer plus per-source citations tagged by + legal area. + """ + + class Arguments: + question = graphene.String( + required=True, + description="Pregunta del usuario en lenguaje natural.", + ) + areas = graphene.List( + graphene.String, + required=False, + description=( + "Lista opcional de áreas (constitucional, penal, civil, " + "administrativo, laboral, tributario, familia, comercial, " + "agrario, ambiental, otros). Si se provee, se omite el " + "orquestador y se consultan en paralelo." 
+ ), + ) + conversation_id = graphene.Int( + required=False, + description="ID de conversación a continuar (opcional).", + ) + + ok = graphene.Boolean() + message = graphene.String() + answer = graphene.String() + consulted_areas = graphene.List(graphene.String) + sources = graphene.List(BolivianLawSourceType) + conversation_id = graphene.Int() + + @staticmethod + @login_required + def mutate(root, info, question, areas=None, conversation_id=None): + question = (question or "").strip() + if not question: + return AskBolivianLawMutation( + ok=False, + message="Question must be non-empty.", + answer="", + consulted_areas=[], + sources=[], + ) + + valid_areas = {a.value for a in LegalArea} + if areas: + cleaned: list[str] = [] + for a in areas: + a_norm = (a or "").strip().lower() + if a_norm not in valid_areas: + return AskBolivianLawMutation( + ok=False, + message=f"Unknown area: {a!r}", + answer="", + consulted_areas=[], + sources=[], + ) + cleaned.append(a_norm) + areas = cleaned + + user_id = info.context.user.pk if info.context.user else None + + try: + if areas: + response = async_to_sync(ask_specialists)( + areas, question, user_id=user_id + ) + else: + response = async_to_sync(ask_orchestrator)( + question, + user_id=user_id, + conversation_id=conversation_id, + ) + except Exception as exc: + logger.exception("askBolivianLaw failed") + return AskBolivianLawMutation( + ok=False, + message=f"Internal error: {exc}", + answer="", + consulted_areas=[], + sources=[], + ) + + return AskBolivianLawMutation( + ok=True, + message="ok", + answer=response.answer, + consulted_areas=response.consulted_areas, + conversation_id=response.conversation_id, + sources=[ + BolivianLawSourceType( + area=s.area, + document_id=s.document_id, + snippet=s.snippet, + similarity_score=s.similarity_score, + ) + for s in response.sources + ], + ) diff --git a/config/graphql/mutations.py b/config/graphql/mutations.py index a2ea1aacb..978b2e754 100644 --- a/config/graphql/mutations.py +++ 
b/config/graphql/mutations.py @@ -165,6 +165,9 @@ UpdateModeratorPermissionsMutation, ) +# Import Bolivian Laws mutations +from config.graphql.bolivian_laws_mutations import AskBolivianLawMutation + # Import notification mutations from config.graphql.notification_mutations import ( DeleteNotificationMutation, @@ -397,6 +400,9 @@ class Mutation(graphene.ObjectType): update_agent_configuration = UpdateAgentConfigurationMutation.Field() delete_agent_configuration = DeleteAgentConfigurationMutation.Field() + # BOLIVIAN LAWS RAG ########################################################## + ask_bolivian_law = AskBolivianLawMutation.Field() + # PIPELINE SETTINGS MUTATIONS (Superuser only) ############################### update_pipeline_settings = UpdatePipelineSettingsMutation.Field() reset_pipeline_settings = ResetPipelineSettingsMutation.Field() diff --git a/config/settings/base.py b/config/settings/base.py index bce536d07..f51e35a3e 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -155,6 +155,7 @@ "opencontractserver.agents", "opencontractserver.worker_uploads", "opencontractserver.discovery", + "opencontractserver.bolivian_laws", ] # https://docs.djangoproject.com/en/dev/ref/settings/#installed-apps @@ -680,8 +681,48 @@ "schedule": 300.0, # every 5 minutes "options": {"queue": "worker_uploads"}, }, + # Bolivian-laws RAG: fan out a scrape+ingest task per registered + # source (Gaceta Oficial, TSJ, TCP) once a day. Ingestion dedupes + # by SHA-256 so re-running is cheap. + "bolivian-laws-scrape-all": { + "task": "bolivian_laws.scrape_and_ingest_all", + "schedule": 86400.0, # daily + }, } +# Bolivian Laws RAG service +# ------------------------------------------------------------------------------ +# Base URLs can be overridden per-deployment (useful for staging mirrors +# or archive snapshots). Listing paths are comma-separated strings. 
+BOLIVIAN_LAWS_GACETA_BASE_URL = env( + "BOLIVIAN_LAWS_GACETA_BASE_URL", default="https://gacetaoficialdebolivia.gob.bo/" +) +BOLIVIAN_LAWS_GACETA_LISTING_PATHS = env.list( + "BOLIVIAN_LAWS_GACETA_LISTING_PATHS", default=["/"] +) +BOLIVIAN_LAWS_TSJ_BASE_URL = env( + "BOLIVIAN_LAWS_TSJ_BASE_URL", default="https://tsj.bo/" +) +BOLIVIAN_LAWS_TSJ_LISTING_PATHS = env.list( + "BOLIVIAN_LAWS_TSJ_LISTING_PATHS", default=["/jurisprudencia/"] +) +BOLIVIAN_LAWS_TCP_BASE_URL = env( + "BOLIVIAN_LAWS_TCP_BASE_URL", default="https://tcpbolivia.bo/" +) +BOLIVIAN_LAWS_TCP_LISTING_PATHS = env.list( + "BOLIVIAN_LAWS_TCP_LISTING_PATHS", default=["/jurisprudencia/"] +) +BOLIVIAN_LAWS_SCRAPER_USER_AGENT = env( + "BOLIVIAN_LAWS_SCRAPER_USER_AGENT", + default="OpenContractsBolivianLawsBot/1.0 (+https://github.com/JSv4/OpenContracts)", +) +BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS = env.int( + "BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS", default=30 +) +BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS = env.float( + "BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS", default=1.0 +) + # Worker Upload Processing # ------------------------------------------------------------------------------ # Documents per batch when draining the staging table @@ -1241,3 +1282,19 @@ }, "cache_ttl": env.int("MCP_CACHE_TTL", default=300), } + + +# Bolivian Laws RAG Service +# ------------------------------------------------------------------------------ +# See docs/services/bolivian_laws.md +BOLIVIAN_LAWS_DEFAULT_EMBEDDER = env.str( + "BOLIVIAN_LAWS_DEFAULT_EMBEDDER", + default=DEFAULT_EMBEDDER, +) +BOLIVIAN_LAWS_CLASSIFIER_MODEL = env.str( + "BOLIVIAN_LAWS_CLASSIFIER_MODEL", default="gpt-4o-mini" +) +BOLIVIAN_LAWS_ORCHESTRATOR_MODEL = env.str( + "BOLIVIAN_LAWS_ORCHESTRATOR_MODEL", default="gpt-4o-mini" +) +BOLIVIAN_LAWS_SPECIALIST_MODEL = env.str("BOLIVIAN_LAWS_SPECIALIST_MODEL", default="") diff --git a/docs/deployment/easypanel.md b/docs/deployment/easypanel.md new file mode 100644 index 000000000..904d1671d --- /dev/null +++ 
b/docs/deployment/easypanel.md @@ -0,0 +1,184 @@ +# Deploying OpenContracts on EasyPanel (from GitHub) + +Native EasyPanel flow: pull the repo from GitHub, paste a block of +environment variables into the app, click **Deploy**. No SSH, no +scripts on the server, no `.env` files to upload. + +The stack uses `easypanel.yml` (dedicated Compose file) instead of +`production.yml`. It's parameterised entirely through env vars and +delegates TLS / domain routing to EasyPanel's built-in reverse proxy. + +## What you need + +- A domain pointing at your EasyPanel server (A record). +- Your fork of this repo on GitHub. +- An OpenAI API key. + +## Step 1 — generate secrets to paste + +On your laptop (or any machine with Python 3): + +```bash +./scripts/easypanel/print-env.sh \ + --domain oc.example.com \ + --email you@example.com \ + --openai-key sk-... \ + --admin-password 'StrongPass!' +``` + +This prints ~12 `KEY=value` lines: the four values you supplied plus +cryptographically-random `DJANGO_SECRET_KEY`, `DJANGO_ADMIN_URL_SLUG`, +`POSTGRES_PASSWORD`, `CELERY_FLOWER_USER`, `CELERY_FLOWER_PASSWORD`, +`VECTOR_EMBEDDER_API_KEY`. + +Copy the whole block. + +> Don't have Python 3 handy? Open the script and run the `python3 -c` +> lines in any Python REPL, or generate random strings with +> `openssl rand -hex 24` / `openssl rand -base64 64`. + +## Step 2 — create the EasyPanel app + +1. **Create Service → App** in your EasyPanel project. +2. **Source**: Git. + - Repository: your fork's URL. + - Branch: the one you want to deploy (e.g. + `claude/rag-bolivian-laws-service-OYXry`). +3. **Build Method**: **Docker Compose**. +4. **Compose File**: `easypanel.yml`. +5. **Environment**: paste the block from step 1 straight into the + EasyPanel env editor. +6. **Save**. Do **not** deploy yet — we need to wire domains first. + +## Step 3 — wire the domain in EasyPanel + +The Compose file intentionally has **no Traefik service**; EasyPanel's +proxy does TLS and routing. 
In the app's *Domains* tab: + +| Target service | Port | Path rules | +|---|---|---| +| `frontend` | 80 | default (everything that isn't a Django path) | +| `django` | 5000 | `/graphql`, `/api`, `/admin`, `/ws`, `/mcp`, `/sse`, `/.well-known`, `/robots.txt`, `/llms.txt`, `/llms-full.txt`, `/sitemap.xml` | + +Assign the same hostname (e.g. `oc.example.com`) to both entries. +EasyPanel will issue a Let's Encrypt cert automatically. + +*(Skipping Flower for now — the bundled Traefik is gone, so if you +want the Celery monitor UI, expose `celeryworker` separately or port- +forward `docker exec ... flower` when you need it.)* + +## Step 4 — deploy + +Click **Deploy** in EasyPanel. First build takes a few minutes +(pulls Docling + embedder images, builds the Django + frontend +images). + +Once all services are healthy, run migrations exactly once — either +from the EasyPanel terminal: + +```bash +docker compose -f easypanel.yml --profile migrate up migrate +``` + +…or from any service's shell: + +```bash +docker compose -f easypanel.yml exec django python manage.py migrate +docker compose -f easypanel.yml exec django python manage.py migrate_pipeline_settings +``` + +## Step 5 — verify + +- Browse to `https://<DOMAIN>` — the React app loads. +- `https://<DOMAIN>/admin/<DJANGO_ADMIN_URL_SLUG>/` — Django admin. The slug is + whatever `DJANGO_ADMIN_URL_SLUG` value you pasted. +- Smoke-test the Bolivian-laws scrape: + ```bash + docker compose -f easypanel.yml exec django \ + python manage.py scrape_bolivian_laws --all --since-days 7 --max-entries 3 --sync + ``` + Should print a summary like `{'source': 'gaceta', 'discovered': 3, + 'ingested': 3, ...}`. + +The Beat scheduler (`celerybeat` service) then runs +`bolivian-laws-scrape-all` automatically once a day. SHA-256 dedupe +makes re-runs cheap. + +## Environment variables reference + +Required: + +| Var | Description | +|---|---| +| `DOMAIN` | Public hostname (e.g. 
`oc.example.com`) | +| `ADMIN_EMAIL` | Contact email — used for the superuser and can be reused for Let's Encrypt | +| `ADMIN_PASSWORD` | Initial Django superuser password | +| `OPENAI_API_KEY` | Used for embeddings + agent answers | +| `DJANGO_SECRET_KEY` | Cryptographic secret key; random 64+ chars | +| `DJANGO_ADMIN_URL_SLUG` | Obfuscates the admin path; random 30 chars | +| `POSTGRES_PASSWORD` | PostgreSQL password | +| `CELERY_FLOWER_USER` / `CELERY_FLOWER_PASSWORD` | Flower basic-auth creds | +| `VECTOR_EMBEDDER_API_KEY` | Shared secret between Django and the embedder sidecar | + +Optional (safe defaults apply): + +| Var | Default | +|---|---| +| `OPENAI_MODEL` | `gpt-4o` | +| `ANTHROPIC_API_KEY` | (empty) | +| `STORAGE_BACKEND` | `LOCAL` (set `AWS` or `GCP` to use object storage) | +| `USE_AUTH0` | `false` | +| `ADMIN_USERNAME` | `admin` | +| `BOLIVIAN_LAWS_SCRAPER_USER_AGENT` | `OpenContractsBolivianLawsBot/1.0` | +| `BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS` | `30` | +| `BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS` | `1.0` | + +Missing a required var? Compose will refuse to start and tell you +which one — that's the `${VAR:?error}` syntax in `easypanel.yml`. + +## Day-2 + +- **Redeploy** pulls the latest commit and rebuilds. +- **New migrations**: run the `--profile migrate` command after each + deploy that includes them. +- **On-demand scrape**: + ```bash + docker compose -f easypanel.yml exec django \ + python manage.py scrape_bolivian_laws --all --since-days 7 --sync + ``` +- **Logs**: EasyPanel tails each service. `celerybeat` should log + `Scheduler: Sending due task bolivian-laws-scrape-all` once a day. + +## Troubleshooting + +| Symptom | Fix | +|---|---| +| App fails to start with `ERROR: DJANGO_SECRET_KEY is required` | One of the required env vars is empty — re-run `print-env.sh` and paste the full block. | +| `psycopg2.OperationalError` on first boot | Postgres needs ~30 s — re-run the migrate step. 
| +| `443` returns the EasyPanel landing page | Domain not bound to the `frontend`/`django` services. Check the Domains tab (step 3). | +| `/graphql` returns the React index | Path rule for `/graphql` → `django:5000` missing. Add it in the Domains tab. | +| Scrape summary `discovered=0` | Target site changed structure — override the matching `BOLIVIAN_LAWS_{GACETA,TSJ,TCP}_LISTING_PATHS` env var and redeploy. | +| `500` on `POST /graphql/` | `OPENAI_API_KEY` not set on the worker. EasyPanel env vars apply to all services in the Compose app — confirm they're at the app level, not one service. | + +## Security checklist + +- [ ] `DJANGO_DEBUG=False` (default in `easypanel.yml`). +- [ ] `DJANGO_SECRET_KEY`, `POSTGRES_PASSWORD`, Flower password are the + randomly-generated values (not placeholders). +- [ ] `DJANGO_ADMIN_URL_SLUG` is random (script default). +- [ ] EasyPanel cert visible in browser. +- [ ] `BOLIVIAN_LAWS_SCRAPER_USER_AGENT` identifies you with a contact + email — respect each target site's `robots.txt`. + +## Alternative: bundled Traefik (old flow) + +If you'd rather manage TLS yourself with the project's bundled Traefik +config (instead of EasyPanel's proxy), use the original +`production.yml`. See [production.yml](../../production.yml) and run +`./scripts/easypanel/deploy.sh` — that flow still works but is heavier. + +## Related + +- [Bolivian Laws RAG service](../features/bolivian_laws_rag.md) — what + the scrapers do and how to query the corpus via GraphQL. +- [GPU setup](./docker-gpu-setup.md) — co-locate a local LLM. diff --git a/docs/features/bolivian_laws_rag.md b/docs/features/bolivian_laws_rag.md new file mode 100644 index 000000000..cba9b2ae4 --- /dev/null +++ b/docs/features/bolivian_laws_rag.md @@ -0,0 +1,244 @@ +# Bolivian Laws RAG Service + +A turn-key RAG (Retrieval-Augmented Generation) service for Bolivian +legal sources. 
It scrapes the three main official publishers, ingests +their PDFs into per-area corpora, and exposes a ready-to-query agent +stack on top of the standard OpenContracts GraphQL API. + +## Architecture at a glance + +``` + ┌────────────────┐ ┌───────────────┐ ┌──────────────────┐ + │ Gaceta Oficial │ │ TSJ │ │ TCP │ + └───────┬────────┘ └───────┬───────┘ └────────┬─────────┘ + │ │ │ + ▼ ▼ ▼ + GacetaOficialScraper TsjScraper TcpScraper + │ │ │ + └──────┬─────────────┴────────┬───────────┘ + ▼ ▼ + scrape_and_ingest_source (Celery, one per source) + │ + ▼ (SHA-256 dedupe) + ingest_pdf() + │ + ▼ + Corpus.import_content() → parser pipeline (Docling/Text) + │ + ▼ + Per-area Corpus (+ pgvector embeddings) + │ + ▼ + Specialist + Orchestrator agents + │ + ▼ + GraphQL chat mutations (existing) +``` + +## Data model + +Everything lives under `opencontractserver/bolivian_laws/`. + +| Model | Purpose | +|---|---| +| `LegalAreaCorpus` | 1-to-1 idempotent mapping `area → Corpus`. Created on first ingest for each area. | +| `BolivianLegalDocument` | Tracking record per ingested PDF. `pdf_sha256` is globally unique for dedupe. Keeps `area`, `source` (gaceta / tsj / tcp / manual), `external_id`, `published_at`, status, and a FK to the resulting `Document`. | + +### Legal areas + +Defined in `constants.LegalArea`. 
Each area gets its own corpus and its +own specialist agent persona: + +`constitucional · penal · civil · administrativo · laboral · tributario · +familia · comercial · agrario · ambiental · otros` + +### Sources + +Defined in `constants.LegalSource`: + +- `gaceta` — Gaceta Oficial de Bolivia (legislation) +- `tsj` — Tribunal Supremo de Justicia (ordinary jurisprudence) +- `tcp` — Tribunal Constitucional Plurinacional (constitutional jurisprudence) +- `manual` — Files uploaded via the management command + +## Scrapers + +Each source has a scraper class under +`opencontractserver/bolivian_laws/scrapers/`: + +| File | Class | Source | +|---|---|---| +| `gaceta.py` | `GacetaOficialScraper` | `gacetaoficialdebolivia.gob.bo` | +| `tsj.py` | `TribunalSupremoJusticiaScraper` | `tsj.bo` | +| `tcp.py` | `TribunalConstitucionalScraper` | `tcpbolivia.bo` | + +All three inherit from `BaseScraper`, which provides: + +- Injectable `httpx.Client` (tests use `httpx.MockTransport` with HTML + fixtures; no real HTTP). +- Polite User-Agent and configurable per-request sleep + (`BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS`). +- Defensive iteration: a failure on one listing page logs and moves on + instead of aborting the batch. + +Every scraper yields `ScrapedEntry` objects with best-effort metadata +(external ID, publication date, suggested legal area). The ingestion +task uses the suggested area and falls back to `OTROS` when no clear +match is found. Callers who want smarter classification can re-run +ingestion through the management command with `--auto-classify`. + +## Celery wiring + +Two tasks live in `tasks.py`: + +- `scrape_and_ingest_source(source_key, *, since_days=None, max_entries=None, user_id=None)` + runs a single scraper, deduplicates by SHA-256, and calls + `ingest_pdf` for every new PDF. Returns a summary dict. +- `scrape_and_ingest_all(*, since_days=None, max_entries_per_source=None)` + fans out one task per source. 
+ +The Beat schedule is wired up in `config/settings/base.py`: + +```python +CELERY_BEAT_SCHEDULE = { + ... + "bolivian-laws-scrape-all": { + "task": "bolivian_laws.scrape_and_ingest_all", + "schedule": 86400.0, # daily + }, +} +``` + +## Configuration + +All knobs are environment-driven. Override only what you need. + +| Variable | Default | Meaning | +|---|---|---| +| `BOLIVIAN_LAWS_GACETA_BASE_URL` | `https://gacetaoficialdebolivia.gob.bo/` | Base URL of the Gaceta site | +| `BOLIVIAN_LAWS_GACETA_LISTING_PATHS` | `/` | Comma-separated listing paths | +| `BOLIVIAN_LAWS_TSJ_BASE_URL` | `https://tsj.bo/` | Base URL of the TSJ site | +| `BOLIVIAN_LAWS_TSJ_LISTING_PATHS` | `/jurisprudencia/` | TSJ listing paths | +| `BOLIVIAN_LAWS_TCP_BASE_URL` | `https://tcpbolivia.bo/` | Base URL of the TCP site | +| `BOLIVIAN_LAWS_TCP_LISTING_PATHS` | `/jurisprudencia/` | TCP listing paths | +| `BOLIVIAN_LAWS_SCRAPER_USER_AGENT` | `OpenContractsBolivianLawsBot/1.0 ...` | Outgoing User-Agent | +| `BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS` | `30` | Ignore entries older than N days (when a date is parseable) | +| `BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS` | `1.0` | Sleep between HTTP calls | +| `BOLIVIAN_LAWS_SPECIALIST_MODEL` | *(none)* | Override the model for specialist agents | +| `BOLIVIAN_LAWS_ORCHESTRATOR_MODEL` | `gpt-4o-mini` | Model for the orchestrator | +| `BOLIVIAN_LAWS_CLASSIFIER_MODEL` | `gpt-4o-mini` | Model for the LLM-based area classifier | + +The LLM agents use whatever embedder/LLM credentials are already +configured for OpenContracts (`OPENAI_API_KEY`, etc.). + +## Operator workflows + +### Manual bulk ingest from a directory + +``` +python manage.py ingest_bolivian_laws \ + --path /data/leyes/ --area constitucional +``` + +- `--auto-classify` to let the LLM classifier pick the area. +- `--async` to enqueue Celery tasks instead of running inline. +- `--dry-run` to preview without writing. 
+ +### On-demand scrape (one source or all) + +``` +python manage.py scrape_bolivian_laws --source gaceta +python manage.py scrape_bolivian_laws --all --since-days 7 --sync +python manage.py scrape_bolivian_laws --source tcp --max-entries 5 --sync +``` + +Without `--sync`, the command enqueues Celery tasks and returns the +task IDs so you can watch them via Flower. + +### Automatic periodic scrape + +The Beat schedule runs `scrape_and_ingest_all` once a day. It's +idempotent: already-ingested PDFs are a no-op thanks to SHA-256 +dedupe. + +## Consuming the RAG programmatically + +### Direct Python API + +```python +from opencontractserver.bolivian_laws.services.agents import ( + ask_orchestrator, ask_specialists, consult_specialist, +) + +# 1) Let the orchestrator decide which specialist(s) to consult: +result = await ask_orchestrator( + "¿Qué dice la SCP 0250/2012 sobre la consulta previa?" +) +print(result.answer) +for src in result.sources: + print(f"[{src.area}] doc#{src.document_id} — {src.snippet[:120]}") + +# 2) Or target one specialist directly: +answer, sources = await consult_specialist( + "penal", "Resume los elementos del tipo penal de trata de personas" +) + +# 3) Or fan out across several specialists in parallel: +result = await ask_specialists( + ["constitucional", "penal"], + "Detención de menores sin orden judicial", +) +``` + +### GraphQL + +The specialist corpora are regular OpenContracts corpora. 
Once ingested, +query them through the existing chat mutations: + +```graphql +mutation { + startConversation(corpusId: "<CORPUS_ID>") { + ok + conversation { id } + } +} + +mutation { + sendMessage(conversationId: "<CONVERSATION_ID>", content: "¿Qué exige la Ley 1178?") { + ok + response { content sources { document { id title } } } + } +} +``` + +You can look up the corpus IDs via `LegalAreaCorpus`: + +```python +from opencontractserver.bolivian_laws.models import LegalAreaCorpus +{a.area: a.corpus_id for a in LegalAreaCorpus.objects.all()} +``` + +## Testing + +``` +docker compose -f test.yml run django pytest \ + opencontractserver/bolivian_laws/tests -n 4 --dist loadscope +``` + +The scraper tests use `httpx.MockTransport` with inline HTML fixtures +and never hit the real government sites. + +## Operational notes + +- **Robots.txt**: respect each site's crawling rules. The scrapers + identify themselves with a clear User-Agent and rate-limit between + requests. +- **First backfill**: set `BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS=0` and + run `scrape_bolivian_laws --all --sync` once to seed the corpora, + then let Beat handle daily updates. +- **Embedders are locked** after the first document is added to a + corpus. Configure `preferred_embedder` before the first run if you + want a non-default embedder. +- **Large volumes**: the TSJ/TCP archives are large. Use + `--max-entries` or a short `--since-days` window during development + to avoid embedding everything at once. diff --git a/docs/services/bolivian_laws.md b/docs/services/bolivian_laws.md new file mode 100644 index 000000000..b2bc1d5fe --- /dev/null +++ b/docs/services/bolivian_laws.md @@ -0,0 +1,184 @@ +# Bolivian Laws RAG Service + +A multi-agent Retrieval-Augmented Generation (RAG) service over Bolivian +legal sources. Designed around two ideas: + +1. **Cost-aware corpora**: one Corpus per legal area (constitucional, + penal, civil, ...). 
Embeddings only run for the area you actually + ingest, and similarity search never crosses areas. +2. **Multi-agent orchestration**: each area has a specialist agent + (persona + corpus). A top-level orchestrator routes user questions to + the relevant specialist(s) and synthesises one consolidated answer + with citations. + +## Architecture + +``` + PDFs (flat dir) + │ + ▼ + ingest_bolivian_laws ─────► ensure_area_corpus(area) + (mgmt command) │ + │ ▼ + │ Corpus (preferred_embedder, persona, instructions) + │ │ + ▼ ▼ + ingest_pdf ─► Corpus.import_content ─► pgvector embeddings + │ + ▼ +GraphQL: askBolivianLaw ─► orchestrator ─► consult_{area} tool ─► specialist agent + │ │ + └────────── synthesises ◄────────────────┘ +``` + +## Legal areas + +| `area` (key) | Corpus slug | Specialist persona | +|-------------------|--------------------------|---------------------------------------------| +| `constitucional` | `bolivia-constitucional` | CPE 2009, jurisprudencia TCP | +| `penal` | `bolivia-penal` | Código Penal, CPP, sala penal TSJ | +| `civil` | `bolivia-civil` | Código Civil, contratos, sucesiones | +| `administrativo` | `bolivia-administrativo` | LPA, Ley SAFCO, contrataciones estatales | +| `laboral` | `bolivia-laboral` | LGT, sala social TSJ | +| `tributario` | `bolivia-tributario` | Código Tributario, SIN, AIT | +| `familia` | `bolivia-familia` | Código de las Familias | +| `comercial` | `bolivia-comercial` | Código de Comercio | +| `agrario` | `bolivia-agrario` | Ley INRA, Tribunal Agroambiental | +| `ambiental` | `bolivia-ambiental` | Ley 1333 | +| `otros` | `bolivia-otros` | residual | + +## Ingesting PDFs + +PDFs live in a flat directory. The management command does +SHA-256-based dedupe (a given PDF is ingested at most once across all +areas). 
+ +```bash +# Explicit area (recommended for known batches): +docker compose -f local.yml run --rm django \ + python manage.py ingest_bolivian_laws \ + --path /data/leyes_constitucional/ \ + --area constitucional + +# Mixed batch — let an LLM classify each PDF: +docker compose -f local.yml run --rm django \ + python manage.py ingest_bolivian_laws \ + --path /data/leyes_mix/ \ + --auto-classify + +# Source attribution + async via Celery: +docker compose -f local.yml run --rm django \ + python manage.py ingest_bolivian_laws \ + --path /data/sentencias_tsj/ \ + --area civil --source tsj --async + +# Dry-run: just list what would happen +docker compose -f local.yml run --rm django \ + python manage.py ingest_bolivian_laws \ + --path /data/leyes_mix/ --area civil --dry-run +``` + +### Filename convention (orientative) + +`[area]_[year]_[number]_[title].pdf` — every segment is optional and +inferred best-effort by `infer_metadata_from_filename`. For example: + +- `constitucional_2009_001_cpe.pdf` → area=constitucional, year=2009 +- `ley_general_trabajo.pdf` → no area inference; falls back to `--area` + or `--auto-classify`. + +## Querying via GraphQL + +The mutation `askBolivianLaw` is the single entry point. It returns a +synthesised answer plus tagged sources. + +```graphql +mutation { + askBolivianLaw( + question: "Si soy detenido sin orden judicial, ¿qué garantías constitucionales y penales tengo?" + ) { + ok + answer + consultedAreas + sources { + area + documentId + snippet + similarityScore + } + } +} +``` + +If the caller already knows which areas matter, they can skip +orchestration (cheaper + deterministic): + +```graphql +mutation { + askBolivianLaw( + question: "¿Qué exige la Ley 1178?" 
+ areas: ["administrativo"] + ) { + answer + consultedAreas + sources { area snippet } + } +} +``` + +`curl` example: + +```bash +curl -s -X POST http://localhost:8000/graphql/ \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $TOKEN" \ + -d '{"query":"mutation { askBolivianLaw(question: \"¿Qué dice el art. 14 CPE?\") { answer consultedAreas sources { area snippet } } }"}' \ + | python3 -m json.tool +``` + +## Configuration + +Environment variables (all optional, sensible defaults baked in): + +| Var | Default | Purpose | +|-------------------------------------|---------------------------|----------------------------------------| +| `BOLIVIAN_LAWS_DEFAULT_EMBEDDER` | platform `DEFAULT_EMBEDDER` | embedder seeded for new area corpora | +| `BOLIVIAN_LAWS_CLASSIFIER_MODEL` | `gpt-4o-mini` | model used by `--auto-classify` | +| `BOLIVIAN_LAWS_ORCHESTRATOR_MODEL` | `gpt-4o-mini` | orchestrator LLM | +| `BOLIVIAN_LAWS_SPECIALIST_MODEL` | `""` (corpus default) | override for specialist agents | + +LLM provider keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc.) are +read by the underlying `pydantic_ai` Agent. They can also be stored in +`PipelineSettings.encrypted_secrets` (see the LLM framework docs). + +## Extending: adding a new area + +1. Add a value to `LegalArea` and an entry to `AREA_PROFILES` in + `opencontractserver/bolivian_laws/constants.py`. +2. Generate a migration that updates the `area` field choices. +3. Done. The orchestrator auto-discovers all `LegalArea` values when + building its tools, and the management command exposes the new key + via `--area`. + +## Phase 3 roadmap (not yet implemented) + +Automatic scrapers for: + +- **Gaceta Oficial de Bolivia** (`gacetaoficialdebolivia.gob.bo`) +- **Tribunal Supremo de Justicia** (`tsj.bo`) +- **Tribunal Constitucional Plurinacional** (`tcpbolivia.bo`) + +These will sit in `opencontractserver/bolivian_laws/scrapers/`, share a +`BaseLegalScraper` interface, and run via `CELERY_BEAT_SCHEDULE` daily. 
+For now ingestion is manual / batch via the management command. + +## Risks + +- **Cost**: orchestrator may invoke 1-N specialists per question. + Mitigations: cheap specialist model; pass `areas` to bypass routing. +- **Latency**: parallel specialist calls (`ask_specialists` uses + `asyncio.gather`) keep wall time bounded by the slowest specialist. +- **`preferred_embedder` is immutable** once a corpus has documents. + Plan the embedder choice before the first ingest per area. +- **Auto-classification cost**: avoid `--auto-classify` for very large + batches; prefer pre-sorted batches with `--area`. diff --git a/easypanel.yml b/easypanel.yml new file mode 100644 index 000000000..0523ad858 --- /dev/null +++ b/easypanel.yml @@ -0,0 +1,164 @@ +# ============================================================================= +# easypanel.yml — EasyPanel-native Compose file. +# +# Differences from production.yml: +# - All env vars are read directly from the EasyPanel service env (no +# gitignored .envs/.production/* files required). +# - No bundled Traefik service: EasyPanel's built-in proxy handles TLS +# and routing. Expose `django` and `frontend` via the EasyPanel UI. +# - Safe defaults where it makes sense; required secrets fail-fast. +# +# Required env vars (set these in EasyPanel → App → Environment): +# +# DOMAIN e.g. oc.example.com +# DJANGO_SECRET_KEY python -c 'import secrets;print(secrets.token_urlsafe(64))' +# DJANGO_ADMIN_URL_SLUG python -c 'import secrets;print(secrets.token_urlsafe(24))' +# ADMIN_EMAIL you@example.com +# ADMIN_PASSWORD initial superuser password +# OPENAI_API_KEY sk-... +# POSTGRES_PASSWORD openssl rand -hex 24 +# CELERY_FLOWER_USER short random string +# CELERY_FLOWER_PASSWORD openssl rand -hex 24 +# VECTOR_EMBEDDER_API_KEY openssl rand -hex 16 +# +# ./scripts/easypanel/print-env.sh generates all of the above at once. 
+# ============================================================================= + +volumes: + easypanel_postgres_data: {} + easypanel_postgres_data_backups: {} + +services: + migrate: + build: + context: . + dockerfile: ./compose/production/django/Dockerfile + image: opencontractserver_easypanel_django + depends_on: + - postgres + environment: &django_env + DJANGO_SETTINGS_MODULE: config.settings.production + DJANGO_DEBUG: "False" + DJANGO_SECRET_KEY: ${DJANGO_SECRET_KEY:?DJANGO_SECRET_KEY is required} + DJANGO_ADMIN_URL: admin/${DJANGO_ADMIN_URL_SLUG:?DJANGO_ADMIN_URL_SLUG is required}/ + DJANGO_ALLOWED_HOSTS: ${DOMAIN:?DOMAIN is required},django,localhost + DJANGO_WORKER_TIMEOUT: "60" + DJANGO_SUPERUSER_USERNAME: ${ADMIN_USERNAME:-admin} + DJANGO_SUPERUSER_EMAIL: ${ADMIN_EMAIL:?ADMIN_EMAIL is required} + DJANGO_SUPERUSER_PASSWORD: ${ADMIN_PASSWORD:?ADMIN_PASSWORD is required} + STORAGE_BACKEND: ${STORAGE_BACKEND:-LOCAL} + REDIS_URL: redis://redis:6379/0 + CELERY_FLOWER_USER: ${CELERY_FLOWER_USER:?CELERY_FLOWER_USER is required} + CELERY_FLOWER_PASSWORD: ${CELERY_FLOWER_PASSWORD:?CELERY_FLOWER_PASSWORD is required} + USE_AUTH0: ${USE_AUTH0:-false} + OPENAI_API_KEY: ${OPENAI_API_KEY:?OPENAI_API_KEY is required} + OPENAI_MODEL: ${OPENAI_MODEL:-gpt-4o} + ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY:-} + EMBEDDINGS_MICROSERVICE_URL: http://vector-embedder:8000 + VECTOR_EMBEDDER_API_KEY: ${VECTOR_EMBEDDER_API_KEY:?VECTOR_EMBEDDER_API_KEY is required} + DOCLING_PARSER_SERVICE_URL: http://docling-parser:8000/parse/ + DOCXODUS_PARSER_SERVICE_URL: http://docxodus-parser:8080/parse + DOCXODUS_PARSER_TIMEOUT: "120" + # Bolivian-laws RAG (defaults from settings.base are fine) + BOLIVIAN_LAWS_SCRAPER_USER_AGENT: ${BOLIVIAN_LAWS_SCRAPER_USER_AGENT:-OpenContractsBolivianLawsBot/1.0} + BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS: ${BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS:-30} + BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS: ${BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS:-1.0} + # Postgres connection + POSTGRES_HOST: 
postgres + POSTGRES_PORT: "5432" + POSTGRES_DB: opencontractserver + POSTGRES_USER: opencontractserver + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD is required} + DATABASE_URL: postgres://opencontractserver:${POSTGRES_PASSWORD}@postgres:5432/opencontractserver + command: /bin/bash -c "python manage.py migrate --noinput && python manage.py migrate_pipeline_settings --sync-preferences --init-only && python manage.py migrate_pipeline_settings" + profiles: + - migrate + + django: &django + build: + context: . + dockerfile: ./compose/production/django/Dockerfile + image: opencontractserver_easypanel_django + depends_on: + postgres: + condition: service_started + redis: + condition: service_started + vector-embedder: + condition: service_started + docling-parser: + condition: service_started + required: false + docxodus-parser: + condition: service_started + required: false + environment: *django_env + # Expose port 5000 to EasyPanel's proxy (point the domain root + /graphql, + # /api, /admin, /ws here). + expose: + - "5000" + command: /bin/bash -c "daphne -b 0.0.0.0 -p 5000 --websocket_timeout $$DJANGO_WORKER_TIMEOUT config.asgi:application" + + celeryworker: + <<: *django + image: opencontractserver_easypanel_celeryworker + expose: [] + command: /start-celeryworker + + celerybeat: + <<: *django + image: opencontractserver_easypanel_celerybeat + expose: [] + command: /start-celerybeat + + postgres: + build: + context: . 
+ dockerfile: ./compose/production/postgres/Dockerfile + image: opencontractserver_easypanel_postgres + shm_size: "4g" + volumes: + - easypanel_postgres_data:/var/lib/postgresql/data:Z + - easypanel_postgres_data_backups:/backups:z + environment: + POSTGRES_HOST: postgres + POSTGRES_PORT: "5432" + POSTGRES_DB: opencontractserver + POSTGRES_USER: opencontractserver + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD is required} + command: > + postgres + -c shared_buffers=1GB + -c maintenance_work_mem=512MB + -c effective_cache_size=4GB + -c work_mem=128MB + + redis: + image: redis:6 + + vector-embedder: + image: jscrudato/vector-embedder-microservice + environment: + PORT: "8000" + + docling-parser: + image: jscrudato/docsling-local + + docxodus-parser: + image: ghcr.io/open-source-legal/docxodus-service:1.1.0-docxodus5.4.2 + + frontend: + build: + context: ./frontend + dockerfile: ./Dockerfile + image: opencontractserver_easypanel_frontend + depends_on: + - django + environment: + REACT_APP_API_ROOT_URL: https://${DOMAIN:?DOMAIN is required} + REACT_APP_APPLICATION_DOMAIN: ${DOMAIN} + REACT_APP_USE_AUTH0: ${USE_AUTH0:-false} + # Expose nginx on 80 to EasyPanel's proxy (point the domain root here; + # the paths /graphql, /admin, /api, /ws should route to django:5000). 
+ expose: + - "80" diff --git a/opencontractserver/bolivian_laws/__init__.py b/opencontractserver/bolivian_laws/__init__.py new file mode 100644 index 000000000..b633112db --- /dev/null +++ b/opencontractserver/bolivian_laws/__init__.py @@ -0,0 +1 @@ +default_app_config = "opencontractserver.bolivian_laws.apps.BolivianLawsConfig" diff --git a/opencontractserver/bolivian_laws/admin.py b/opencontractserver/bolivian_laws/admin.py new file mode 100644 index 000000000..7b8210ec9 --- /dev/null +++ b/opencontractserver/bolivian_laws/admin.py @@ -0,0 +1,48 @@ +from django.contrib import admin + +from opencontractserver.bolivian_laws.models import ( + BolivianLegalDocument, + LegalAreaCorpus, +) + + +@admin.register(LegalAreaCorpus) +class LegalAreaCorpusAdmin(admin.ModelAdmin): + list_display = ("area", "corpus", "created") + readonly_fields = ("area", "corpus", "created") + search_fields = ("area",) + + +@admin.register(BolivianLegalDocument) +class BolivianLegalDocumentAdmin(admin.ModelAdmin): + list_display = ( + "title", + "area", + "source", + "status", + "published_at", + "ingested_at", + ) + list_filter = ("area", "source", "status") + search_fields = ("title", "external_id", "pdf_sha256") + readonly_fields = ( + "pdf_sha256", + "document", + "corpus", + "created", + "ingested_at", + "last_error", + ) + actions = ["mark_pending_for_retry"] + + @admin.action(description="Marcar como pendiente para reintentar") + def mark_pending_for_retry(self, request, queryset): + # Resets tracking-record state so a fresh re-ingest can succeed. + # SHA-based dedupe will still block re-ingesting the exact same + # bytes — for a true retry, delete the record first. 
+ updated = queryset.update( + status=BolivianLegalDocument.Status.PENDING, + last_error="", + ingested_at=None, + ) + self.message_user(request, f"{updated} record(s) marked as pending.") diff --git a/opencontractserver/bolivian_laws/apps.py b/opencontractserver/bolivian_laws/apps.py new file mode 100644 index 000000000..151403a04 --- /dev/null +++ b/opencontractserver/bolivian_laws/apps.py @@ -0,0 +1,7 @@ +from django.apps import AppConfig + + +class BolivianLawsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "opencontractserver.bolivian_laws" + verbose_name = "Bolivian Laws RAG" diff --git a/opencontractserver/bolivian_laws/constants.py b/opencontractserver/bolivian_laws/constants.py new file mode 100644 index 000000000..62db6d0e5 --- /dev/null +++ b/opencontractserver/bolivian_laws/constants.py @@ -0,0 +1,186 @@ +"""Constants for the Bolivian Laws RAG service. + +Defines the legal areas (specialties), document sources, and per-area +profiles consumed by both the ingestion pipeline (corpus creation) and +the agent layer (specialist personas / instructions). +""" + +from __future__ import annotations + +from dataclasses import dataclass + +from django.db import models + + +class LegalArea(models.TextChoices): + """Bolivian legal specialties. + + One Corpus is created per area on first ingestion. 
+ """ + + CONSTITUCIONAL = "constitucional", "Derecho Constitucional" + PENAL = "penal", "Derecho Penal" + CIVIL = "civil", "Derecho Civil" + ADMINISTRATIVO = "administrativo", "Derecho Administrativo" + LABORAL = "laboral", "Derecho Laboral" + TRIBUTARIO = "tributario", "Derecho Tributario" + FAMILIA = "familia", "Derecho de Familia" + COMERCIAL = "comercial", "Derecho Comercial" + AGRARIO = "agrario", "Derecho Agrario" + AMBIENTAL = "ambiental", "Derecho Ambiental" + OTROS = "otros", "Otros" + + +class LegalSource(models.TextChoices): + """Origin of an ingested legal document.""" + + GACETA = "gaceta", "Gaceta Oficial de Bolivia" + TSJ = "tsj", "Tribunal Supremo de Justicia" + TCP = "tcp", "Tribunal Constitucional Plurinacional" + MANUAL = "manual", "Carga manual" + + +@dataclass(frozen=True) +class AreaProfile: + """Single source of truth for area-specific corpus + agent config.""" + + title: str + description: str + agent_persona: str + agent_instructions: str + + +_BASE_AGENT_RULES = ( + "Responde siempre en español. Cita la norma exacta (ley, código, " + "artículo, número y fecha) cuando esté disponible. Cuando recuperes " + "fragmentos del corpus, inclúyelos como soporte de tu respuesta. " + "Si la pregunta sale de tu área de especialidad, indícalo y sugiere " + "consultar al especialista correspondiente." +) + + +def _profile( + area_label: str, + short: str, + persona_focus: str, + extra_instruction: str = "", +) -> AreaProfile: + title = f"Bolivia — {area_label}" + description = ( + f"Corpus de fuentes jurídicas bolivianas en materia de {short}. " + "Incluye legislación, jurisprudencia y doctrina recopilada de " + "fuentes oficiales (Gaceta Oficial, TSJ, TCP) y cargas manuales." + ) + persona = ( + f"Eres un experto en derecho {short} boliviano. {persona_focus} " + "Te apoyas exclusivamente en el corpus indexado para responder." 
+ ) + instructions = _BASE_AGENT_RULES + if extra_instruction: + instructions = f"{instructions} {extra_instruction}" + return AreaProfile( + title=title, + description=description, + agent_persona=persona, + agent_instructions=instructions, + ) + + +AREA_PROFILES: dict[str, AreaProfile] = { + LegalArea.CONSTITUCIONAL: _profile( + "Constitucional", + "constitucional", + "Dominas la Constitución Política del Estado (CPE) de 2009, " + "la jurisprudencia del Tribunal Constitucional Plurinacional, " + "garantías y derechos fundamentales.", + ), + LegalArea.PENAL: _profile( + "Penal", + "penal", + "Conoces el Código Penal boliviano, el Código de Procedimiento " + "Penal y la jurisprudencia penal del TSJ.", + ), + LegalArea.CIVIL: _profile( + "Civil", + "civil", + "Dominas el Código Civil, obligaciones, contratos, sucesiones, " + "derechos reales y la jurisprudencia civil del TSJ.", + ), + LegalArea.ADMINISTRATIVO: _profile( + "Administrativo", + "administrativo", + "Conoces la Ley de Procedimiento Administrativo, la Ley SAFCO " + "(Ley 1178), normativa de contrataciones estatales y la " + "jurisprudencia contencioso-administrativa.", + ), + LegalArea.LABORAL: _profile( + "Laboral", + "laboral", + "Dominas la Ley General del Trabajo, su reglamento y la " + "jurisprudencia social del TSJ.", + ), + LegalArea.TRIBUTARIO: _profile( + "Tributario", + "tributario", + "Conoces el Código Tributario boliviano, normativa del SIN y " + "la jurisprudencia de la AIT.", + ), + LegalArea.FAMILIA: _profile( + "Familia", + "familia", + "Dominas el Código de las Familias y del Proceso Familiar, " + "y la jurisprudencia en materia familiar.", + ), + LegalArea.COMERCIAL: _profile( + "Comercial", + "comercial", + "Conoces el Código de Comercio, sociedades comerciales y " + "normativa empresarial boliviana.", + ), + LegalArea.AGRARIO: _profile( + "Agrario", + "agrario", + "Dominas la Ley INRA, la Ley de Reconducción Comunitaria, " + "y la jurisprudencia del Tribunal Agroambiental.", + ), + 
LegalArea.AMBIENTAL: _profile( + "Ambiental", + "ambiental", + "Conoces la Ley del Medio Ambiente (Ley 1333), normativa " + "sectorial ambiental y la jurisprudencia ambiental.", + ), + LegalArea.OTROS: _profile( + "General", + "general", + "Cubres áreas residuales del derecho boliviano que no encajan " + "en una especialidad específica.", + ), +} + + +def get_profile(area: str) -> AreaProfile: + """Return the AreaProfile for a given area key, raising if unknown.""" + profile = AREA_PROFILES.get(area) + if profile is None: + raise KeyError(f"Unknown legal area: {area!r}") + return profile + + +# Slug used for the per-area corpus. Keep stable: changing it would +# orphan existing corpora. +def corpus_slug_for_area(area: str) -> str: + return f"bolivia-{area}" + + +# Orchestrator persona — used when no area is forced and the orchestrator +# routes the question to one or more specialists. +ORCHESTRATOR_PERSONA = ( + "Eres un orquestador jurídico para derecho boliviano. Recibes " + "preguntas de usuarios y decides qué especialista(s) consultar " + "(constitucional, penal, civil, administrativo, laboral, tributario, " + "familia, comercial, agrario, ambiental). Cuando una pregunta cruza " + "áreas, consulta a varios especialistas y sintetiza una respuesta " + "coherente. Cita siempre las fuentes que devuelven los especialistas " + "y deja claro de qué área proviene cada cita. No inventes normas: " + "si los especialistas no encuentran respuesta, dilo explícitamente." 
+)
diff --git a/opencontractserver/bolivian_laws/management/__init__.py b/opencontractserver/bolivian_laws/management/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opencontractserver/bolivian_laws/management/commands/__init__.py b/opencontractserver/bolivian_laws/management/commands/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/opencontractserver/bolivian_laws/management/commands/ingest_bolivian_laws.py b/opencontractserver/bolivian_laws/management/commands/ingest_bolivian_laws.py
new file mode 100644
index 000000000..c95d641aa
--- /dev/null
+++ b/opencontractserver/bolivian_laws/management/commands/ingest_bolivian_laws.py
@@ -0,0 +1,193 @@
+"""Bulk-ingest a directory of Bolivian legal PDFs into per-area corpora.
+
+Usage:
+
+    python manage.py ingest_bolivian_laws --path /data/leyes/ --area constitucional
+    python manage.py ingest_bolivian_laws --path /data/leyes/ --auto-classify
+    python manage.py ingest_bolivian_laws --path /data/leyes/ --area penal --async
+
+The directory is scanned non-recursively for ``*.pdf`` files (the user
+explicitly chose a "flat structure" workflow). At least one of ``--area``
+or ``--auto-classify`` must be given. For each PDF:
+
+1. SHA-256 dedupe against ``BolivianLegalDocument`` (skip if already
+   ingested anywhere).
+2. Determine the area: explicit ``--area`` wins; otherwise the area
+   inferred from the filename is used; otherwise the LLM classifier is
+   consulted if ``--auto-classify`` is set. A file whose area is still
+   unresolved is skipped with a warning.
+3. ``ensure_area_corpus`` (idempotent) and ``ingest_pdf`` (inline) — or
+   ``ingest_pdf_async.delay(...)`` if ``--async`` is passed.
+
+A failure on one file (classification or ingestion) is reported and
+counted, and the batch continues with the next file.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from pathlib import Path
+
+from django.core.management.base import BaseCommand, CommandError
+
+from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource
+from opencontractserver.bolivian_laws.services.ingestion import (
+    classify_pdf_area,
+    infer_metadata_from_filename,
+    ingest_pdf,
+)
+from opencontractserver.bolivian_laws.tasks import ingest_pdf_async
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """Ingest a flat directory of PDFs, resolving a legal area per file."""
+
+    help = "Bulk-ingest Bolivian legal PDFs into per-area corpora."
+
+    def add_arguments(self, parser):
+        """Register the command-line options on ``parser``."""
+        parser.add_argument(
+            "--path",
+            required=True,
+            help="Directory containing flat PDFs to ingest.",
+        )
+        parser.add_argument(
+            "--area",
+            choices=[a.value for a in LegalArea],
+            default=None,
+            help="Force this area for all PDFs in the batch.",
+        )
+        parser.add_argument(
+            "--auto-classify",
+            action="store_true",
+            help="Use the LLM classifier when --area is not given.",
+        )
+        parser.add_argument(
+            "--source",
+            choices=[s.value for s in LegalSource],
+            # Plain string, so the default has the same type as ``choices``.
+            default=LegalSource.MANUAL.value,
+            help="Source attribution for the batch (default: manual).",
+        )
+        parser.add_argument(
+            "--async",
+            action="store_true",
+            dest="run_async",
+            help="Enqueue Celery tasks instead of processing inline.",
+        )
+        parser.add_argument(
+            "--dry-run",
+            action="store_true",
+            help="List files and resolved areas without ingesting.",
+        )
+
+    def handle(self, *args, **options):
+        """Resolve an area for every PDF under ``--path`` and ingest it.
+
+        Raises:
+            CommandError: if ``--path`` is not a directory, or if neither
+                ``--area`` nor ``--auto-classify`` was provided.
+        """
+        path = Path(options["path"]).expanduser().resolve()
+        if not path.is_dir():
+            raise CommandError(f"--path must be a directory: {path}")
+
+        forced_area: str | None = options.get("area")
+        auto_classify: bool = options.get("auto_classify", False)
+        source: str = options["source"]
+        run_async: bool = options.get("run_async", False)
+        dry_run: bool = options.get("dry_run", False)
+
+        if not forced_area and not auto_classify:
+            raise CommandError("Either --area or --auto-classify must be provided.")
+
+        pdf_files = sorted(p for p in path.glob("*.pdf") if p.is_file())
+        if not pdf_files:
+            self.stdout.write(self.style.WARNING(f"No PDFs found under {path}"))
+            return
+
+        self.stdout.write(
+            f"Found {len(pdf_files)} PDF(s) under {path}; "
+            f"area={forced_area or 'auto'}, source={source}, "
+            f"async={run_async}, dry_run={dry_run}"
+        )
+
+        ingested = skipped = failed = 0
+
+        for pdf_path in pdf_files:
+            inferred = infer_metadata_from_filename(pdf_path.name)
+            area = forced_area or inferred.get("area")
+            title = inferred.get("title_hint") or pdf_path.stem
+
+            if not area and auto_classify:
+                # Contain classifier errors per file so that one bad PDF (or a
+                # transient classifier failure) does not abort the whole batch.
+                try:
+                    area = asyncio.run(classify_pdf_area(pdf_path, title=title))
+                except Exception as exc:
+                    logger.exception("Area classification failed for %s", pdf_path)
+                    self.stdout.write(
+                        self.style.ERROR(
+                            f" FAIL {pdf_path.name}: classification failed: {exc}"
+                        )
+                    )
+                    failed += 1
+                    continue
+
+            if not area:
+                self.stdout.write(
+                    self.style.WARNING(f" SKIP {pdf_path.name}: no area resolved.")
+                )
+                skipped += 1
+                continue
+
+            if dry_run:
+                self.stdout.write(f" DRY {pdf_path.name} → area={area}")
+                continue
+
+            try:
+                if run_async:
+                    ingest_pdf_async.delay(
+                        str(pdf_path),
+                        area=area,
+                        title=title,
+                        source=source,
+                        metadata=inferred,
+                    )
+                    self.stdout.write(f" QUEUED {pdf_path.name} → {area}")
+                    ingested += 1
+                else:
+                    record = ingest_pdf(
+                        pdf_path,
+                        area=area,
+                        title=title,
+                        source=source,
+                        metadata=inferred,
+                        filename=pdf_path.name,
+                    )
+                    if record.status == record.Status.INGESTED:
+                        self.stdout.write(
+                            self.style.SUCCESS(
+                                f" OK {pdf_path.name} → {area} "
+                                f"(record #{record.pk})"
+                            )
+                        )
+                        ingested += 1
+                    else:
+                        self.stdout.write(
+                            f" DEDUPE {pdf_path.name} (existing #{record.pk})"
+                        )
+                        skipped += 1
+            except Exception as exc:
+                logger.exception("Ingestion failed for %s", pdf_path)
+                self.stdout.write(self.style.ERROR(f" FAIL {pdf_path.name}: {exc}"))
+                failed += 1
+
+        self.stdout.write(
+            self.style.SUCCESS(
+                f"\nDone: ingested={ingested}, skipped={skipped}, failed={failed}"
+            )
+        )
diff --git a/opencontractserver/bolivian_laws/management/commands/scrape_bolivian_laws.py b/opencontractserver/bolivian_laws/management/commands/scrape_bolivian_laws.py
new file mode 100644
index 000000000..be70aa0f0
--- /dev/null
+++ 
b/opencontractserver/bolivian_laws/management/commands/scrape_bolivian_laws.py
@@ -0,0 +1,143 @@
+"""Run the Bolivian-laws scrapers on demand.
+
+Usage:
+
+    python manage.py scrape_bolivian_laws --source gaceta
+    python manage.py scrape_bolivian_laws --all --since-days 7
+    python manage.py scrape_bolivian_laws --source tcp --max-entries 5 --sync
+
+``--sync`` runs the scrape inline in this process (useful for the
+initial bootstrap or manual backfills). Without it, the command
+enqueues Celery tasks and returns immediately.
+
+``--user-id`` is forwarded to every per-source run, but not to the
+fan-out task enqueued by ``--all`` without ``--sync``; the command
+prints a warning in that case.
+"""
+
+from __future__ import annotations
+
+from django.core.management.base import BaseCommand, CommandError
+
+from opencontractserver.bolivian_laws.scrapers import SCRAPERS
+from opencontractserver.bolivian_laws.tasks import (
+    scrape_and_ingest_all,
+    scrape_and_ingest_source,
+)
+
+
+class Command(BaseCommand):
+    """Run one registered scraper, or all of them, inline or via Celery."""
+
+    help = "Scrape one or all Bolivian legal sources and ingest new PDFs."
+
+    def add_arguments(self, parser):
+        """Register the options; ``--source`` and ``--all`` are exclusive."""
+        group = parser.add_mutually_exclusive_group(required=True)
+        group.add_argument(
+            "--source",
+            choices=sorted(SCRAPERS.keys()),
+            help="Run a single source (gaceta | tsj | tcp).",
+        )
+        group.add_argument(
+            "--all",
+            action="store_true",
+            dest="run_all",
+            help="Run every registered source.",
+        )
+        parser.add_argument(
+            "--since-days",
+            type=int,
+            default=None,
+            help=(
+                "Only ingest entries whose published_at is within the last "
+                "N days (when the listing exposes a date)."
+            ),
+        )
+        parser.add_argument(
+            "--max-entries",
+            type=int,
+            default=None,
+            help="Cap the number of entries processed per source.",
+        )
+        parser.add_argument(
+            "--sync",
+            action="store_true",
+            help="Run inline instead of enqueuing Celery tasks.",
+        )
+        parser.add_argument(
+            "--user-id",
+            type=int,
+            default=None,
+            help=(
+                "Attribute created corpora/documents to this user ID. "
+                "Not forwarded when --all is used without --sync."
+            ),
+        )
+
+    def handle(self, *args, **options):
+        """Dispatch the requested scrape, inline or through Celery.
+
+        Raises:
+            CommandError: if ``--source`` names a scraper that is not
+                registered in ``SCRAPERS``.
+        """
+        source = options.get("source")
+        run_all = options.get("run_all", False)
+        since_days = options.get("since_days")
+        max_entries = options.get("max_entries")
+        run_sync = options.get("sync", False)
+        user_id = options.get("user_id")
+
+        if run_all:
+            if run_sync:
+                summaries = []
+                for key in SCRAPERS:
+                    summary = scrape_and_ingest_source.run(
+                        key,
+                        since_days=since_days,
+                        max_entries=max_entries,
+                        user_id=user_id,
+                    )
+                    summaries.append(summary)
+                self.stdout.write(self.style.SUCCESS(str(summaries)))
+            else:
+                if user_id is not None:
+                    # The fan-out task is enqueued without a user, so say so
+                    # instead of silently dropping the option.
+                    self.stdout.write(
+                        self.style.WARNING(
+                            "--user-id is ignored with --all unless --sync is "
+                            "also given."
+                        )
+                    )
+                result = scrape_and_ingest_all.delay(
+                    since_days=since_days,
+                    max_entries_per_source=max_entries,
+                )
+                self.stdout.write(
+                    self.style.SUCCESS(f"Enqueued fan-out task: {result.id}")
+                )
+            return
+
+        if source not in SCRAPERS:
+            raise CommandError(f"Unknown source: {source!r}")
+
+        if run_sync:
+            summary = scrape_and_ingest_source.run(
+                source,
+                since_days=since_days,
+                max_entries=max_entries,
+                user_id=user_id,
+            )
+            self.stdout.write(self.style.SUCCESS(str(summary)))
+        else:
+            result = scrape_and_ingest_source.delay(
+                source,
+                since_days=since_days,
+                max_entries=max_entries,
+                user_id=user_id,
+            )
+            self.stdout.write(
+                self.style.SUCCESS(f"Enqueued task {result.id} for {source}")
+            )
diff --git a/opencontractserver/bolivian_laws/migrations/0001_initial.py b/opencontractserver/bolivian_laws/migrations/0001_initial.py
new file mode 100644
index 000000000..e415ceaa9
--- /dev/null
+++ b/opencontractserver/bolivian_laws/migrations/0001_initial.py
@@ -0,0 +1,174 @@
+import django.db.models.deletion
+from django.db 
import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ("corpuses", "0047_corpus_license_fields"), + ("documents", "0035_add_enabled_components_to_pipeline_settings"), + ] + + operations = [ + migrations.CreateModel( + name="LegalAreaCorpus", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "area", + models.CharField( + choices=[ + ("constitucional", "Derecho Constitucional"), + ("penal", "Derecho Penal"), + ("civil", "Derecho Civil"), + ("administrativo", "Derecho Administrativo"), + ("laboral", "Derecho Laboral"), + ("tributario", "Derecho Tributario"), + ("familia", "Derecho de Familia"), + ("comercial", "Derecho Comercial"), + ("agrario", "Derecho Agrario"), + ("ambiental", "Derecho Ambiental"), + ("otros", "Otros"), + ], + max_length=32, + unique=True, + ), + ), + ("created", models.DateTimeField(auto_now_add=True)), + ( + "corpus", + models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name="bolivian_law_area", + to="corpuses.corpus", + ), + ), + ], + options={ + "verbose_name": "Bolivian Legal Area Corpus", + "verbose_name_plural": "Bolivian Legal Area Corpora", + }, + ), + migrations.CreateModel( + name="BolivianLegalDocument", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "area", + models.CharField( + choices=[ + ("constitucional", "Derecho Constitucional"), + ("penal", "Derecho Penal"), + ("civil", "Derecho Civil"), + ("administrativo", "Derecho Administrativo"), + ("laboral", "Derecho Laboral"), + ("tributario", "Derecho Tributario"), + ("familia", "Derecho de Familia"), + ("comercial", "Derecho Comercial"), + ("agrario", "Derecho Agrario"), + ("ambiental", "Derecho Ambiental"), + ("otros", "Otros"), + ], + max_length=32, + ), + ), + ( + "source", + models.CharField( + choices=[ + 
("gaceta", "Gaceta Oficial de Bolivia"), + ("tsj", "Tribunal Supremo de Justicia"), + ("tcp", "Tribunal Constitucional Plurinacional"), + ("manual", "Carga manual"), + ], + default="manual", + max_length=16, + ), + ), + ( + "external_id", + models.CharField( + blank=True, + default="", + help_text=( + "Identificador externo (número de gaceta, sentencia, " + "etc.). Opcional; depende de la fuente." + ), + max_length=255, + ), + ), + ("title", models.CharField(max_length=1024)), + ("published_at", models.DateField(blank=True, null=True)), + ( + "pdf_sha256", + models.CharField(max_length=64, unique=True), + ), + ("metadata", models.JSONField(blank=True, default=dict)), + ( + "status", + models.CharField( + choices=[ + ("pending", "Pendiente"), + ("ingested", "Ingestado"), + ("failed", "Fallido"), + ], + default="pending", + max_length=16, + ), + ), + ("last_error", models.TextField(blank=True, default="")), + ("created", models.DateTimeField(auto_now_add=True)), + ("ingested_at", models.DateTimeField(blank=True, null=True)), + ( + "corpus", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="bolivian_legal_records", + to="corpuses.corpus", + ), + ), + ( + "document", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="bolivian_legal_records", + to="documents.document", + ), + ), + ], + options={ + "verbose_name": "Bolivian Legal Document", + "verbose_name_plural": "Bolivian Legal Documents", + "indexes": [ + models.Index( + fields=["area", "status"], + name="bolivian_la_area_a92e57_idx", + ), + models.Index( + fields=["source", "status"], + name="bolivian_la_source_8a08f0_idx", + ), + ], + }, + ), + ] diff --git a/opencontractserver/bolivian_laws/migrations/__init__.py b/opencontractserver/bolivian_laws/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/opencontractserver/bolivian_laws/models.py b/opencontractserver/bolivian_laws/models.py new 
file mode 100644 index 000000000..c808ab053 --- /dev/null +++ b/opencontractserver/bolivian_laws/models.py @@ -0,0 +1,112 @@ +"""Models for the Bolivian Laws RAG service. + +Two models: + +- ``LegalAreaCorpus``: 1-to-1 idempotent mapping ``area → Corpus``. Avoids + hardcoding corpus IDs anywhere else in the system. +- ``BolivianLegalDocument``: tracking record per ingested PDF, providing + global SHA-256 dedupe, source attribution, area classification, and a + back-pointer to the resulting OC ``Document``. +""" + +from __future__ import annotations + +from django.contrib.auth import get_user_model +from django.db import models + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource + +User = get_user_model() + + +class LegalAreaCorpus(models.Model): + """Maps a legal area to its dedicated Corpus. + + Created on-demand by ``services.ingestion.ensure_area_corpus`` the + first time documents for the area are ingested. + """ + + area = models.CharField( + max_length=32, + choices=LegalArea.choices, + unique=True, + ) + corpus = models.OneToOneField( + "corpuses.Corpus", + on_delete=models.CASCADE, + related_name="bolivian_law_area", + ) + created = models.DateTimeField(auto_now_add=True) + + class Meta: + verbose_name = "Bolivian Legal Area Corpus" + verbose_name_plural = "Bolivian Legal Area Corpora" + + def __str__(self) -> str: # pragma: no cover - trivial + return f"{self.get_area_display()} → corpus #{self.corpus_id}" + + +class BolivianLegalDocument(models.Model): + """Tracking record for a Bolivian legal PDF that has been (or is being) + ingested into the area corpus. + + ``pdf_sha256`` is globally unique to provide cheap dedupe across all + sources/areas: the same PDF will not be ingested twice. 
+ """ + + class Status(models.TextChoices): + PENDING = "pending", "Pendiente" + INGESTED = "ingested", "Ingestado" + FAILED = "failed", "Fallido" + + area = models.CharField(max_length=32, choices=LegalArea.choices) + source = models.CharField( + max_length=16, + choices=LegalSource.choices, + default=LegalSource.MANUAL, + ) + external_id = models.CharField( + max_length=255, + blank=True, + default="", + help_text=( + "Identificador externo (número de gaceta, sentencia, etc.). " + "Opcional; depende de la fuente." + ), + ) + title = models.CharField(max_length=1024) + published_at = models.DateField(null=True, blank=True) + pdf_sha256 = models.CharField(max_length=64, unique=True) + metadata = models.JSONField(default=dict, blank=True) + + document = models.ForeignKey( + "documents.Document", + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="bolivian_legal_records", + ) + corpus = models.ForeignKey( + "corpuses.Corpus", + on_delete=models.CASCADE, + related_name="bolivian_legal_records", + ) + status = models.CharField( + max_length=16, + choices=Status.choices, + default=Status.PENDING, + ) + last_error = models.TextField(blank=True, default="") + created = models.DateTimeField(auto_now_add=True) + ingested_at = models.DateTimeField(null=True, blank=True) + + class Meta: + verbose_name = "Bolivian Legal Document" + verbose_name_plural = "Bolivian Legal Documents" + indexes = [ + models.Index(fields=["area", "status"]), + models.Index(fields=["source", "status"]), + ] + + def __str__(self) -> str: # pragma: no cover - trivial + return f"[{self.get_area_display()}] {self.title[:60]}" diff --git a/opencontractserver/bolivian_laws/scrapers/__init__.py b/opencontractserver/bolivian_laws/scrapers/__init__.py new file mode 100644 index 000000000..81f4a18de --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/__init__.py @@ -0,0 +1,25 @@ +"""Scrapers for Bolivian legal sources. 
+ +This package hosts one scraper per public source (Gaceta Oficial, TSJ, +TCP). Every scraper subclasses :class:`base.BaseScraper` and yields +normalized :class:`base.ScrapedEntry` objects that the ingestion layer +turns into ``BolivianLegalDocument`` records + ``Document`` uploads. +""" + +from opencontractserver.bolivian_laws.scrapers.base import ( + BaseScraper, + ScrapedEntry, +) +from opencontractserver.bolivian_laws.scrapers.registry import ( + SCRAPERS, + get_scraper_class, + iter_scraper_classes, +) + +__all__ = [ + "BaseScraper", + "ScrapedEntry", + "SCRAPERS", + "get_scraper_class", + "iter_scraper_classes", +] diff --git a/opencontractserver/bolivian_laws/scrapers/base.py b/opencontractserver/bolivian_laws/scrapers/base.py new file mode 100644 index 000000000..870b72aa9 --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/base.py @@ -0,0 +1,237 @@ +"""Base scraper abstractions for Bolivian legal sources. + +Design goals: + +- **Defensive**: a single broken entry must not abort a batch. The base + class wraps per-entry parsing in try/except and logs, so ``iter_entries`` + always returns a usable iterator. +- **Testable**: HTTP is injected via an ``httpx.Client`` so tests can use + ``httpx.MockTransport`` with fixture HTML/PDFs. +- **Rate-limited**: a simple sleep between requests keeps us polite to + the government sites. The delay is tunable per-source (and zeroed in + tests). +- **Configurable**: listing URLs and per-source overrides come from + Django settings / env vars so deployments can point at staging mirrors + or freeze a specific archive URL without code changes. + +Each concrete scraper provides: + +- ``source_key`` — matches :class:`LegalSource` values. +- ``default_base_url`` / ``default_listing_paths`` — used if not overridden. +- ``extract_entries(html, url)`` — source-specific HTML parsing. 
+""" + +from __future__ import annotations + +import dataclasses +import datetime as dt +import logging +import time +from collections.abc import Iterable, Iterator +from typing import ClassVar +from urllib.parse import urljoin + +import httpx +from bs4 import BeautifulSoup +from django.conf import settings + +logger = logging.getLogger(__name__) + +DEFAULT_USER_AGENT = ( + "OpenContractsBolivianLawsBot/1.0 " "(+https://github.com/JSv4/OpenContracts)" +) +DEFAULT_TIMEOUT_SECONDS = 30.0 +DEFAULT_REQUEST_DELAY_SECONDS = 1.0 + + +@dataclasses.dataclass +class ScrapedEntry: + """A single candidate document discovered by a scraper. + + The scraper produces these from listing pages; the ingestion pipeline + turns them into ``BolivianLegalDocument`` + ``Document`` records. + """ + + source_key: str + pdf_url: str + title: str + external_id: str = "" + published_at: dt.date | None = None + suggested_area: str | None = None + metadata: dict = dataclasses.field(default_factory=dict) + + def as_dict(self) -> dict: + out = dataclasses.asdict(self) + if self.published_at is not None: + out["published_at"] = self.published_at.isoformat() + return out + + +class BaseScraper: + """Template-method base class for Bolivian legal source scrapers.""" + + source_key: ClassVar[str] = "" + default_base_url: ClassVar[str] = "" + default_listing_paths: ClassVar[tuple[str, ...]] = () + settings_base_url_key: ClassVar[str] = "" + settings_listing_paths_key: ClassVar[str] = "" + + def __init__( + self, + *, + client: httpx.Client | None = None, + user_agent: str | None = None, + request_delay_seconds: float | None = None, + base_url: str | None = None, + listing_paths: Iterable[str] | None = None, + ) -> None: + if not self.source_key: + raise NotImplementedError(f"{type(self).__name__} must set source_key.") + + self.user_agent = ( + user_agent + or getattr(settings, "BOLIVIAN_LAWS_SCRAPER_USER_AGENT", None) + or DEFAULT_USER_AGENT + ) + self.request_delay_seconds = ( + request_delay_seconds 
+ if request_delay_seconds is not None + else float( + getattr( + settings, + "BOLIVIAN_LAWS_REQUEST_DELAY_SECONDS", + DEFAULT_REQUEST_DELAY_SECONDS, + ) + ) + ) + self.base_url = ( + base_url + or ( + getattr(settings, self.settings_base_url_key, None) + if self.settings_base_url_key + else None + ) + or self.default_base_url + ) + # Resolution order: explicit argument, then the Django setting named by + # ``settings_listing_paths_key``, then the class default. + resolved_paths = listing_paths + if resolved_paths is None and self.settings_listing_paths_key: + resolved_paths = getattr(settings, self.settings_listing_paths_key, None) + if resolved_paths is None: + resolved_paths = self.default_listing_paths + if isinstance(resolved_paths, str): + # A bare string (e.g. an unparsed env value) would be exploded into + # single characters by tuple(); treat it as a comma-separated list. + resolved_paths = [ + p.strip() for p in resolved_paths.split(",") if p.strip() + ] + self.listing_paths = tuple(resolved_paths) + + # Only close the HTTP client on exit if we created it ourselves. + self._owns_client = client is None + self._client = client or httpx.Client( + headers={"User-Agent": self.user_agent}, + timeout=DEFAULT_TIMEOUT_SECONDS, + follow_redirects=True, + ) + + # -- lifecycle ---------------------------------------------------- + def close(self) -> None: + if self._owns_client: + self._client.close() + + def __enter__(self) -> BaseScraper: + return self + + def __exit__(self, exc_type, exc, tb) -> None: + self.close() + + # -- public API --------------------------------------------------- + def iter_entries( + self, + *, + since: dt.date | None = None, + max_entries: int | None = None, + ) -> Iterator[ScrapedEntry]: + """Yield :class:`ScrapedEntry` objects from every listing page. + + ``since`` is not forwarded to :meth:`extract_entries`; it is applied + here by dropping entries whose ``ScrapedEntry.published_at`` is + older than ``since``. Entries without a date are always kept. 
+ """ + # A non-positive cap means nothing was requested: stop before any HTTP + # request (the post-yield check below would still emit one entry). + if max_entries is not None and max_entries <= 0: + return + count = 0 + for listing_url in self._iter_listing_urls(): + try: + html = self._http_get_text(listing_url) + except Exception as exc: + logger.warning( + "[%s] failed to fetch listing %s: %s", + self.source_key, + listing_url, + exc, + ) + continue + + try: + entries = list(self.extract_entries(html=html, url=listing_url)) + except Exception: + logger.exception( + "[%s] listing parse failure: %s", + self.source_key, + listing_url, + ) + continue + + for entry in entries: + if ( + since is not None + and entry.published_at is not None + and entry.published_at < since + ): + continue + yield entry + count += 1 + if max_entries is not None and count >= max_entries: + return + + def download_pdf(self, entry: ScrapedEntry) -> bytes: + """Download the PDF bytes for the given entry.""" + return self._http_get_bytes(entry.pdf_url) + + # -- hooks for subclasses ----------------------------------------- + def extract_entries(self, *, html: str, url: str) -> Iterable[ScrapedEntry]: + """Return entries parsed from a single listing page.""" + # Default implementation: treat every <a href="*.pdf"> link as an entry. + # Concrete scrapers should override for richer metadata. 
+ soup = BeautifulSoup(html, "html.parser") + for anchor in soup.find_all("a"): + href = (anchor.get("href") or "").strip() + if not href.lower().endswith(".pdf"): + continue + pdf_url = urljoin(url, href) + title = (anchor.get_text() or href.rsplit("/", 1)[-1]).strip() + yield ScrapedEntry( + source_key=self.source_key, + pdf_url=pdf_url, + title=title[:1024], + ) + + # -- helpers ------------------------------------------------------ + def _iter_listing_urls(self) -> Iterator[str]: + for path in self.listing_paths: + yield self._absolute(path) + + def _absolute(self, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return urljoin(self.base_url, path) + + def _http_get_text(self, url: str) -> str: + self._throttle() + logger.debug("[%s] GET %s", self.source_key, url) + response = self._client.get(url) + response.raise_for_status() + return response.text + + def _http_get_bytes(self, url: str) -> bytes: + self._throttle() + logger.debug("[%s] GET(bytes) %s", self.source_key, url) + response = self._client.get(url) + response.raise_for_status() + return response.content + + def _throttle(self) -> None: + if self.request_delay_seconds > 0: + time.sleep(self.request_delay_seconds) diff --git a/opencontractserver/bolivian_laws/scrapers/gaceta.py b/opencontractserver/bolivian_laws/scrapers/gaceta.py new file mode 100644 index 000000000..375296ba1 --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/gaceta.py @@ -0,0 +1,161 @@ +"""Scraper for the Gaceta Oficial de Bolivia. + +Site: https://gacetaoficialdebolivia.gob.bo/ + +The Gaceta publishes laws (Leyes), decretos supremos, resoluciones and +other official norms. 
Its listing page HTML structure is not formally +documented, so this scraper is deliberately defensive: it accepts any +``<a>`` tag that points to a PDF on the same host and enriches the +:class:`ScrapedEntry` with whatever metadata it can extract from the +surrounding row/text (year, issue number, publication date, rough +topic). + +The exact listing URL(s) are read from ``settings.BOLIVIAN_LAWS_GACETA_*`` +so that deployments can point at whichever index the Gaceta currently +exposes without touching code. +""" + +from __future__ import annotations + +import datetime as dt +import logging +import re +from collections.abc import Iterable, Iterator +from urllib.parse import urljoin, urlparse + +from bs4 import BeautifulSoup + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource +from opencontractserver.bolivian_laws.scrapers.base import ( + BaseScraper, + ScrapedEntry, +) + +logger = logging.getLogger(__name__) + +_DATE_PATTERNS = ( + re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})"), + re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})"), +) +# The captured issue id must start with a digit so that phrases such as +# "Gaceta Oficial" or "Edición Especial" are not mistaken for an issue number. +_ISSUE_PATTERN = re.compile( + r"(?:gaceta|edición|edicion|nro\.?|n[º°])\s*(\d[0-9A-Za-z\-]*)", re.I +) +_LEY_PATTERN = re.compile(r"\bley(?:\s+(?:n[º°.]?\s*)?(\d+))?", re.I) +_DECRETO_PATTERN = re.compile(r"decreto\s+supremo\s+(?:n[º°.]?\s*)?([0-9]+)", re.I) + +# Lightweight keyword heuristics for the Gaceta. Not meant to be +# authoritative — the optional LLM classifier takes over when this +# returns OTROS. +_AREA_KEYWORDS: tuple[tuple[str, tuple[str, ...]], ...] 
= ( + # Keywords are matched as substrings of the lowercased text, so short + # tokens are avoided: "iva" is inside "administrativa"/"normativa" and + # "sin " is an ordinary preposition. Tax acronyms are handled separately. + (LegalArea.TRIBUTARIO, ("tributari", "impuesto")), + (LegalArea.LABORAL, ("laboral", "trabajo", "sindicato", "salario")), + (LegalArea.PENAL, ("penal", "delito", "código penal")), + (LegalArea.CIVIL, ("civil", "código civil", "obligaciones")), + (LegalArea.FAMILIA, ("familia", "menor", "niñez", "adolescen")), + (LegalArea.AGRARIO, ("agrari", "inra", "tierras", "comunitaria")), + (LegalArea.AMBIENTAL, ("ambiental", "medio ambiente", "ley 1333")), + (LegalArea.COMERCIAL, ("comercial", "empresa", "sociedad")), + (LegalArea.ADMINISTRATIVO, ("administrativ", "safco", "contrataci")), + (LegalArea.CONSTITUCIONAL, ("constitucional", "cpe", "derechos fundament")), +) + +# Whole-word, case-sensitive tax acronyms. "SIN" is deliberately left out: in +# upper-case titles it cannot be told apart from the preposition "sin". +_TRIBUTARIO_ACRONYM_PATTERN = re.compile(r"\b(?:IVA|IT)\b") + + +def _guess_area(text: str) -> str: + """Return the first area whose keyword occurs in ``text``, else ``OTROS``.""" + # Checked on the original casing and first, matching the table's + # TRIBUTARIO-first priority. + if _TRIBUTARIO_ACRONYM_PATTERN.search(text): + return LegalArea.TRIBUTARIO + lowered = text.lower() + for area, keywords in _AREA_KEYWORDS: + if any(k in lowered for k in keywords): + return area + return LegalArea.OTROS + + +def _parse_spanish_date(text: str) -> dt.date | None: + for pattern in _DATE_PATTERNS: + match = pattern.search(text) + if not match: + continue + groups = match.groups() + try: + if pattern is _DATE_PATTERNS[0]: + day, month, year = (int(g) for g in groups) + else: + year, month, day = (int(g) for g in groups) + return dt.date(year, month, day) + except ValueError: + continue + return None + + +class GacetaOficialScraper(BaseScraper): + """Scraper for https://gacetaoficialdebolivia.gob.bo/.""" + + source_key = LegalSource.GACETA.value + default_base_url = "https://gacetaoficialdebolivia.gob.bo/" + default_listing_paths = ("/",) + settings_base_url_key = "BOLIVIAN_LAWS_GACETA_BASE_URL" + settings_listing_paths_key = "BOLIVIAN_LAWS_GACETA_LISTING_PATHS" + + def extract_entries(self, *, html: str, url: str) -> Iterable[ScrapedEntry]: + soup = BeautifulSoup(html, "html.parser") + host = urlparse(self.base_url).netloc + + for anchor in soup.find_all("a"): + href = (anchor.get("href") or "").strip() + if not href: + continue + if not href.lower().endswith(".pdf"): + continue + + pdf_url = urljoin(url, 
href) + # Stay on the same host to avoid following off-site PDFs + if urlparse(pdf_url).netloc and urlparse(pdf_url).netloc != host: + continue + + link_text = (anchor.get_text() or "").strip() + context = link_text + # Also inspect the parent row for date/issue hints + parent = anchor.find_parent(["tr", "li", "article", "div"]) + if parent is not None: + context = f"{link_text} | {parent.get_text(' ', strip=True)}" + + external_id = "" + ley_match = _LEY_PATTERN.search(context) + decreto_match = _DECRETO_PATTERN.search(context) + issue_match = _ISSUE_PATTERN.search(context) + if ley_match and ley_match.group(1): + external_id = f"LEY-{ley_match.group(1)}" + elif decreto_match: + external_id = f"DS-{decreto_match.group(1)}" + elif issue_match: + external_id = f"GACETA-{issue_match.group(1)}" + + published_at = _parse_spanish_date(context) + suggested_area = _guess_area(context) + title = (link_text or href.rsplit("/", 1)[-1])[:1024] + + yield ScrapedEntry( + source_key=self.source_key, + pdf_url=pdf_url, + title=title, + external_id=external_id, + published_at=published_at, + suggested_area=suggested_area, + metadata={ + "listing_url": url, + "context": context[:500], + }, + ) + + def iter_entries( # type: ignore[override] + self, + *, + since: dt.date | None = None, + max_entries: int | None = None, + ) -> Iterator[ScrapedEntry]: + """Yield entries de-duplicated by ``pdf_url``, capped at ``max_entries``.""" + # Non-positive cap: yield nothing and fetch nothing (the post-yield + # check below would otherwise still emit one entry). + if max_entries is not None and max_entries <= 0: + return + # The cap is applied here, after de-duplication, so the parent iterator + # runs uncapped. + seen_urls: set[str] = set() + for entry in super().iter_entries(since=since, max_entries=None): + if entry.pdf_url in seen_urls: + continue + seen_urls.add(entry.pdf_url) + yield entry + if max_entries is not None and len(seen_urls) >= max_entries: + return diff --git a/opencontractserver/bolivian_laws/scrapers/registry.py b/opencontractserver/bolivian_laws/scrapers/registry.py new file mode 100644 index 000000000..072f37b79 --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/registry.py @@ -0,0 +1,39 @@ +"""Registry of scrapers, keyed by :class:`LegalSource` value. 
+ +The tasks and management commands use this registry to iterate all +sources or look up one by its short key (``gaceta``, ``tsj``, ``tcp``). +""" + +from __future__ import annotations + +from collections.abc import Iterator + +from opencontractserver.bolivian_laws.constants import LegalSource +from opencontractserver.bolivian_laws.scrapers.base import BaseScraper +from opencontractserver.bolivian_laws.scrapers.gaceta import GacetaOficialScraper +from opencontractserver.bolivian_laws.scrapers.tcp import ( + TribunalConstitucionalScraper, +) +from opencontractserver.bolivian_laws.scrapers.tsj import ( + TribunalSupremoJusticiaScraper, +) + +SCRAPERS: dict[str, type[BaseScraper]] = { + LegalSource.GACETA.value: GacetaOficialScraper, + LegalSource.TSJ.value: TribunalSupremoJusticiaScraper, + LegalSource.TCP.value: TribunalConstitucionalScraper, +} + + +def get_scraper_class(source_key: str) -> type[BaseScraper]: + try: + return SCRAPERS[source_key] + except KeyError as exc: + raise KeyError( + f"Unknown scraper source_key={source_key!r}. " + f"Valid keys: {sorted(SCRAPERS)}" + ) from exc + + +def iter_scraper_classes() -> Iterator[type[BaseScraper]]: + return iter(SCRAPERS.values()) diff --git a/opencontractserver/bolivian_laws/scrapers/tcp.py b/opencontractserver/bolivian_laws/scrapers/tcp.py new file mode 100644 index 000000000..6357e0770 --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/tcp.py @@ -0,0 +1,125 @@ +"""Scraper for the Tribunal Constitucional Plurinacional (TCP) of Bolivia. + +Site: https://tcpbolivia.bo/ + +The TCP publishes *Sentencias Constitucionales Plurinacionales* (SCP), +*Declaraciones Constitucionales Plurinacionales* (DCP) and *Autos +Constitucionales* (AC). Every resolution is routed to the +``LegalArea.CONSTITUCIONAL`` corpus: the TCP deals exclusively with +constitutional matters, so area classification is trivial. + +Metadata-wise, we try to extract the resolution number (e.g. 
+``SCP-0250/2012``) and the publication date from the surrounding row. +""" + +from __future__ import annotations + +import datetime as dt +import logging +import re +from collections.abc import Iterable +from urllib.parse import urljoin, urlparse + +from bs4 import BeautifulSoup + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource +from opencontractserver.bolivian_laws.scrapers.base import ( + BaseScraper, + ScrapedEntry, +) + +logger = logging.getLogger(__name__) + +_RESOLUTION_PATTERN = re.compile( + r"\b(SCP|DCP|AC)\s*[-–]?\s*(\d{1,6}/\d{4})", + re.I, +) +_DATE_PATTERN = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})") +# (substring, label) pairs checked in order against the lowercased context. +# Matching is accent-sensitive, so both spellings of "protección" are listed. +_ACCION_KEYWORDS: tuple[tuple[str, str], ...] = ( + ("amparo", "amparo_constitucional"), + ("libertad", "accion_de_libertad"), + ("popular", "accion_popular"), + ("cumplimiento", "accion_de_cumplimiento"), + ("inconstitucional", "accion_de_inconstitucionalidad"), + ("protección de privacidad", "proteccion_de_privacidad"), + ("proteccion de privacidad", "proteccion_de_privacidad"), +) + + +def _parse_date(context: str) -> dt.date | None: + match = _DATE_PATTERN.search(context) + if not match: + return None + try: + day, month, year = (int(g) for g in match.groups()) + return dt.date(year, month, day) + except ValueError: + return None + + +def _detect_accion(context: str) -> str | None: + """Return the label of the first action keyword found in ``context``, else None.""" + lowered = context.lower() + for keyword, label in _ACCION_KEYWORDS: + if keyword in lowered: + return label + return None + + +class TribunalConstitucionalScraper(BaseScraper): + """Scraper for https://tcpbolivia.bo/ constitutional jurisprudence.""" + + source_key = LegalSource.TCP.value + default_base_url = "https://tcpbolivia.bo/" + default_listing_paths = ("/jurisprudencia/",) + settings_base_url_key = "BOLIVIAN_LAWS_TCP_BASE_URL" + settings_listing_paths_key = "BOLIVIAN_LAWS_TCP_LISTING_PATHS" + + def extract_entries(self, *, html: str, url: str) -> Iterable[ScrapedEntry]: + soup = BeautifulSoup(html, "html.parser") + host = urlparse(self.base_url).netloc + + for anchor in soup.find_all("a"): + 
href = (anchor.get("href") or "").strip() + if not href or not href.lower().endswith(".pdf"): + continue + + pdf_url = urljoin(url, href) + if urlparse(pdf_url).netloc and urlparse(pdf_url).netloc != host: + continue + + link_text = (anchor.get_text() or "").strip() + parent = anchor.find_parent(["tr", "li", "article", "div"]) + context = ( + f"{link_text} | {parent.get_text(' ', strip=True)}" + if parent is not None + else link_text + ) + + external_id = "" + resolution_type = "" + match = _RESOLUTION_PATTERN.search(context) + if match: + resolution_type = match.group(1).upper() + external_id = f"{resolution_type}-{match.group(2)}" + + published_at = _parse_date(context) + accion = _detect_accion(context) + title = (link_text or href.rsplit("/", 1)[-1])[:1024] + + metadata = { + "listing_url": url, + "context": context[:500], + } + if resolution_type: + metadata["resolution_type"] = resolution_type + if accion: + metadata["accion"] = accion + + yield ScrapedEntry( + source_key=self.source_key, + pdf_url=pdf_url, + title=title, + external_id=external_id, + published_at=published_at, + suggested_area=LegalArea.CONSTITUCIONAL, + metadata=metadata, + ) diff --git a/opencontractserver/bolivian_laws/scrapers/tsj.py b/opencontractserver/bolivian_laws/scrapers/tsj.py new file mode 100644 index 000000000..f07160c42 --- /dev/null +++ b/opencontractserver/bolivian_laws/scrapers/tsj.py @@ -0,0 +1,120 @@ +"""Scraper for the Tribunal Supremo de Justicia (TSJ) of Bolivia. + +Site: https://tsj.bo/ + +The TSJ publishes *autos supremos* and *sentencias* grouped by sala +(Civil, Penal, Social / Laboral, Contencioso-Administrativa, etc.). +Each resolution is typically a PDF accessible from the jurisprudence +index. + +This scraper maps sala names (found in the listing context) to +:class:`LegalArea` values so the ingestion pipeline lands each PDF in +its specialist corpus automatically. 
+""" + +from __future__ import annotations + +import datetime as dt +import logging +import re +from collections.abc import Iterable +from urllib.parse import urljoin, urlparse + +from bs4 import BeautifulSoup + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource +from opencontractserver.bolivian_laws.scrapers.base import ( + BaseScraper, + ScrapedEntry, +) + +logger = logging.getLogger(__name__) + +_AUTO_PATTERN = re.compile( + r"(?:auto\s+supremo|a\.s\.|sentencia)\s*(?:n[º°.]?\s*)?([0-9\-/]+)", + re.I, +) +_DATE_PATTERN = re.compile(r"(\d{1,2})/(\d{1,2})/(\d{4})") + +# (keyword stems, area) pairs checked in order against the lowercased context; +# the first row with any matching stem wins. +_SALA_TO_AREA: tuple[tuple[tuple[str, ...], str], ...] = ( + (("civil",), LegalArea.CIVIL), + (("penal",), LegalArea.PENAL), + (("social", "laboral"), LegalArea.LABORAL), + (("administrativ", "contencios"), LegalArea.ADMINISTRATIVO), + (("familia",), LegalArea.FAMILIA), + (("agrari", "agroambien"), LegalArea.AGRARIO), + (("tributari",), LegalArea.TRIBUTARIO), +) + + +def _guess_area_from_context(context: str) -> str: + """Map sala keywords found in ``context`` to a legal area, else ``OTROS``.""" + lowered = context.lower() + for keywords, area in _SALA_TO_AREA: + if any(k in lowered for k in keywords): + return area + return LegalArea.OTROS + + +def _parse_date(context: str) -> dt.date | None: + match = _DATE_PATTERN.search(context) + if not match: + return None + try: + day, month, year = (int(g) for g in match.groups()) + return dt.date(year, month, day) + except ValueError: + return None + + +class TribunalSupremoJusticiaScraper(BaseScraper): + """Scraper for https://tsj.bo/ jurisprudence listings.""" + + source_key = LegalSource.TSJ.value + default_base_url = "https://tsj.bo/" + default_listing_paths = ("/jurisprudencia/",) + settings_base_url_key = "BOLIVIAN_LAWS_TSJ_BASE_URL" + settings_listing_paths_key = "BOLIVIAN_LAWS_TSJ_LISTING_PATHS" + + def extract_entries(self, *, html: str, url: str) -> Iterable[ScrapedEntry]: + soup = BeautifulSoup(html, "html.parser") + host = urlparse(self.base_url).netloc + + for anchor in soup.find_all("a"): + href = 
(anchor.get("href") or "").strip() + if not href or not href.lower().endswith(".pdf"): + continue + + pdf_url = urljoin(url, href) + if urlparse(pdf_url).netloc and urlparse(pdf_url).netloc != host: + continue + + link_text = (anchor.get_text() or "").strip() + parent = anchor.find_parent(["tr", "li", "article", "div"]) + context = ( + f"{link_text} | {parent.get_text(' ', strip=True)}" + if parent is not None + else link_text + ) + + external_id = "" + match = _AUTO_PATTERN.search(context) + if match: + external_id = f"AS-{match.group(1)}" + + published_at = _parse_date(context) + suggested_area = _guess_area_from_context(context) + title = (link_text or href.rsplit("/", 1)[-1])[:1024] + + yield ScrapedEntry( + source_key=self.source_key, + pdf_url=pdf_url, + title=title, + external_id=external_id, + published_at=published_at, + suggested_area=suggested_area, + metadata={ + "listing_url": url, + "context": context[:500], + "sala_hint": suggested_area, + }, + ) diff --git a/opencontractserver/bolivian_laws/services/__init__.py b/opencontractserver/bolivian_laws/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/opencontractserver/bolivian_laws/services/agents.py b/opencontractserver/bolivian_laws/services/agents.py new file mode 100644 index 000000000..c9c5fe18c --- /dev/null +++ b/opencontractserver/bolivian_laws/services/agents.py @@ -0,0 +1,246 @@ +"""Specialist + orchestrator agents for the Bolivian Laws RAG service. + +Two factory functions: + +- ``build_specialist_agent(area)`` — wraps ``agents.for_corpus`` with + area-specific persona/instructions, bound to that area's corpus. +- ``build_orchestrator_agent()`` — a top-level pydantic_ai agent whose + tools are async functions that delegate to specialist agents. It + decides which specialist(s) to consult and synthesises the answer. + +Both are async-only (the underlying OC agent API is async-only). 
+""" + +from __future__ import annotations + +import asyncio +import logging +from dataclasses import dataclass +from typing import Optional + +from asgiref.sync import sync_to_async +from django.conf import settings + +from opencontractserver.bolivian_laws.constants import ( + AREA_PROFILES, + ORCHESTRATOR_PERSONA, + LegalArea, + get_profile, +) +from opencontractserver.bolivian_laws.models import LegalAreaCorpus +from opencontractserver.llms import agents as oc_agents +from opencontractserver.llms.agents.core_agents import ( + CoreAgent, + SourceNode, + UnifiedChatResponse, +) + +logger = logging.getLogger(__name__) + + +@dataclass +class OrchestratorSource: + """A source node tagged with the specialist area it came from.""" + + area: str + document_id: Optional[int] + snippet: str + similarity_score: float = 1.0 + + +@dataclass +class OrchestratorResponse: + """Aggregated response from the orchestrator across one or more + specialists. + """ + + answer: str + consulted_areas: list[str] + sources: list[OrchestratorSource] + conversation_id: Optional[int] = None + + +def _format_sources(area: str, sources: list[SourceNode]) -> list[OrchestratorSource]: + """Convert agent source nodes into ``OrchestratorSource`` rows tagged with ``area``. + + The document id is read from the node metadata (``document_id`` or + ``doc_id``) and the snippet is truncated to 600 characters. + """ + out: list[OrchestratorSource] = [] + for s in sources: + doc_id = None + meta = s.metadata or {} + if isinstance(meta, dict): + doc_id = meta.get("document_id") or meta.get("doc_id") + # Default to 1.0 only when the score is missing; a real 0.0 must + # survive (``x or 1.0`` would turn it into 1.0). + score = getattr(s, "similarity_score", None) + out.append( + OrchestratorSource( + area=area, + document_id=doc_id, + snippet=(s.content or "")[:600], + similarity_score=1.0 if score is None else score, + ) + ) + return out + + +@sync_to_async +def _resolve_area_corpus_id(area: str) -> Optional[int]: + """Look up the corpus_id for a given area, or None if not yet ingested.""" + binding = ( + LegalAreaCorpus.objects.filter(area=area) + .values_list("corpus_id", flat=True) + .first() + ) + return binding + + +async def build_specialist_agent( + area: str, + *, + user_id: Optional[int] = None, + conversation_id: Optional[int] = None, + model: Optional[str] = None, + 
async def ask_specialists(
    areas: list[str],
    question: str,
    *,
    user_id: Optional[int] = None,
) -> OrchestratorResponse:
    """Skip orchestration: call N specialists in parallel and concatenate
    their answers verbatim. Cheaper than the orchestrator when the
    caller already knows which areas are relevant.

    Args:
        areas: Area codes to consult. Duplicates are collapsed (first
            occurrence wins) so each specialist is called at most once.
        question: The question forwarded unchanged to every specialist.
        user_id: Optional user id passed through to ``consult_specialist``.

    Returns:
        An ``OrchestratorResponse`` whose ``answer`` holds one ``###``
        section per area, in request order. A specialist that fails
        contributes an ``_Error: ..._`` section instead of aborting the
        whole call.
    """
    # Materialise once: the sequence is iterated twice (dispatch + zip),
    # and de-duplicating avoids paying for the same specialist twice.
    unique_areas = list(dict.fromkeys(areas))

    results = await asyncio.gather(
        *(consult_specialist(a, question, user_id=user_id) for a in unique_areas),
        return_exceptions=True,
    )

    parts: list[str] = []
    sources: list[OrchestratorSource] = []
    consulted: list[str] = []

    for area, result in zip(unique_areas, results):
        consulted.append(area)
        # BaseException, not Exception: with return_exceptions=True gather
        # also hands back asyncio.CancelledError instances, which would
        # otherwise reach the tuple-unpacking below and raise TypeError.
        if isinstance(result, BaseException):
            parts.append(f"### {area}\n_Error: {result}_")
            continue
        answer, srcs = result
        parts.append(f"### {get_profile(area).title}\n{answer}")
        sources.extend(srcs)

    return OrchestratorResponse(
        answer="\n\n".join(parts),
        consulted_areas=consulted,
        sources=sources,
    )
+ ) + return _tool + + tools = [Tool(_make_tool(area.value)) for area in LegalArea] + + agent = Agent( + chosen_model, + instructions=ORCHESTRATOR_PERSONA, + output_type=str, # type: ignore[arg-type] + tools=tools, + ) + + result = await agent.run(question) + + return OrchestratorResponse( + answer=result.output or "", + consulted_areas=captured_areas, + sources=captured_sources, + conversation_id=conversation_id, + ) diff --git a/opencontractserver/bolivian_laws/services/ingestion.py b/opencontractserver/bolivian_laws/services/ingestion.py new file mode 100644 index 000000000..e8c98a6a0 --- /dev/null +++ b/opencontractserver/bolivian_laws/services/ingestion.py @@ -0,0 +1,336 @@ +"""Ingestion service: PDFs → per-area Corpus. + +Three responsibilities: + +1. ``ensure_area_corpus(area, user)`` — idempotent corpus creation per area. +2. ``ingest_pdf(...)`` — SHA-256 dedupe + ``Corpus.import_content()`` call. +3. ``classify_pdf_area(...)`` — optional LLM-based area classifier. + +These are deliberately framework-agnostic Python functions (not Celery +tasks) so the management command can call them inline or wrap them in a +task as needed. 
+""" + +from __future__ import annotations + +import hashlib +import logging +from pathlib import Path +from typing import Optional, Union + +from django.contrib.auth import get_user_model +from django.db import transaction +from django.utils import timezone + +from opencontractserver.bolivian_laws.constants import ( + AREA_PROFILES, + LegalArea, + LegalSource, + corpus_slug_for_area, + get_profile, +) +from opencontractserver.bolivian_laws.models import ( + BolivianLegalDocument, + LegalAreaCorpus, +) +from opencontractserver.corpuses.models import Corpus + +logger = logging.getLogger(__name__) +User = get_user_model() + +PDF_MIMETYPE = "application/pdf" + + +def _resolve_user(user) -> User: + """Resolve the ingestion user; fall back to the first superuser.""" + if user is not None: + return user + su = User.objects.filter(is_superuser=True).order_by("pk").first() + if su is None: + raise RuntimeError( + "No user provided and no superuser exists; cannot create corpus." + ) + return su + + +def ensure_area_corpus(area: str, user=None) -> Corpus: + """Get or create the dedicated corpus for ``area``. + + Idempotent: subsequent calls return the same corpus. The + ``preferred_embedder`` and ``corpus_agent_instructions`` are seeded + from ``AREA_PROFILES`` and from the platform's default embedder. + + Args: + area: A ``LegalArea`` value (string). + user: User to record as creator. Defaults to the first superuser. + + Returns: + The ``Corpus`` instance bound to that area. + """ + if area not in AREA_PROFILES: + raise ValueError(f"Unknown legal area: {area!r}") + + existing = ( + LegalAreaCorpus.objects.filter(area=area).select_related("corpus").first() + ) + if existing is not None: + return existing.corpus + + profile = get_profile(area) + creator = _resolve_user(user) + + with transaction.atomic(): + # Re-check inside the transaction to avoid races. 
+ existing = ( + LegalAreaCorpus.objects.select_for_update() + .filter(area=area) + .select_related("corpus") + .first() + ) + if existing is not None: + return existing.corpus + + corpus = Corpus.objects.create( + title=profile.title, + description=profile.description, + slug=corpus_slug_for_area(area), + corpus_agent_instructions=profile.agent_instructions, + creator=creator, + is_public=False, + ) + LegalAreaCorpus.objects.create(area=area, corpus=corpus) + logger.info( + "Created Bolivian-laws corpus for area=%s id=%s slug=%s", + area, + corpus.pk, + corpus.slug, + ) + return corpus + + +def _read_bytes(pdf: Union[str, Path, bytes]) -> bytes: + if isinstance(pdf, (str, Path)): + return Path(pdf).read_bytes() + if isinstance(pdf, (bytes, bytearray)): + return bytes(pdf) + raise TypeError(f"Unsupported pdf input type: {type(pdf)!r}") + + +def _sha256(content: bytes) -> str: + return hashlib.sha256(content).hexdigest() + + +def ingest_pdf( + pdf: Union[str, Path, bytes], + *, + area: str, + title: str, + source: str = LegalSource.MANUAL, + external_id: str = "", + published_at=None, + metadata: Optional[dict] = None, + filename: Optional[str] = None, + user=None, +) -> BolivianLegalDocument: + """Ingest a single PDF into the area's corpus, with SHA-256 dedupe. + + If a record with the same SHA-256 already exists (regardless of area + or source), this is a no-op and the existing record is returned. + + Returns: + The ``BolivianLegalDocument`` tracking record. ``status`` is + ``INGESTED`` on success, or ``FAILED`` if the underlying import + raised — in which case the exception is re-raised after the + record is persisted with ``last_error``. 
def infer_metadata_from_filename(name: str) -> dict:
    """Best-effort metadata extraction from a filename.

    Convention (orientative, not enforced):
        ``[area]_[year]_[number]_[title].pdf``

    Args:
        name: A filename or path; only the stem (no directory, no final
            extension) is inspected.

    Returns:
        A dict containing any of ``area`` (str), ``year`` (int),
        ``number`` (str, leading zeros preserved) and ``title_hint``
        (str). Fields that cannot be inferred are absent. Never raises
        for string input.
    """

    def _is_number(token: str) -> bool:
        # isdecimal(), not isdigit(): isdigit() also accepts characters
        # such as superscript digits that int() rejects with ValueError.
        return token.isdecimal()

    def _is_year(token: str) -> bool:
        return len(token) == 4 and _is_number(token)

    # str.split always returns at least one element, so parts[0] is safe.
    parts = Path(name).stem.split("_")
    out: dict = {}

    head = parts[0].lower()
    if head in AREA_PROFILES:
        out["area"] = head
        parts = parts[1:]
    elif not _is_number(head) and len(parts) > 1 and _is_year(parts[1]):
        # An unrecognised word sits in the area slot and is followed by a
        # year: ignore it so year/number still parse.
        parts = parts[1:]

    if parts and _is_year(parts[0]):
        out["year"] = int(parts[0])
        parts = parts[1:]

    if parts and _is_number(parts[0]):
        out["number"] = parts[0]
        parts = parts[1:]

    if parts:
        # Hyphens become spaces; runs of whitespace (from empty tokens or
        # adjacent hyphens) collapse to one space.
        hint = " ".join(" ".join(parts).replace("-", " ").split())
        if hint:
            out["title_hint"] = hint

    return out
+ """ + from django.conf import settings + + preview = _extract_pdf_text_preview(pdf) + title_hint = title or "(sin título)" + if not preview: + logger.warning( + "Classifier got empty preview for title=%r; defaulting to OTROS", title_hint + ) + return LegalArea.OTROS + + classifier_model = ( + model + or getattr(settings, "BOLIVIAN_LAWS_CLASSIFIER_MODEL", None) + or "gpt-4o-mini" + ) + + valid_areas = ", ".join(a.value for a in LegalArea) + prompt = ( + "Clasifica el siguiente documento jurídico boliviano en UNA de " + f"estas áreas: {valid_areas}. Responde solo el código (en " + "minúsculas, sin comillas).\n\n" + f"Título: {title_hint}\n\n" + f"Inicio del documento:\n{preview}" + ) + + try: + # Use a minimal corpus-less structured response: we don't have a + # corpus context here, so we fall back to a simple LLM call via + # the agents API by attaching to any existing corpus is not + # appropriate. Instead, use the structured response API on a + # placeholder document — but we have none. So we use a direct + # pydantic_ai call here, kept tiny and isolated. + from pydantic_ai import Agent + + agent = Agent(classifier_model, output_type=str) # type: ignore[arg-type] + result = await agent.run(prompt) + raw = (result.output or "").strip().lower() + for area in LegalArea: + if raw == area.value or raw.startswith(area.value): + return area.value + logger.warning( + "Classifier returned unrecognized area %r; defaulting to OTROS", raw + ) + return LegalArea.OTROS + except Exception: + logger.exception("LLM classification failed; defaulting to OTROS") + return LegalArea.OTROS diff --git a/opencontractserver/bolivian_laws/tasks.py b/opencontractserver/bolivian_laws/tasks.py new file mode 100644 index 000000000..87829a2bb --- /dev/null +++ b/opencontractserver/bolivian_laws/tasks.py @@ -0,0 +1,215 @@ +"""Celery tasks for the Bolivian Laws RAG service. 
@shared_task(name="bolivian_laws.ingest_pdf_async")
def ingest_pdf_async(
    pdf_path: str,
    *,
    area: str,
    title: str,
    source: str = LegalSource.MANUAL,
    external_id: str = "",
    published_at: str | None = None,
    metadata: dict | None = None,
    user_id: int | None = None,
) -> int:
    """Celery entry point that forwards a PDF path to ``ingest_pdf``.

    ``published_at`` is an ISO date string; an unparseable value is
    logged and dropped rather than failing the task. ``user_id`` is
    looked up by primary key, and an unknown id is forwarded as
    ``user=None``.

    Returns:
        Primary key of the ``BolivianLegalDocument`` returned by
        ``ingest_pdf``.
    """
    from django.contrib.auth import get_user_model

    owner = (
        get_user_model().objects.filter(pk=user_id).first()
        if user_id is not None
        else None
    )

    publication_date = None
    if published_at:
        try:
            publication_date = dt.date.fromisoformat(published_at)
        except ValueError:
            logger.warning("Invalid published_at=%r; ignoring.", published_at)

    return ingest_pdf(
        pdf_path,
        area=area,
        title=title,
        source=source,
        external_id=external_id,
        published_at=publication_date,
        metadata=metadata,
        user=owner,
    ).pk
@shared_task(
    name="bolivian_laws.scrape_and_ingest_source",
    acks_late=True,
)
def scrape_and_ingest_source(
    source_key: str,
    *,
    since_days: int | None = None,
    max_entries: int | None = None,
    user_id: int | None = None,
) -> dict:
    """Run a single scraper and ingest everything it yields.

    Args:
        source_key: Key passed to ``get_scraper_class`` and recorded as
            the ingestion ``source``.
        since_days: Look-back window in days. ``None`` falls back to the
            ``BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS`` setting (default 30).
            A value ``<= 0`` disables the date cutoff.
        max_entries: Forwarded unchanged to ``scraper.iter_entries``.
        user_id: Optional primary key of the user passed to ``ingest_pdf``.

    Returns:
        A summary dict with ``source``, ``since`` and the counters
        ``discovered``, ``ingested``, ``dedupe_hits`` and ``failed``, so
        the caller (or Flower) can see at a glance what happened.
    """
    # Imports live inside the task so Celery worker startup doesn't
    # need httpx/bs4 resolved before Django is ready.
    import hashlib

    from django.contrib.auth import get_user_model

    from opencontractserver.bolivian_laws.models import BolivianLegalDocument
    from opencontractserver.bolivian_laws.scrapers import get_scraper_class

    user = None
    if user_id is not None:
        user = get_user_model().objects.filter(pk=user_id).first()

    if since_days is not None:
        lookback_days = since_days
    else:
        # Only an absent/blank setting falls back to 30. An explicit 0
        # must survive so the "no cutoff" branch below stays reachable
        # from configuration.
        configured = getattr(settings, "BOLIVIAN_LAWS_SCRAPE_LOOKBACK_DAYS", None)
        lookback_days = 30 if configured in (None, "") else int(configured)

    since: dt.date | None = None
    if lookback_days > 0:
        since = dt.date.today() - dt.timedelta(days=lookback_days)

    scraper_cls = get_scraper_class(source_key)
    summary = {
        "source": source_key,
        "since": since.isoformat() if since else None,
        "discovered": 0,
        "ingested": 0,
        "dedupe_hits": 0,
        "failed": 0,
    }

    with scraper_cls() as scraper:
        for entry in scraper.iter_entries(since=since, max_entries=max_entries):
            summary["discovered"] += 1
            try:
                pdf_bytes = scraper.download_pdf(entry)
            except Exception as exc:
                summary["failed"] += 1
                logger.warning(
                    "[%s] failed to download %s: %s",
                    source_key,
                    entry.pdf_url,
                    exc,
                )
                continue

            # Cheap pre-check so already-known PDFs are counted as dedupe
            # hits instead of being handed to ingest_pdf again.
            sha = hashlib.sha256(pdf_bytes).hexdigest()
            if BolivianLegalDocument.objects.filter(pdf_sha256=sha).exists():
                summary["dedupe_hits"] += 1
                continue

            area = _resolve_scrape_area(entry)
            basename = entry.pdf_url.rsplit("/", 1)[-1]
            try:
                ingest_pdf(
                    pdf_bytes,
                    area=area,
                    title=entry.title or basename,
                    source=source_key,
                    external_id=entry.external_id,
                    published_at=entry.published_at,
                    # Tolerate entries whose metadata is None.
                    metadata={**(entry.metadata or {}), "pdf_url": entry.pdf_url},
                    user=user,
                    filename=basename or None,
                )
            except Exception as exc:
                summary["failed"] += 1
                logger.exception(
                    "[%s] ingest failed for %s: %s",
                    source_key,
                    entry.pdf_url,
                    exc,
                )
                continue

            summary["ingested"] += 1

    logger.info(
        "Bolivian-laws scrape finished: %s",
        {k: v for k, v in summary.items() if v is not None},
    )
    return summary
@shared_task(name="bolivian_laws.scrape_and_ingest_all")
def scrape_and_ingest_all(
    *,
    since_days: int | None = None,
    max_entries_per_source: int | None = None,
) -> list[str]:
    """Enqueue one :func:`scrape_and_ingest_source` task per key in ``SCRAPERS``.

    Both keyword arguments are forwarded to every enqueued task
    (``max_entries_per_source`` as ``max_entries``).

    Returns:
        The ids of the enqueued tasks, in ``SCRAPERS`` iteration order.
    """
    from opencontractserver.bolivian_laws.scrapers import SCRAPERS

    return [
        scrape_and_ingest_source.delay(
            key,
            since_days=since_days,
            max_entries=max_entries_per_source,
        ).id
        for key in SCRAPERS
    ]
+""" + +from __future__ import annotations + +import asyncio +from dataclasses import dataclass, field +from typing import Any +from unittest.mock import AsyncMock, MagicMock, patch + +from django.contrib.auth import get_user_model +from django.test import TestCase + +from opencontractserver.bolivian_laws.constants import LegalArea +from opencontractserver.bolivian_laws.services.agents import ( + OrchestratorResponse, + ask_specialists, + consult_specialist, +) +from opencontractserver.bolivian_laws.services.ingestion import ensure_area_corpus + +User = get_user_model() + + +@dataclass +class _FakeSource: + content: str + metadata: dict = field(default_factory=dict) + similarity_score: float = 0.9 + + +@dataclass +class _FakeResponse: + content: str + sources: list = field(default_factory=list) + + +def _run(coro): + return asyncio.get_event_loop().run_until_complete(coro) + + +class TestSpecialistConsultation(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_agent_admin", + password="testpass123", + email="bl_agent@test.com", + ) + # Pre-create the area corpus so the agent layer finds it. + ensure_area_corpus(LegalArea.PENAL, user=cls.user) + + def test_consult_specialist_tags_sources_with_area(self): + fake_agent = MagicMock() + fake_agent.chat = AsyncMock( + return_value=_FakeResponse( + content="Respuesta penal.", + sources=[ + _FakeSource(content="art. 263 CP", metadata={"document_id": 42}) + ], + ) + ) + with patch( + "opencontractserver.bolivian_laws.services.agents.oc_agents.for_corpus", + new=AsyncMock(return_value=fake_agent), + ): + answer, sources = _run( + consult_specialist(LegalArea.PENAL.value, "¿Qué dice el art. 
263?") + ) + self.assertEqual(answer, "Respuesta penal.") + self.assertEqual(len(sources), 1) + self.assertEqual(sources[0].area, "penal") + self.assertEqual(sources[0].document_id, 42) + self.assertIn("263", sources[0].snippet) + + def test_consult_specialist_handles_missing_corpus(self): + # Civil corpus was never created. + answer, sources = _run( + consult_specialist(LegalArea.CIVIL.value, "Pregunta cualquiera") + ) + self.assertIn("Sin corpus disponible", answer) + self.assertEqual(sources, []) + + +class TestAskSpecialistsParallel(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_parallel_admin", + password="testpass123", + email="bl_parallel@test.com", + ) + ensure_area_corpus(LegalArea.PENAL, user=cls.user) + ensure_area_corpus(LegalArea.CONSTITUCIONAL, user=cls.user) + + def test_aggregates_answers_from_multiple_specialists(self): + async def _fake_for_corpus(corpus, **kwargs) -> Any: + agent = MagicMock() + # Return area-specific answer based on the system prompt + sp = kwargs.get("system_prompt", "") + if "constitucional" in sp.lower(): + agent.chat = AsyncMock( + return_value=_FakeResponse( + content="Constitucional dice X.", + sources=[_FakeSource(content="Art. 14 CPE")], + ) + ) + else: + agent.chat = AsyncMock( + return_value=_FakeResponse( + content="Penal dice Y.", + sources=[_FakeSource(content="Art. 
263 CP")], + ) + ) + return agent + + with patch( + "opencontractserver.bolivian_laws.services.agents.oc_agents.for_corpus", + new=_fake_for_corpus, + ): + response: OrchestratorResponse = _run( + ask_specialists( + [LegalArea.CONSTITUCIONAL.value, LegalArea.PENAL.value], + "Caso de detención sin orden", + ) + ) + self.assertIn("Constitucional dice X.", response.answer) + self.assertIn("Penal dice Y.", response.answer) + self.assertEqual(set(response.consulted_areas), {"constitucional", "penal"}) + self.assertEqual(len(response.sources), 2) + self.assertEqual( + {s.area for s in response.sources}, {"constitucional", "penal"} + ) diff --git a/opencontractserver/bolivian_laws/tests/test_command.py b/opencontractserver/bolivian_laws/tests/test_command.py new file mode 100644 index 000000000..bc115d58e --- /dev/null +++ b/opencontractserver/bolivian_laws/tests/test_command.py @@ -0,0 +1,82 @@ +"""Test the ingest_bolivian_laws management command.""" + +from __future__ import annotations + +import tempfile +from io import StringIO +from pathlib import Path +from unittest.mock import patch + +from django.contrib.auth import get_user_model +from django.core.management import call_command +from django.core.management.base import CommandError +from django.test import TestCase + +from opencontractserver.bolivian_laws.constants import LegalArea +from opencontractserver.bolivian_laws.models import BolivianLegalDocument +from opencontractserver.corpuses.models import Corpus + +User = get_user_model() + + +def _fake_import_content(self, *, content, user, filename=None, **kwargs): + class _Doc: + pk = 1 + id = 1 + + return _Doc(), "created", None + + +class TestIngestCommand(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_cmd_admin", + password="testpass123", + email="bl_cmd@test.com", + ) + + def setUp(self): + self.tmp = tempfile.TemporaryDirectory() + self.addCleanup(self.tmp.cleanup) + self.tmp_path = 
class TestLegalAreaProfiles(SimpleTestCase):
    """Checks on the static area/source tables in the constants module."""

    def test_every_area_has_profile(self):
        # Every enum member needs a fully populated profile entry.
        for member in LegalArea:
            key = member.value
            self.assertIn(key, AREA_PROFILES)
            entry = AREA_PROFILES[key]
            self.assertIsInstance(entry, AreaProfile)
            self.assertTrue(entry.title.startswith("Bolivia — "))
            for text in (
                entry.description,
                entry.agent_persona,
                entry.agent_instructions,
            ):
                self.assertTrue(text)

    def test_get_profile_unknown_raises(self):
        self.assertRaises(KeyError, get_profile, "not-a-real-area")

    def test_corpus_slug_is_deterministic(self):
        expectations = (
            (LegalArea.CONSTITUCIONAL, "bolivia-constitucional"),
            (LegalArea.PENAL, "bolivia-penal"),
        )
        for area, slug in expectations:
            self.assertEqual(corpus_slug_for_area(area), slug)

    def test_legal_source_choices_present(self):
        available = {s.value for s in LegalSource}
        for key in ("gaceta", "tsj", "tcp", "manual"):
            self.assertIn(key, available)
"created", None + + +class TestEnsureAreaCorpus(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_test_admin", + password="testpass123", + email="bl_admin@test.com", + ) + + def test_creates_corpus_idempotently(self): + first = ensure_area_corpus(LegalArea.CONSTITUCIONAL, user=self.user) + second = ensure_area_corpus(LegalArea.CONSTITUCIONAL, user=self.user) + self.assertEqual(first.pk, second.pk) + self.assertEqual( + LegalAreaCorpus.objects.filter(area="constitucional").count(), 1 + ) + + def test_corpus_seeded_with_profile_fields(self): + corpus = ensure_area_corpus(LegalArea.PENAL, user=self.user) + self.assertTrue(corpus.title.startswith("Bolivia — ")) + self.assertTrue(corpus.corpus_agent_instructions) + self.assertEqual(corpus.slug, "bolivia-penal") + + def test_unknown_area_raises(self): + with self.assertRaises(ValueError): + ensure_area_corpus("not-a-real-area", user=self.user) + + +class TestIngestPdf(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_ingest_admin", + password="testpass123", + email="bl_ingest@test.com", + ) + + def test_ingest_creates_record_and_calls_import_content(self): + with patch.object( + Corpus, "import_content", autospec=True, side_effect=_fake_import_content + ) as mock_import: + record = ingest_pdf( + b"%PDF-fake", + area=LegalArea.LABORAL, + title="Ley General del Trabajo", + source=LegalSource.MANUAL, + user=self.user, + ) + mock_import.assert_called_once() + self.assertEqual(record.status, BolivianLegalDocument.Status.INGESTED) + self.assertEqual(record.area, "laboral") + self.assertEqual(record.source, "manual") + self.assertIsNotNone(record.ingested_at) + self.assertEqual(record.pdf_sha256, _sha256_of(b"%PDF-fake")) + + def test_dedupe_returns_existing_record(self): + with patch.object( + Corpus, "import_content", autospec=True, side_effect=_fake_import_content + ): + first = ingest_pdf( + b"%PDF-dup", 
+ area=LegalArea.CIVIL, + title="Doc A", + user=self.user, + ) + second = ingest_pdf( + b"%PDF-dup", + area=LegalArea.PENAL, # different area, same bytes + title="Doc A again", + user=self.user, + ) + self.assertEqual(first.pk, second.pk) + self.assertEqual(BolivianLegalDocument.objects.count(), 1) + + def test_failure_marks_record_failed_and_reraises(self): + with patch.object( + Corpus, "import_content", autospec=True, side_effect=RuntimeError("boom") + ): + with self.assertRaises(RuntimeError): + ingest_pdf( + b"%PDF-fail", + area=LegalArea.TRIBUTARIO, + title="Doc fail", + user=self.user, + ) + record = BolivianLegalDocument.objects.get(pdf_sha256=_sha256_of(b"%PDF-fail")) + self.assertEqual(record.status, BolivianLegalDocument.Status.FAILED) + self.assertIn("boom", record.last_error) + + +class TestFilenameInference(TestCase): + def test_full_convention(self): + out = infer_metadata_from_filename("constitucional_2009_001_cpe.pdf") + self.assertEqual(out["area"], "constitucional") + self.assertEqual(out["year"], 2009) + self.assertEqual(out["number"], "001") + self.assertEqual(out["title_hint"], "cpe") + + def test_unknown_area_token_ignored(self): + out = infer_metadata_from_filename("contrato_2024_05_cliente.pdf") + # "contrato" is not a known area, so area is omitted; year/number still parse + self.assertNotIn("area", out) + self.assertEqual(out["year"], 2024) + + +def _sha256_of(content: bytes) -> str: + import hashlib + + return hashlib.sha256(content).hexdigest() diff --git a/opencontractserver/bolivian_laws/tests/test_scrape_tasks.py b/opencontractserver/bolivian_laws/tests/test_scrape_tasks.py new file mode 100644 index 000000000..b9c30826e --- /dev/null +++ b/opencontractserver/bolivian_laws/tests/test_scrape_tasks.py @@ -0,0 +1,202 @@ +"""Tests for the scraping Celery tasks. 
+ +The scraper itself is mocked at the class level (``iter_entries`` + +``download_pdf``) so these tests focus on orchestration: SHA-256 +dedupe, status counting, and fan-out wiring. +""" + +from __future__ import annotations + +import datetime as dt +from collections.abc import Iterable +from contextlib import contextmanager +from unittest.mock import patch + +from django.contrib.auth import get_user_model +from django.test import TestCase + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource +from opencontractserver.bolivian_laws.models import BolivianLegalDocument +from opencontractserver.bolivian_laws.scrapers.base import ScrapedEntry +from opencontractserver.bolivian_laws.tasks import ( + scrape_and_ingest_all, + scrape_and_ingest_source, +) +from opencontractserver.corpuses.models import Corpus + +User = get_user_model() + + +class _FakeDocument: + def __init__(self, pk: int = 1) -> None: + self.pk = pk + self.id = pk + + +def _fake_import_content(self, *, content, user, filename=None, **kwargs): + return _FakeDocument(), "created", None + + +class _FakeScraper: + """Stand-in that bypasses HTTP entirely.""" + + entries: list[ScrapedEntry] = [] + pdf_map: dict[str, bytes] = {} + + def __init__(self, *args, **kwargs) -> None: # swallow everything + pass + + def __enter__(self): + return self + + def __exit__(self, *exc) -> None: + return None + + def iter_entries(self, *, since=None, max_entries=None) -> Iterable[ScrapedEntry]: + yield from self.entries + + def download_pdf(self, entry: ScrapedEntry) -> bytes: + return self.pdf_map[entry.pdf_url] + + +@contextmanager +def _patch_scraper_class(source_key: str, fake: type[_FakeScraper]): + with patch( + "opencontractserver.bolivian_laws.scrapers.registry.SCRAPERS", + {source_key: fake}, + ): + yield + + +class TestScrapeAndIngestSource(TestCase): + @classmethod + def setUpTestData(cls): + cls.user = User.objects.create_superuser( + username="bl_scrape_admin", + password="testpass123", + 
email="bl_scrape@test.com", + ) + + def test_ingests_new_entries_and_dedupes_second_run(self): + entries = [ + ScrapedEntry( + source_key=LegalSource.GACETA.value, + pdf_url="http://test.local/ley-1178.pdf", + title="Ley 1178", + external_id="LEY-1178", + published_at=dt.date(1990, 7, 20), + suggested_area=LegalArea.ADMINISTRATIVO, + ), + ScrapedEntry( + source_key=LegalSource.GACETA.value, + pdf_url="http://test.local/ds-29894.pdf", + title="DS 29894", + external_id="DS-29894", + suggested_area=LegalArea.OTROS, + ), + ] + pdf_map = { + "http://test.local/ley-1178.pdf": b"%PDF-ley-1178", + "http://test.local/ds-29894.pdf": b"%PDF-ds-29894", + } + + FakeScraper = type( + "FakeGacetaScraper", + (_FakeScraper,), + {"entries": entries, "pdf_map": pdf_map}, + ) + + with ( + _patch_scraper_class(LegalSource.GACETA.value, FakeScraper), + patch.object( + Corpus, + "import_content", + autospec=True, + side_effect=_fake_import_content, + ), + ): + first = scrape_and_ingest_source.run( + LegalSource.GACETA.value, user_id=self.user.pk + ) + second = scrape_and_ingest_source.run( + LegalSource.GACETA.value, user_id=self.user.pk + ) + + self.assertEqual(first["discovered"], 2) + self.assertEqual(first["ingested"], 2) + self.assertEqual(first["dedupe_hits"], 0) + self.assertEqual(first["failed"], 0) + + self.assertEqual(second["discovered"], 2) + self.assertEqual(second["ingested"], 0) + self.assertEqual(second["dedupe_hits"], 2) + self.assertEqual(BolivianLegalDocument.objects.count(), 2) + + def test_failed_download_counted_and_does_not_abort_batch(self): + bad = ScrapedEntry( + source_key=LegalSource.GACETA.value, + pdf_url="http://test.local/missing.pdf", + title="Missing", + ) + good = ScrapedEntry( + source_key=LegalSource.GACETA.value, + pdf_url="http://test.local/ok.pdf", + title="OK", + suggested_area=LegalArea.OTROS, + ) + + class _PartialScraper(_FakeScraper): + entries = [bad, good] + pdf_map = {"http://test.local/ok.pdf": b"%PDF-ok"} + + def download_pdf(self, 
entry): # type: ignore[override] + if entry.pdf_url.endswith("missing.pdf"): + raise RuntimeError("404 not found") + return self.pdf_map[entry.pdf_url] + + with ( + _patch_scraper_class(LegalSource.GACETA.value, _PartialScraper), + patch.object( + Corpus, + "import_content", + autospec=True, + side_effect=_fake_import_content, + ), + ): + summary = scrape_and_ingest_source.run( + LegalSource.GACETA.value, user_id=self.user.pk + ) + + self.assertEqual(summary["discovered"], 2) + self.assertEqual(summary["ingested"], 1) + self.assertEqual(summary["failed"], 1) + self.assertEqual(BolivianLegalDocument.objects.count(), 1) + + def test_unknown_source_key_raises(self): + with self.assertRaises(KeyError): + scrape_and_ingest_source.run("definitely-not-a-source") + + +class TestScrapeAndIngestAllFanOut(TestCase): + def test_fan_out_enqueues_one_task_per_source(self): + enqueued: list[str] = [] + + class _FakeAsyncResult: + def __init__(self, source: str) -> None: + self.id = f"task-{source}" + + def _fake_delay(source_key, **kwargs): + enqueued.append(source_key) + return _FakeAsyncResult(source_key) + + with patch( + "opencontractserver.bolivian_laws.tasks.scrape_and_ingest_source.delay", + side_effect=_fake_delay, + ): + ids = scrape_and_ingest_all.run() + + self.assertEqual( + sorted(enqueued), + [LegalSource.GACETA.value, LegalSource.TCP.value, LegalSource.TSJ.value], + ) + self.assertEqual(len(ids), 3) diff --git a/opencontractserver/bolivian_laws/tests/test_scrapers.py b/opencontractserver/bolivian_laws/tests/test_scrapers.py new file mode 100644 index 000000000..c569ff7a5 --- /dev/null +++ b/opencontractserver/bolivian_laws/tests/test_scrapers.py @@ -0,0 +1,277 @@ +"""Tests for the Bolivian legal source scrapers. + +The scrapers are driven by ``httpx.MockTransport`` so no real HTTP +requests are made. 
Each source's HTML is a tiny, self-contained fixture +that mimics the structural features the parser relies on (anchors +ending in ``.pdf``, surrounding rows with dates and resolution IDs). +""" + +from __future__ import annotations + +import datetime as dt + +import httpx +from django.test import SimpleTestCase + +from opencontractserver.bolivian_laws.constants import LegalArea, LegalSource +from opencontractserver.bolivian_laws.scrapers import ( + SCRAPERS, + ScrapedEntry, + get_scraper_class, +) +from opencontractserver.bolivian_laws.scrapers.gaceta import GacetaOficialScraper +from opencontractserver.bolivian_laws.scrapers.tcp import ( + TribunalConstitucionalScraper, +) +from opencontractserver.bolivian_laws.scrapers.tsj import ( + TribunalSupremoJusticiaScraper, +) + +GACETA_HTML = """ + + + + + + + + + + + + + + +
Ley N° 1178 SAFCO — 20/07/1990Descargar PDF
Decreto Supremo 29894 — 07/02/2009Descargar PDF
Noticia sin PDFExterno
+ +""" + +TSJ_HTML = """ + +
+ +""" + +TCP_HTML = """ + +
+
+

SCP 0250/2012 — Acción de Amparo Constitucional

+

Publicada el 12/05/2012

+ Ver resolución +
+
+

DCP 0001/2020

+

Acción de Inconstitucionalidad — 03/02/2020

+ Ver resolución +
+ +""" + + +def _mock_transport(html: str) -> httpx.MockTransport: + pdf_bytes = b"%PDF-1.4 fake content" + + def _handler(request: httpx.Request) -> httpx.Response: + if request.url.path.endswith(".pdf"): + return httpx.Response(200, content=pdf_bytes) + return httpx.Response(200, text=html) + + return httpx.MockTransport(_handler) + + +def _make_client(html: str) -> httpx.Client: + return httpx.Client( + transport=_mock_transport(html), + base_url="http://test.local", + follow_redirects=True, + ) + + +class TestGacetaScraper(SimpleTestCase): + def test_extracts_pdf_entries_with_metadata(self): + client = _make_client(GACETA_HTML) + scraper = GacetaOficialScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/",), + ) + entries = list(scraper.iter_entries()) + self.assertEqual(len(entries), 2) + + by_id = {e.external_id: e for e in entries} + self.assertIn("LEY-1178", by_id) + self.assertIn("DS-29894", by_id) + + ley = by_id["LEY-1178"] + self.assertEqual(ley.source_key, LegalSource.GACETA.value) + self.assertEqual(ley.published_at, dt.date(1990, 7, 20)) + # SAFCO is administrative-flavoured + self.assertEqual(ley.suggested_area, LegalArea.ADMINISTRATIVO) + + decreto = by_id["DS-29894"] + self.assertEqual(decreto.published_at, dt.date(2009, 2, 7)) + + def test_skips_entries_older_than_since(self): + client = _make_client(GACETA_HTML) + scraper = GacetaOficialScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/",), + ) + entries = list(scraper.iter_entries(since=dt.date(2000, 1, 1))) + self.assertEqual(len(entries), 1) + self.assertEqual(entries[0].external_id, "DS-29894") + + def test_download_pdf_returns_bytes(self): + client = _make_client(GACETA_HTML) + scraper = GacetaOficialScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/",), + ) + entry = next(iter(scraper.iter_entries())) + pdf = 
scraper.download_pdf(entry) + self.assertTrue(pdf.startswith(b"%PDF")) + + def test_dedupes_pdf_urls_within_run(self): + dup_html = GACETA_HTML + ('Enlace duplicado') + client = _make_client(dup_html) + scraper = GacetaOficialScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/",), + ) + entries = list(scraper.iter_entries()) + urls = [e.pdf_url for e in entries] + self.assertEqual(len(urls), len(set(urls))) + + +class TestTsjScraper(SimpleTestCase): + def test_maps_sala_to_area(self): + client = _make_client(TSJ_HTML) + scraper = TribunalSupremoJusticiaScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/jurisprudencia/",), + ) + entries = list(scraper.iter_entries()) + self.assertEqual(len(entries), 3) + areas = {e.pdf_url.rsplit("/", 1)[-1]: e.suggested_area for e in entries} + self.assertEqual(areas["as-123-2023.pdf"], LegalArea.PENAL) + self.assertEqual(areas["as-456-2022.pdf"], LegalArea.CIVIL) + self.assertEqual(areas["ss-77-2024.pdf"], LegalArea.LABORAL) + + def test_extracts_resolution_number(self): + client = _make_client(TSJ_HTML) + scraper = TribunalSupremoJusticiaScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/jurisprudencia/",), + ) + entries = list(scraper.iter_entries()) + ids = {e.external_id for e in entries} + self.assertIn("AS-123/2023", ids) + + +class TestTcpScraper(SimpleTestCase): + def test_all_entries_routed_to_constitucional(self): + client = _make_client(TCP_HTML) + scraper = TribunalConstitucionalScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/jurisprudencia/",), + ) + entries = list(scraper.iter_entries()) + self.assertEqual(len(entries), 2) + self.assertTrue( + all(e.suggested_area == LegalArea.CONSTITUCIONAL for e in entries) + ) + + def test_extracts_resolution_id_and_accion(self): + client = _make_client(TCP_HTML) + 
scraper = TribunalConstitucionalScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/jurisprudencia/",), + ) + entries = {e.external_id: e for e in scraper.iter_entries()} + self.assertIn("SCP-0250/2012", entries) + self.assertEqual( + entries["SCP-0250/2012"].metadata.get("accion"), + "amparo_constitucional", + ) + self.assertIn("DCP-0001/2020", entries) + self.assertEqual( + entries["DCP-0001/2020"].metadata.get("accion"), + "accion_de_inconstitucionalidad", + ) + + +class TestScraperRegistry(SimpleTestCase): + def test_registry_covers_all_scraped_sources(self): + self.assertEqual( + set(SCRAPERS.keys()), + {LegalSource.GACETA.value, LegalSource.TSJ.value, LegalSource.TCP.value}, + ) + + def test_get_scraper_class_unknown_key_raises(self): + with self.assertRaises(KeyError): + get_scraper_class("no-such-source") + + +class TestBaseScraperDefensiveness(SimpleTestCase): + def test_broken_listing_page_does_not_abort_run(self): + """A 500 response on one page must not break the iterator.""" + + def _handler(request: httpx.Request) -> httpx.Response: + if "broken" in request.url.path: + return httpx.Response(500, text="server error") + if request.url.path.endswith(".pdf"): + return httpx.Response(200, content=b"%PDF-x") + return httpx.Response(200, text=GACETA_HTML) + + client = httpx.Client( + transport=httpx.MockTransport(_handler), + base_url="http://test.local", + follow_redirects=True, + ) + scraper = GacetaOficialScraper( + client=client, + request_delay_seconds=0, + base_url="http://test.local/", + listing_paths=("/broken/", "/ok/"), + ) + entries = list(scraper.iter_entries()) + # The /ok/ listing still yields the two entries from GACETA_HTML + self.assertEqual(len(entries), 2) + + +class TestScrapedEntryShape(SimpleTestCase): + def test_as_dict_serialises_date(self): + entry = ScrapedEntry( + source_key="gaceta", + pdf_url="http://x/y.pdf", + title="t", + published_at=dt.date(2024, 1, 2), + ) + payload = 
entry.as_dict() + self.assertEqual(payload["published_at"], "2024-01-02") diff --git a/requirements/base.txt b/requirements/base.txt index f0429ee84..0f43a6c6f 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -12,6 +12,7 @@ pydantic==2.* typing-extensions==4.* # https://github.com/python/typing_extensions requests==2.* # https://requests.readthedocs.io/en/latest/ httpx>=0.27.0,<1 # https://github.com/encode/httpx - async HTTP for agent tools +beautifulsoup4>=4.12,<5 # HTML parsing for Bolivian legal source scrapers tokenizers>=0.21,<0.23 # Pin to prevent conflicts with transformers diff --git a/scripts/easypanel/configure-traefik.sh b/scripts/easypanel/configure-traefik.sh new file mode 100755 index 000000000..ad50f83fe --- /dev/null +++ b/scripts/easypanel/configure-traefik.sh @@ -0,0 +1,69 @@
#!/usr/bin/env bash
# =============================================================================
# scripts/easypanel/configure-traefik.sh
#
# Patch compose/production/traefik/traefik.yml in-place to use your domain
# and Let's Encrypt contact email. The bundled Traefik config ships with
# the OpenContracts upstream domain — you must replace it before the
# first production deploy.
#
# Usage:
#   ./scripts/easypanel/configure-traefik.sh \
#     --domain oc.example.com \
#     --email you@example.com
#
# Both flags are optional; anything missing is prompted for. Empty values
# are rejected. Exit codes: 0 success, 1 config file missing, 2 bad input.
# =============================================================================

set -euo pipefail

REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
TRAEFIK="${REPO_ROOT}/compose/production/traefik/traefik.yml"

DOMAIN=""
EMAIL=""

# Fail with a readable message when a value-taking flag is the last argument;
# without this, `set -u` aborts with an opaque "unbound variable" for $2.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Flag $1 requires a value" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --domain) require_value "$@"; DOMAIN="$2"; shift 2 ;;
    --email) require_value "$@"; EMAIL="$2"; shift 2 ;;
    -h|--help) sed -n '2,/^# =\{20,\}$/p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
    *) echo "Unknown flag: $1" >&2; exit 2 ;;
  esac
done

if [[ -z "$DOMAIN" ]]; then read -r -p "Public domain: " DOMAIN; fi
if [[ -z "$EMAIL" ]]; then read -r -p "ACME email: " EMAIL; fi

# Pressing Enter at a prompt yields an empty string; patching with it would
# replace the legacy hostname with nothing.
if [[ -z "$DOMAIN" || -z "$EMAIL" ]]; then
  echo "Both a domain and an ACME email are required; aborting." >&2
  exit 2
fi

# The email is written between double quotes in the YAML, so a quote inside
# it would produce an invalid config.
if [[ "$EMAIL" == *\"* ]]; then
  echo "ACME email must not contain a double quote; aborting." >&2
  exit 2
fi

if [[ ! -f "$TRAEFIK" ]]; then
  echo "Traefik config not found at $TRAEFIK" >&2; exit 1
fi

cp "$TRAEFIK" "$TRAEFIK.bak"
echo "Backup written to $TRAEFIK.bak"

python3 - "$TRAEFIK" "$DOMAIN" "$EMAIL" <<'PY'
import pathlib
import re
import sys

LEGACY_HOST = "contracts.opensource.legal"

path, domain, email = sys.argv[1], sys.argv[2], sys.argv[3]
config = pathlib.Path(path)
text = config.read_text()

# ACME email. A callable replacement inserts the address literally; a plain
# string would be treated as a template and have its backslashes interpreted.
text, email_hits = re.subn(
    r'email:\s*"[^"]*"',
    lambda _match: f'email: "{email}"',
    text,
    count=1,
)

# Replace the legacy hostname everywhere (router rules, etc.). Any
# "www."-prefixed occurrence is covered by the same substring replacement.
host_hits = text.count(LEGACY_HOST)
text = text.replace(LEGACY_HOST, domain)

# Neither miss is fatal (a re-run finds nothing left to patch), but say so
# instead of reporting success silently.
if not email_hits:
    print('note: no email: "..." entry found; ACME email left unchanged',
          file=sys.stderr)
if not host_hits:
    print(f"note: {LEGACY_HOST} not found; hostnames left unchanged",
          file=sys.stderr)

config.write_text(text)
PY

echo "Updated $TRAEFIK with:"
echo " domain: $DOMAIN"
echo " email: $EMAIL"
echo
echo "Re-deploy the traefik service for the changes to take effect:"
echo " docker compose -f production.yml build traefik && docker compose -f production.yml up -d traefik"
diff --git a/scripts/easypanel/deploy.sh b/scripts/easypanel/deploy.sh new file mode 100755 index 000000000..ca8c3475d --- /dev/null +++ b/scripts/easypanel/deploy.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# ============================================================================= +# scripts/easypanel/deploy.sh +# +# One-command production deploy. Runs on the EasyPanel host (or any Docker +# host). Asks four questions, generates every secret, patches Traefik, +# brings the stack up, runs migrations, and seeds the Bolivian-laws +# scrape so you can test immediately. +# +# Usage: +# ./scripts/easypanel/deploy.sh +# +# Or fully non-interactive: +# ./scripts/easypanel/deploy.sh \ +# --domain oc.example.com \ +# --email you@example.com \ +# --openai-key sk-... \ +# --admin-password 'StrongPass!' +# +# Re-runs are safe: env files are kept, Traefik is re-patched idempotently, +# Compose just brings any missing services up.
# =============================================================================

set -euo pipefail

REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

# Inputs: flags override environment variables of the same meaning; whatever
# is still empty afterwards is prompted for below.
DOMAIN="${DOMAIN:-}"
EMAIL="${EMAIL:-}"
OPENAI_KEY="${OPENAI_KEY:-}"
ADMIN_PASSWORD="${ADMIN_PASSWORD:-}"
SKIP_SCRAPE_TEST=0

# Fail with a readable message when a value-taking flag is the last argument;
# without this, `set -u` aborts with an opaque "unbound variable" for $2.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Flag $1 requires a value" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --domain) require_value "$@"; DOMAIN="$2"; shift 2 ;;
    --email) require_value "$@"; EMAIL="$2"; shift 2 ;;
    --openai-key) require_value "$@"; OPENAI_KEY="$2"; shift 2 ;;
    --admin-password) require_value "$@"; ADMIN_PASSWORD="$2"; shift 2 ;;
    --skip-scrape-test) SKIP_SCRAPE_TEST=1; shift ;;
    -h|--help)
      sed -n '2,/^# =\{20,\}$/p' "$0" | sed 's/^# \{0,1\}//'
      exit 0 ;;
    *) echo "Unknown flag: $1" >&2; exit 2 ;;
  esac
done

# --- prompts (only ask for what wasn't passed) ---------------------------
if [[ -z "$DOMAIN" ]]; then read -r -p "Public domain (e.g. oc.example.com): " DOMAIN; fi
if [[ -z "$EMAIL" ]]; then read -r -p "Contact / Let's Encrypt email: " EMAIL; fi
if [[ -z "$OPENAI_KEY" ]]; then read -r -s -p "OpenAI API key (sk-...): " OPENAI_KEY; echo; fi
if [[ -z "$ADMIN_PASSWORD" ]]; then read -r -s -p "Initial admin password: " ADMIN_PASSWORD; echo; fi

# A prompt answered with a bare Enter leaves the variable empty; stop here
# rather than generating env files with blank values.
for var in DOMAIN EMAIL OPENAI_KEY ADMIN_PASSWORD; do
  if [[ -z "${!var}" ]]; then
    echo "Missing $var; aborting." >&2; exit 2
  fi
done

# --- step 1: env files ---------------------------------------------------
# NOTE(review): the OpenAI key and admin password travel on the child's
# command line, where other local users can see them in `ps` for the
# duration of the call.
echo
echo "[1/4] Generating .envs/.production/* with strong random secrets..."
"$REPO_ROOT/scripts/easypanel/generate-env.sh" \
  --domain "$DOMAIN" \
  --email "$EMAIL" \
  --openai-key "$OPENAI_KEY" \
  --superuser-password "$ADMIN_PASSWORD"

# --- step 2: Traefik config ----------------------------------------------
echo
echo "[2/4] Patching Traefik with your domain + ACME email..."
"$REPO_ROOT/scripts/easypanel/configure-traefik.sh" \
  --domain "$DOMAIN" \
  --email "$EMAIL"

# --- step 3: bring up + migrate -----------------------------------------
echo
echo "[3/4] Building images and starting the stack (this can take a while)..."
docker compose -f production.yml build
# --exit-code-from makes `up` return the migrate container's own status, so a
# failed migration aborts the deploy instead of being ignored. It implies
# --abort-on-container-exit: the containers started for the migration are
# stopped once it finishes, and `up -d` below starts the full stack.
if ! docker compose -f production.yml --profile migrate up --exit-code-from migrate migrate; then
  echo "Migrations failed; the stack was not started. Check the migrate logs." >&2
  exit 1
fi
docker compose -f production.yml up -d

# --- step 4: smoke-test the Bolivian-laws scrape ------------------------
if [[ "$SKIP_SCRAPE_TEST" -eq 1 ]]; then
  echo
  echo "[4/4] Skipping scrape smoke test (--skip-scrape-test)."
else
  echo
  echo "[4/4] Smoke-testing the Bolivian-laws scrape (--max-entries 3 per source)..."
  # Best effort by design: the upstream sites may be unreachable, and that
  # must not fail an otherwise healthy deploy.
  docker compose -f production.yml exec -T django \
    python manage.py scrape_bolivian_laws --all --since-days 7 --max-entries 3 --sync \
    || echo " (scrape returned non-zero — sites may be unreachable; check logs)"
fi

echo
echo "================================================================"
echo "Deploy complete."
+echo +echo " App: https://$DOMAIN" +echo " Admin: https://$DOMAIN/$(grep DJANGO_ADMIN_URL .envs/.production/.django | cut -d= -f2)" +echo " Flower: https://$DOMAIN:5555" +echo +echo "Useful follow-ups:" +echo " - tail logs: docker compose -f production.yml logs -f django celeryworker celerybeat" +echo " - re-deploy: git pull && docker compose -f production.yml up -d --build" +echo " - migrations: docker compose -f production.yml --profile migrate up migrate" +echo "================================================================" diff --git a/scripts/easypanel/generate-env.sh b/scripts/easypanel/generate-env.sh new file mode 100755 index 000000000..90f0c07a4 --- /dev/null +++ b/scripts/easypanel/generate-env.sh @@ -0,0 +1,169 @@
#!/usr/bin/env bash
# =============================================================================
# scripts/easypanel/generate-env.sh
#
# Bootstrap helper for an EasyPanel "Compose" deploy of OpenContracts
# (Option A in docs/deployment/easypanel.md).
#
# What it does:
#   1. Reads the templates under .envs.example/.production/.
#   2. Generates strong random secrets for the secret placeholders in
#      those templates.
#   3. Asks (or accepts via env vars / flags) for the few values it can't
#      guess: domain, ACME email, OpenAI key, superuser password.
#   4. Writes the filled files to .envs/.production/ (gitignored).
#   5. Prints a summary so you can paste credentials into a vault.
#
# Usage:
#   ./scripts/easypanel/generate-env.sh \
#     --domain oc.example.com \
#     --email you@example.com \
#     --openai-key sk-...
#
# Environment fallbacks, used when the matching flag is absent:
#   DOMAIN, EMAIL, OPENAI_KEY, SUPERUSER_PASSWORD
#
# All flags are optional — the script will prompt for anything missing.
# Re-run safely: existing .envs/.production/* are NEVER overwritten unless
# you pass --force.
# =============================================================================

set -euo pipefail

REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
SRC_DIR="${REPO_ROOT}/.envs.example/.production"
DST_DIR="${REPO_ROOT}/.envs/.production"

# Seed from the environment so the "accepts via env vars" promise in the
# header holds; flags parsed below still take precedence.
DOMAIN="${DOMAIN:-}"
EMAIL="${EMAIL:-}"
OPENAI_KEY="${OPENAI_KEY:-}"
SUPERUSER_PASSWORD="${SUPERUSER_PASSWORD:-}"
FORCE=0

# Print the header comment of this file (minus the leading "# ") and exit.
usage() {
  sed -n '2,/^# =\{20,\}$/p' "$0" | sed 's/^# \{0,1\}//'
  exit 0
}

# Fail with a readable message when a value-taking flag is the last argument;
# without this, `set -u` aborts with an opaque "unbound variable" for $2.
require_value() {
  if [[ $# -lt 2 ]]; then
    echo "Flag $1 requires a value" >&2
    exit 2
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --domain) require_value "$@"; DOMAIN="$2"; shift 2 ;;
    --email) require_value "$@"; EMAIL="$2"; shift 2 ;;
    --openai-key) require_value "$@"; OPENAI_KEY="$2"; shift 2 ;;
    --superuser-password) require_value "$@"; SUPERUSER_PASSWORD="$2"; shift 2 ;;
    --force) FORCE=1; shift ;;
    -h|--help) usage ;;
    *) echo "Unknown flag: $1" >&2; exit 2 ;;
  esac
done

#######################################
# Ask for a value unless the named global already holds one.
# Arguments:
#   $1 - name of the global variable to fill
#   $2 - prompt text (": " is appended)
#   $3 - "1" to hide the typed input (secrets); defaults to 0
# Outputs:
#   The prompt on the terminal; an error on stderr for an empty answer.
# Returns:
#   0 when the variable is set; exits 2 when the answer is empty.
#######################################
prompt() {
  local var_name="$1"
  local message="$2"
  local secret="${3:-0}"
  local current="${!var_name}"
  local value=""  # keep the answer out of global scope
  if [[ -n "$current" ]]; then return; fi
  if [[ "$secret" == "1" ]]; then
    # -s suppresses echo, so emit the newline the terminal never printed.
    read -r -s -p "$message: " value; echo
  else
    read -r -p "$message: " value
  fi
  if [[ -z "$value" ]]; then
    echo "Empty value for $var_name; aborting." >&2; exit 2
  fi
  # printf -v assigns to the variable named by $var_name without eval.
  printf -v "$var_name" '%s' "$value"
}

if [[ ! -d "$SRC_DIR" ]]; then
  echo "Templates not found at $SRC_DIR" >&2; exit 1
fi

prompt DOMAIN "Public domain (e.g. oc.example.com)"
prompt EMAIL "Contact / Let's Encrypt email"
prompt OPENAI_KEY "OpenAI API key (sk-...)" 1
prompt SUPERUSER_PASSWORD "Initial Django superuser password" 1

# --- secret generation ----------------------------------------------------
# Use Python to keep this portable across BSD/GNU grep variants.
# gen N: print a URL-safe random token built from N random bytes.
gen() { python3 -c "import secrets;print(secrets.token_urlsafe($1))"; }
# gen_admin_slug: print a 30-character random ASCII-alphanumeric slug.
gen_admin_slug() { python3 -c "import secrets,string;print(''.join(secrets.choice(string.ascii_letters+string.digits) for _ in range(30)))"; }

DJANGO_SECRET_KEY="$(gen 64)"
DJANGO_ADMIN_SLUG="$(gen_admin_slug)"
FLOWER_USER="$(gen 16)"
FLOWER_PASSWORD="$(gen 32)"
VECTOR_EMBEDDER_API_KEY="$(gen 24)"
POSTGRES_PASSWORD="$(gen 32)"

mkdir -p "$DST_DIR"

#######################################
# Copy a template into place unless the destination already exists.
# Globals:
#   FORCE (read) - 1 overwrites an existing destination
# Arguments:
#   $1 - template path
#   $2 - destination path
# Outputs:
#   "WROTE <dst>" or "SKIP <dst> ..." on stdout; a note on stderr on skip.
# Returns:
#   0 on success or skip; non-zero when the copy or chmod fails.
#######################################
write_or_skip() {
  local src="$1" dst="$2"
  if [[ -e "$dst" && "$FORCE" -ne 1 ]]; then
    echo "SKIP $dst (already exists; use --force to overwrite)"
    # The secrets generated above are not in a kept file; only placeholders
    # still present in it can be filled with them.
    echo "  note: $dst was kept, so secrets generated in this run may not match it" >&2
    return
  fi
  cp -- "$src" "$dst" || return
  # The file is about to hold secrets: restrict it to the owner regardless
  # of the caller's umask or the template's mode.
  chmod 600 -- "$dst" || return
  echo "WROTE $dst"
}

write_or_skip "$SRC_DIR/.django" "$DST_DIR/.django"
write_or_skip "$SRC_DIR/.postgres" "$DST_DIR/.postgres"
write_or_skip "$SRC_DIR/.frontend" "$DST_DIR/.frontend"

# --- substitution --------------------------------------------------------
# Use Python for cross-platform sed semantics.
#######################################
# Replace every occurrence of a literal placeholder in a file.
# Arguments:
#   $1 - file to edit in place
#   $2 - literal placeholder text (must be non-empty)
#   $3 - replacement value
# Returns:
#   0 when replaced or when the placeholder is absent (already substituted);
#   2 when the placeholder is empty.
#######################################
substitute() {
  local file="$1" pattern="$2" value="$3"
  # An empty placeholder would match between every character and splice the
  # value throughout the file.
  if [[ -z "$pattern" ]]; then
    echo "substitute: refusing empty placeholder for $file" >&2
    return 2
  fi
  # The value goes through the environment, not argv, so a secret does not
  # show up in the process list.
  SUBSTITUTE_VALUE="$value" python3 - "$file" "$pattern" <<'PY'
import os
import pathlib
import sys

target = pathlib.Path(sys.argv[1])
placeholder = sys.argv[2]
replacement = os.environ["SUBSTITUTE_VALUE"]
text = target.read_text()
if placeholder in text:  # absent means already substituted
    target.write_text(text.replace(placeholder, replacement))
PY
}

# .django substitutions — every placeholder is unique so order is irrelevant.
+DJANGO="$DST_DIR/.django" +substitute "$DJANGO" "" "$DJANGO_SECRET_KEY" +substitute "$DJANGO" "" "$DJANGO_ADMIN_SLUG" +substitute "$DJANGO" "" "$DOMAIN" +substitute "$DJANGO" "" "$EMAIL" +substitute "$DJANGO" "" "$SUPERUSER_PASSWORD" +substitute "$DJANGO" "" "$FLOWER_USER" +substitute "$DJANGO" "" "$FLOWER_PASSWORD" +substitute "$DJANGO" "" "$OPENAI_KEY" +substitute "$DJANGO" "" "$VECTOR_EMBEDDER_API_KEY" + +# .postgres substitutions (POSTGRES_PASSWORD appears twice; replace both) +PG="$DST_DIR/.postgres" +python3 - "$PG" "$POSTGRES_PASSWORD" <<'PY' +import sys, pathlib +path, val = sys.argv[1], sys.argv[2] +p = pathlib.Path(path) +text = p.read_text() +text = text.replace("", val) +text = text.replace("", val) +p.write_text(text) +PY + +# .frontend substitutions +FE="$DST_DIR/.frontend" +substitute "$FE" "" "$DOMAIN" + +echo +echo "=== Done. Generated env files in $DST_DIR ===" +echo +echo "Save these credentials in a password manager:" +echo " Domain: $DOMAIN" +echo " Admin URL: https://$DOMAIN/admin/$DJANGO_ADMIN_SLUG/" +echo " Superuser: admin / (the password you provided)" +echo " Flower user: $FLOWER_USER" +echo " Flower password: $FLOWER_PASSWORD" +echo " Postgres password: $POSTGRES_PASSWORD" +echo +echo "Next steps:" +echo " 1. ./scripts/easypanel/configure-traefik.sh --domain $DOMAIN --email $EMAIL" +echo " 2. Commit nothing in .envs/.production/ — it is gitignored." +echo " 3. 
In EasyPanel: deploy this repo with production.yml, then run:" +echo " docker compose -f production.yml --profile migrate up migrate" diff --git a/scripts/easypanel/print-env.sh b/scripts/easypanel/print-env.sh new file mode 100755 index 000000000..a03b4ad0c --- /dev/null +++ b/scripts/easypanel/print-env.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# ============================================================================= +# scripts/easypanel/print-env.sh +# +# Generates every random secret the EasyPanel-native deploy needs +# (easypanel.yml) and prints them in the format EasyPanel's "Environment" +# section expects. Copy the output, paste it into the EasyPanel app's +# Environment tab, then fill in DOMAIN / ADMIN_EMAIL / ADMIN_PASSWORD / +# OPENAI_API_KEY manually (those can't be auto-generated). +# +# Usage: +# ./scripts/easypanel/print-env.sh +# +# Or pre-fill the user-provided values: +# ./scripts/easypanel/print-env.sh \ +# --domain oc.example.com \ +# --email you@example.com \ +# --openai-key sk-... \ +# --admin-password 'StrongPass!' +# ============================================================================= + +set -euo pipefail + +DOMAIN="${DOMAIN:-}" +EMAIL="${ADMIN_EMAIL:-}" +OPENAI_KEY="${OPENAI_API_KEY:-}" +ADMIN_PASSWORD="${ADMIN_PASSWORD:-}" + +while [[ $# -gt 0 ]]; do + case "$1" in + --domain) DOMAIN="$2"; shift 2 ;; + --email) EMAIL="$2"; shift 2 ;; + --openai-key) OPENAI_KEY="$2"; shift 2 ;; + --admin-password) ADMIN_PASSWORD="$2"; shift 2 ;; + -h|--help) sed -n '2,/^# =\{20,\}$/p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;; + *) echo "Unknown flag: $1" >&2; exit 2 ;; + esac +done + +gen() { python3 -c "import secrets;print(secrets.token_urlsafe($1))"; } +gen_slug() { python3 -c "import secrets,string;print(''.join(secrets.choice(string.ascii_letters+string.digits) for _ in range($1)))"; } + +cat <