From b62bcba8836eb0a71f5480a7901629041ed72450 Mon Sep 17 00:00:00 2001
From: Hongyu Wang
Date: Sun, 3 May 2026 19:04:10 +0800
Subject: [PATCH 1/2] Add LegalBenchPro CI and oversight framing

---
 .github/workflows/ci.yml | 35 +++++++++++++++++++++++++++++++++++
 README.md                | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 pyproject.toml           | 21 +++++++++++++++++++++
 requirements.txt         |  1 +
 4 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 pyproject.toml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2aeb97c
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,35 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test:
+    name: Python tests
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.11", "3.12"]
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r requirements.txt
+
+      - name: Run tests
+        run: python -m pytest -q
+
+      - name: Compile Python sources
+        run: python -m compileall scripts src

diff --git a/README.md b/README.md
index 2e9ad70..8cfa792 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,54 @@ model outputs are treated as evidence to be validated rather than accepted, and
 decisions are documented through schemas, rubrics, provenance notes, and
 rerunnable scripts.
 
+## Relevance To Human-AI Oversight
+
+Although the current public snapshot focuses on legal and institutional text,
+LegalBenchPro is also a compact example of the research infrastructure needed for
+human-AI oversight work. The project turns domain-heavy tasks into auditable model
+evaluation artifacts, then separates model outputs, scoring rubrics, reviewer-facing
+protocols, and validation samples so that human judgments can be compared against AI
+judgments rather than hidden inside a single aggregate score; see the sketch below.
+
+The pieces most relevant to human-AI complementarity and scalable oversight are:
+
+- **annotation protocol design:** task instructions, score anchors, and reviewer notes
+  are documented in `docs/ANNOTATION_PROTOCOL.md` and `docs/SCORING_RUBRIC.md`;
+- **human validation staging:** pilot rows are selected for expert review before
+  benchmark-level claims are made;
+- **audit trails:** AI-assisted coding, scoring, and release decisions are separated in
+  `docs/AI_WORKFLOW.md`, metadata files, and reproducible scripts;
+- **model failure analysis:** the workflow tracks answer consistency, factual grounding,
+  citation relevance, and cross-setting transfer failures across model configurations;
+- **reproducible collaboration:** public samples, metadata, tests, and manuscript-status
+  notes make it easier for collaborators to inspect what is complete, what is private,
+  and what still needs validation.
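+
+The sketch below makes that human-versus-model comparison concrete. It is
+illustrative only, not code from the private pipeline; the row fields and
+labels are invented. The point is that per-item agreement stays inspectable:
+
+```python
+# Hypothetical sketch: tally per-item agreement between expert and model labels.
+from collections import Counter
+
+def agreement_report(rows):
+    """rows: iterable of (item_id, human_label, model_label) tuples."""
+    outcomes = Counter()
+    disagreements = []
+    for item_id, human_label, model_label in rows:
+        if human_label == model_label:
+            outcomes["agree"] += 1
+        else:
+            outcomes["disagree"] += 1
+            disagreements.append((item_id, human_label, model_label))
+    return outcomes, disagreements
+
+pilot = [("q1", "supported", "supported"), ("q2", "unsupported", "supported")]
+counts, flagged = agreement_report(pilot)
+print(dict(counts), flagged)
+```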
+
 ## Where To Start
 
 For a quick review of the project, start with:
@@ -203,19 +251,20 @@ The repository includes a small test suite:
 macOS/Linux:
 
 ```bash
-export PYTHONPATH="$PWD/src"
-python -m unittest discover -s tests
+python -m pytest -q
 python -m compileall scripts src
 ```
 
 Windows PowerShell:
 
 ```powershell
-$env:PYTHONPATH = "$PWD\src"
-python -m unittest discover -s tests
+python -m pytest -q
 python -m compileall scripts src
 ```
 
+The `pyproject.toml` test configuration points pytest at the `src/` package layout, so
+manual `PYTHONPATH` setup is not required for local validation.
+
 ## Research Software Signals
 
 This repository is intentionally organized as a research-engineering artifact, not only
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b0ec2fd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "legalbenchpro"
+version = "0.1.0"
+description = "Reproducible public utilities for the LegalBenchPro benchmark preview."
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "matplotlib>=3.8",
+    "openpyxl>=3.1.2",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+testpaths = ["tests"]
diff --git a/requirements.txt b/requirements.txt
index e16b59c..f056fcc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 matplotlib>=3.8
 openpyxl>=3.1.2
+pytest>=8

From 094a3a575b7c248ee8fb00cf16a731bcb1ce1704 Mon Sep 17 00:00:00 2001
From: Hongyu Wang
Date: Sun, 3 May 2026 19:17:59 +0800
Subject: [PATCH 2/2] Update LegalBenchPro status date

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8cfa792..b55a969 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ The project asks two questions in parallel:
 2. What does a defensible, auditable AI-assisted evaluation pipeline look like for
    legal and institutional text research?
 
-**Status (as of April 2026):** manuscript draft in preparation; 20,768 LLM response
+**Status (as of May 3, 2026):** manuscript draft in preparation; 20,768 LLM response
 cells collected across 22 model configurations; human-validation pilot underway; full
 data release pending licensing, privacy, and source-distribution review.
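
A note for anyone trying the new test setup locally: because
`[tool.pytest.ini_options]` sets `pythonpath = ["src"]`, pytest adds `src/` to
`sys.path` before collecting tests, so the src-layout package imports without a
manual `PYTHONPATH` export. The sketch below illustrates the effect; the test
file name and the assumption that the package directory under `src/` is named
`legalbenchpro` are hypothetical, not part of this patch series.

```python
# tests/test_import_smoke.py -- hypothetical sketch, not included in the patch.
# pytest's pythonpath = ["src"] setting in pyproject.toml adds src/ to sys.path,
# so a src-layout package resolves without exporting PYTHONPATH first.
import importlib


def test_package_is_importable():
    # Assumes the package directory under src/ is named "legalbenchpro",
    # matching the project name declared in pyproject.toml.
    module = importlib.import_module("legalbenchpro")
    assert module is not None
```

Run it from the repository root with `python -m pytest -q`, matching both the
updated README instructions and the CI `Run tests` step.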