From b62bcba8836eb0a71f5480a7901629041ed72450 Mon Sep 17 00:00:00 2001
From: Hongyu Wang
Date: Sun, 3 May 2026 19:04:10 +0800
Subject: [PATCH 1/2] Add LegalBenchPro CI and oversight framing

---
 .github/workflows/ci.yml | 35 +++++++++++++++++++++++++++++++++++
 README.md                | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 pyproject.toml           | 21 +++++++++++++++++++++
 requirements.txt         |  1 +
 4 files changed, 110 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/ci.yml
 create mode 100644 pyproject.toml

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..2aeb97c
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,35 @@
+name: CI
+
+on:
+  push:
+  pull_request:
+
+jobs:
+  test:
+    name: Python tests
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.11", "3.12"]
+
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install -r requirements.txt
+
+      - name: Run tests
+        run: python -m pytest -q
+
+      - name: Compile Python sources
+        run: python -m compileall scripts src

diff --git a/README.md b/README.md
index 2e9ad70..8cfa792 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,54 @@ model outputs are treated as evidence to be validated rather than accepted, and
 decisions are documented through schemas, rubrics, provenance notes, and
 rerunnable scripts.
 
+## Relevance To Human-AI Oversight
+
+Although the current public snapshot focuses on legal and institutional text,
+LegalBenchPro is also a compact example of the research infrastructure needed for
+human-AI oversight work. The project turns domain-heavy tasks into auditable model
+evaluation artifacts, then separates model outputs, scoring rubrics, reviewer-facing
+protocols, and validation samples so that human judgments can be compared against AI
+judgments rather than hidden inside a single aggregate score; see the sketch below.
+
+The pieces most relevant to human-AI complementarity and scalable oversight are:
+
+- **annotation protocol design:** task instructions, score anchors, and reviewer notes
+  are documented in `docs/ANNOTATION_PROTOCOL.md` and `docs/SCORING_RUBRIC.md`;
+- **human validation staging:** pilot rows are selected for expert review before
+  benchmark-level claims are made;
+- **audit trails:** AI-assisted coding, scoring, and release decisions are separated in
+  `docs/AI_WORKFLOW.md`, metadata files, and reproducible scripts;
+- **model failure analysis:** the workflow tracks answer consistency, factual grounding,
+  citation relevance, and cross-setting transfer failures across model configurations;
+- **reproducible collaboration:** public samples, metadata, tests, and manuscript-status
+  notes make it easier for collaborators to inspect what is complete, what is private,
+  and what still needs validation.
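+
+The sketch below makes that human-versus-model comparison concrete. It is
+illustrative only, not code from the private pipeline; the row fields and
+labels are invented. The point is that per-item agreement stays inspectable:
+
+```python
+# Hypothetical sketch: tally per-item agreement between expert and model labels.
+from collections import Counter
+
+def agreement_report(rows):
+    """rows: iterable of (item_id, human_label, model_label) tuples."""
+    outcomes = Counter()
+    disagreements = []
+    for item_id, human_label, model_label in rows:
+        if human_label == model_label:
+            outcomes["agree"] += 1
+        else:
+            outcomes["disagree"] += 1
+            disagreements.append((item_id, human_label, model_label))
+    return outcomes, disagreements
+
+pilot = [("q1", "supported", "supported"), ("q2", "unsupported", "supported")]
+counts, flagged = agreement_report(pilot)
+print(dict(counts), flagged)
+```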
+
 ## Where To Start
 
 For a quick review of the project, start with:
@@ -203,19 +251,20 @@ The repository includes a small test suite:
 macOS/Linux:
 
 ```bash
-export PYTHONPATH="$PWD/src"
-python -m unittest discover -s tests
+python -m pytest -q
 python -m compileall scripts src
 ```
 
 Windows PowerShell:
 
 ```powershell
-$env:PYTHONPATH = "$PWD\src"
-python -m unittest discover -s tests
+python -m pytest -q
 python -m compileall scripts src
 ```
 
+The `pyproject.toml` test configuration points pytest at the `src/` package layout, so
+manual `PYTHONPATH` setup is not required for local validation.
+
 ## Research Software Signals
 
 This repository is intentionally organized as a research-engineering artifact, not only
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b0ec2fd
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["setuptools>=68"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "legalbenchpro"
+version = "0.1.0"
+description = "Reproducible public utilities for the LegalBenchPro benchmark preview."
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "matplotlib>=3.8",
+    "openpyxl>=3.1.2",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.pytest.ini_options]
+pythonpath = ["src"]
+testpaths = ["tests"]
diff --git a/requirements.txt b/requirements.txt
index e16b59c..f056fcc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 matplotlib>=3.8
 openpyxl>=3.1.2
+pytest>=8

From 094a3a575b7c248ee8fb00cf16a731bcb1ce1704 Mon Sep 17 00:00:00 2001
From: Hongyu Wang
Date: Sun, 3 May 2026 19:17:59 +0800
Subject: [PATCH 2/2] Update LegalBenchPro status date

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8cfa792..b55a969 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ The project asks two questions in parallel:
 2. What does a defensible, auditable AI-assisted evaluation pipeline look like for
    legal and institutional text research?
 
-**Status (as of April 2026):** manuscript draft in preparation; 20,768 LLM response
+**Status (as of May 3, 2026):** manuscript draft in preparation; 20,768 LLM response
 cells collected across 22 model configurations; human-validation pilot underway; full
 data release pending licensing, privacy, and source-distribution review.
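
A note for anyone trying the new test setup locally: because
`[tool.pytest.ini_options]` sets `pythonpath = ["src"]`, pytest adds `src/` to
`sys.path` before collecting tests, so the src-layout package imports without a
manual `PYTHONPATH` export. The sketch below illustrates the effect; the test
file name and the assumption that the package directory under `src/` is named
`legalbenchpro` are hypothetical, not part of this patch series.

```python
# tests/test_import_smoke.py -- hypothetical sketch, not included in the patch.
# pytest's pythonpath = ["src"] setting in pyproject.toml adds src/ to sys.path,
# so a src-layout package resolves without exporting PYTHONPATH first.
import importlib


def test_package_is_importable():
    # Assumes the package directory under src/ is named "legalbenchpro",
    # matching the project name declared in pyproject.toml.
    module = importlib.import_module("legalbenchpro")
    assert module is not None
```

Run it from the repository root with `python -m pytest -q`, matching both the
updated README instructions and the CI `Run tests` step.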