diff --git a/Justfile b/Justfile index 0c6b437..3d52c6b 100644 --- a/Justfile +++ b/Justfile @@ -76,10 +76,11 @@ vocab-canon: # ── Corpus / training pipeline ───────────────────────────────── # -# Three steps in order: provision upstream mirrors, run every extractor, -# then retrain. Each step is idempotent; re-running only touches what -# has changed. See scripts/provision_corpora.sh --list for the source -# catalogue. +# Five steps in order: provision upstream mirrors, run every extractor, +# merge per-prover proof_states into UNIFIED, align the premise files +# to UNIFIED's fresh ids, then retrain. Each step is idempotent; +# re-running only touches what has changed. See +# scripts/provision_corpora.sh --list for the source catalogue. # Clone every upstream prover corpus into external_corpora/. # Pass specific names to provision a subset: `just provision-corpora metamath mathlib4`. @@ -99,14 +100,28 @@ corpora-status: extract-corpora *NAMES: scripts/extract_all.sh {{NAMES}} +# Merge per-prover proof_states_*.jsonl into proof_states_UNIFIED.jsonl +# with fresh sequential ids (dedupes by prover+theorem). +merge-corpora: + julia --project=src/julia scripts/merge_corpus.jl + +# Rebuild premises_COMPLETE.jsonl with proof_ids that match UNIFIED. +# merge-corpora rewrites every proof_state id to a fresh sequential +# counter; the premise files keep the original extractor ids, so this +# step re-joins premises to UNIFIED via (prover, theorem) — the durable +# key merge_corpus.jl already dedupes on. Without this step the +# dataloader's proof_id join matches ~0% of records. +align-premises: + julia --project=src/julia scripts/align_premises.jl + # Full retrain from provisioned corpora. Honours ECHIDNA_MAX_PROOF_STATES # (0 = unlimited), ECHIDNA_NUM_EPOCHS, ECHIDNA_NUM_NEGATIVES. retrain: julia --project=src/julia src/julia/run_training.jl -# End-to-end pipeline: provision → extract → retrain. +# End-to-end pipeline: provision → extract → merge → align → retrain. # Use `ECHIDNA_MAX_PROOF_STATES=0 just corpus-refresh` to lift the sample cap. -corpus-refresh: provision-corpora extract-corpora retrain +corpus-refresh: provision-corpora extract-corpora merge-corpora align-premises retrain # Run the eight-axis metrics suite against the current corpus and post # results to VeriSimDB. Falls back to training_data/metrics_.jsonl diff --git a/scripts/align_premises.jl b/scripts/align_premises.jl new file mode 100755 index 0000000..4620329 --- /dev/null +++ b/scripts/align_premises.jl @@ -0,0 +1,249 @@ +#!/usr/bin/env julia +# SPDX-License-Identifier: PMPL-1.0-or-later +# SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell (hyperpolymath) +# +# align_premises.jl — rebuild training_data/premises_COMPLETE.jsonl with proof_ids +# that match the fresh sequential ids written by merge_corpus.jl into +# training_data/proof_states_UNIFIED.jsonl. +# +# Background +# ---------- +# The per-prover extractors (scripts/extract_*.jl and scripts/max_extract_*.jl) +# emit proof_states and premises sharing a per-extractor-local integer id space: +# +# {"id": 60123, "prover": "Lean", "theorem": "Foo.bar", ...} (proof_state) +# {"proof_id": 60123, "prover": "Lean", "theorem": "Foo.bar", ...} (premise) +# +# scripts/merge_corpus.jl then writes proof_states_UNIFIED.jsonl after dedup +# by (prover, theorem) and **rewrites every proof_state's id to a fresh +# sequential counter (1..N)**. The premise files retain the original +# extractor ids, so proof_state.id and premise.proof_id no longer share an id +# space — the dataloader's join by proof_id matches ~0% of records. +# +# This script rebuilds premises_COMPLETE.jsonl using the +# (canonical_prover, theorem) pair as the durable join key (which merge_corpus +# already guarantees is unique within UNIFIED). For every premise record, +# we look up the fresh id assigned by merge_corpus and rewrite +# `proof_id := UNIFIED_id`. Records whose (prover, theorem) does not appear +# in UNIFIED are dropped and counted. +# +# Usage +# ----- +# julia --project=src/julia scripts/align_premises.jl +# +# Outputs +# ------- +# training_data/premises_COMPLETE.jsonl — aligned premises +# training_data/premises_ALIGNED_stats.json — coverage report +# +# Run this after merge_corpus.jl and before run_training.jl. + +using JSON3 + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +const REPO_ROOT = dirname(dirname(abspath(@__FILE__))) +const TRAINING_DIR = joinpath(REPO_ROOT, "training_data") + +const UNIFIED_PATH = joinpath(TRAINING_DIR, "proof_states_UNIFIED.jsonl") +const OUT_PATH = joinpath(TRAINING_DIR, "premises_COMPLETE.jsonl") +const STATS_PATH = joinpath(TRAINING_DIR, "premises_ALIGNED_stats.json") + +# Every premises_*.jsonl in training_data/ is a candidate source, except the +# output file itself and any ALIGNED_stats JSON we emit. We also skip the +# timestamped `premises_.jsonl` snapshots by default but include +# them when ECHIDNA_ALIGN_INCLUDE_DATED=1. +function collect_premise_sources() + sources = String[] + for f in readdir(TRAINING_DIR; join=true) + base = basename(f) + startswith(base, "premises_") || continue + endswith(base, ".jsonl") || continue + base == "premises_COMPLETE.jsonl" && continue + if get(ENV, "ECHIDNA_ALIGN_INCLUDE_DATED", "0") != "1" && + occursin(r"^premises_\d{4}-\d{2}-\d{2}\.jsonl$", base) + continue + end + push!(sources, f) + end + sort!(sources) + return sources +end + +# --------------------------------------------------------------------------- +# Prover-name canonicalisation — must match merge_corpus.jl's mapping. +# Keep this in lockstep with scripts/merge_corpus.jl's PROVER_CANONICAL. +# --------------------------------------------------------------------------- + +const PROVER_CANONICAL = Dict( + "Lean4" => "Lean", + "Rocq" => "Coq", + "Fstar" => "F*", + "CVC4" => "CVC5", + "AltErgo" => "Alt-Ergo", + "HOL Light" => "HOLLight", + "E Prover" => "EProver", + "OR-Tools" => "ORTools", +) + +canon(prover::AbstractString) = get(PROVER_CANONICAL, String(prover), String(prover)) + +# --------------------------------------------------------------------------- +# Build the (canonical_prover, theorem) → fresh_id index from UNIFIED. +# --------------------------------------------------------------------------- + +function build_unified_index(path::String) + isfile(path) || error("missing $(path); run scripts/merge_corpus.jl first.") + index = Dict{Tuple{String,String},Int}() + open(path, "r") do fh + for line in eachline(fh) + isempty(strip(line)) && continue + entry = try + JSON3.read(line, Dict{String,Any}) + catch + continue + end + prover = canon(get(entry, "prover", "")) + theorem = String(get(entry, "theorem", "")) + new_id = get(entry, "id", nothing) + (isempty(prover) || isempty(theorem) || new_id === nothing) && continue + key = (prover, theorem) + # First-write wins. merge_corpus already dedupes by (prover,theorem) + # so collisions here would indicate a merge bug — report and keep + # the earlier id. + if haskey(index, key) && index[key] != new_id + @warn "duplicate (prover,theorem) in UNIFIED" prover=prover theorem=theorem first=index[key] second=new_id + continue + end + index[key] = Int(new_id) + end + end + return index +end + +# --------------------------------------------------------------------------- +# Walk every premise source, rewrite proof_id, and emit premises_COMPLETE. +# --------------------------------------------------------------------------- + +function align_premises(index::Dict{Tuple{String,String},Int}, sources::Vector{String}) + written = 0 + dropped_no_theorem = 0 + dropped_no_match = 0 + dropped_malformed = 0 + per_source = Dict{String,Dict{String,Int}}() + + open(OUT_PATH, "w") do out + for src in sources + s_written = 0 + s_dropped = 0 + open(src, "r") do fh + for line in eachline(fh) + isempty(strip(line)) && continue + rec = try + JSON3.read(line, Dict{String,Any}) + catch + dropped_malformed += 1 + s_dropped += 1 + continue + end + prover = canon(get(rec, "prover", "")) + theorem = String(get(rec, "theorem", "")) + if isempty(prover) || isempty(theorem) + dropped_no_theorem += 1 + s_dropped += 1 + continue + end + new_id = get(index, (prover, theorem), nothing) + if new_id === nothing + dropped_no_match += 1 + s_dropped += 1 + continue + end + rec["proof_id"] = new_id + println(out, JSON3.write(rec)) + written += 1 + s_written += 1 + end + end + per_source[basename(src)] = Dict( + "written" => s_written, + "dropped" => s_dropped, + ) + end + end + + return ( + written = written, + dropped_no_theorem = dropped_no_theorem, + dropped_no_match = dropped_no_match, + dropped_malformed = dropped_malformed, + per_source = per_source, + ) +end + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- + +function main() + println("align_premises — reconcile premise proof_ids with proof_states_UNIFIED") + println("=" ^ 72) + + println("\nBuilding (prover, theorem) → fresh_id index from UNIFIED...") + index = build_unified_index(UNIFIED_PATH) + println(" $(length(index)) (prover, theorem) pairs loaded.") + + sources = collect_premise_sources() + println("\nPremise sources ($(length(sources)) files):") + for s in sources + println(" $(basename(s))") + end + + println("\nAligning...") + result = align_premises(index, sources) + + total_in = result.written + result.dropped_no_theorem + + result.dropped_no_match + result.dropped_malformed + match_rate = total_in == 0 ? 0.0 : 100.0 * result.written / total_in + + println("\nResults") + println("-" ^ 72) + println(" Wrote : $(result.written) records to $(OUT_PATH)") + println(" Dropped (no key) : $(result.dropped_no_theorem) (missing prover or theorem)") + println(" Dropped (no match): $(result.dropped_no_match) (theorem not in UNIFIED)") + println(" Dropped (bad json): $(result.dropped_malformed)") + println(" Match rate : $(round(match_rate; digits=2))%") + + println("\nPer source:") + for (name, counts) in sort(collect(result.per_source); by = p -> p[1]) + println(" $(rpad(name, 40)) written=$(counts["written"]) dropped=$(counts["dropped"])") + end + + # Persist a machine-readable stats file alongside the output. + stats = Dict( + "unified_path" => UNIFIED_PATH, + "output_path" => OUT_PATH, + "unified_index_size" => length(index), + "sources" => [basename(s) for s in sources], + "written" => result.written, + "dropped_no_theorem" => result.dropped_no_theorem, + "dropped_no_match" => result.dropped_no_match, + "dropped_malformed" => result.dropped_malformed, + "match_rate_percent" => match_rate, + "per_source" => result.per_source, + ) + open(STATS_PATH, "w") do fh + JSON3.pretty(fh, stats) + end + println("\nStats written to $(STATS_PATH)") + + if result.written == 0 + println("\nERROR: 0 records written. Check that the premise files carry 'prover' and 'theorem' fields.") + exit(2) + end + return 0 +end + +main()