diff --git a/benches/routing_benchmarks.rs b/benches/routing_benchmarks.rs
index 0ad1e89..8589e91 100644
--- a/benches/routing_benchmarks.rs
+++ b/benches/routing_benchmarks.rs
@@ -305,6 +305,7 @@ fn bench_dispatch_config_construction(c: &mut Criterion) {
                 track_axioms: true,
                 generate_certificates: true,
                 timeout: 60,
+                diagnostics: false,
             })
         })
     });
diff --git a/src/julia/run_training.jl b/src/julia/run_training.jl
index 7c38652..93c8f54 100644
--- a/src/julia/run_training.jl
+++ b/src/julia/run_training.jl
@@ -10,6 +10,14 @@
 #   data_dir = training_data/
 #   save_dir = models/neural/
 #
+# Environment overrides (see below):
+#   ECHIDNA_MAX_PROOF_STATES — cap on proof states loaded (default 200000 on
+#     GPU, 50000 on CPU). Set to 0 to disable the cap and consume the entire
+#     expanded corpus; required when re-baselining after corpus growth.
+#   ECHIDNA_NUM_EPOCHS — training epochs (default 30).
+#   ECHIDNA_NUM_NEGATIVES — hard-negative premise samples per example
+#     (default 20).
+#
 # This script:
 #   1. Loads JSONL training data (proof states + premises)
 #   2. Builds vocabulary from the corpus
@@ -74,10 +82,24 @@
 println("═══════════════════════════════════════════════════════════")
 println("Loading training data...")
 println("═══════════════════════════════════════════════════════════")
+# Default cap: 200k on GPU (enough to exercise the expanded corpus without
+# OOM on a 24GB card), 50k on CPU (keeps wall-clock finite). An operator
+# re-baselining after corpus growth sets ECHIDNA_MAX_PROOF_STATES=0 to lift
+# the cap entirely.
+default_cap = has_gpu ? 200_000 : 50_000
+cap_env = get(ENV, "ECHIDNA_MAX_PROOF_STATES", "")
+max_proof_states = isempty(cap_env) ? default_cap : parse(Int, cap_env)
+# `load_training_data` treats any value `<= 0` as "load everything".
+cap_label = max_proof_states <= 0 ? "unlimited" : string(max_proof_states)
+println("  max_proof_states = $cap_label")
+
+num_negatives = parse(Int, get(ENV, "ECHIDNA_NUM_NEGATIVES", "20"))
+println("  num_negatives = $num_negatives")
+
 train_data, val_data, vocab = load_training_data(data_dir;
     train_split=0.8f0,
-    max_proof_states=50000,  # Cap for reasonable training time
-    num_negatives=20
+    max_proof_states=max_proof_states,
+    num_negatives=num_negatives,
 )
 
 if isempty(train_data.examples)
@@ -101,8 +123,9 @@ println("Model created successfully")
 println()
 
 # Configure training
+num_epochs = parse(Int, get(ENV, "ECHIDNA_NUM_EPOCHS", "30"))
 training_config = TrainingConfig(
-    num_epochs=30,
+    num_epochs=num_epochs,
     learning_rate=1f-4,
     lr_schedule=:cosine,
     weight_decay=1f-5,
diff --git a/src/rust/agent/memory.rs b/src/rust/agent/memory.rs
index 75fd294..a100211 100644
--- a/src/rust/agent/memory.rs
+++ b/src/rust/agent/memory.rs
@@ -12,7 +12,9 @@ use async_trait::async_trait;
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use tokio::sync::RwLock;
-use tracing::{debug, info, warn};
+use tracing::{debug, info};
+#[cfg(feature = "verisim")]
+use tracing::warn;
 
 use super::AgenticGoal;
 use crate::core::ProofState;
diff --git a/src/rust/proof_encoding.rs b/src/rust/proof_encoding.rs
index c04aaa0..ebfdd91 100644
--- a/src/rust/proof_encoding.rs
+++ b/src/rust/proof_encoding.rs
@@ -154,8 +154,8 @@ mod tests {
             hypotheses: vec![],
         };
 
-        let id1 = proof_identity("my_theorem", &goal, ProverKind::Lean4);
-        let id2 = proof_identity("my_theorem", &goal, ProverKind::Lean4);
+        let id1 = proof_identity("my_theorem", &goal, ProverKind::Lean);
+        let id2 = proof_identity("my_theorem", &goal, ProverKind::Lean);
 
         assert_eq!(id1, id2, "Same inputs must produce same identity");
     }
@@ -167,7 +167,7 @@
             hypotheses: vec![],
         };
-        let lean = proof_identity("thm", &goal, ProverKind::Lean4);
+        let lean = proof_identity("thm", &goal, ProverKind::Lean);
         let coq = proof_identity("thm", &goal, ProverKind::Coq);
 
         assert_ne!(
             lean, coq,
diff --git a/src/rust/provers/mod.rs b/src/rust/provers/mod.rs
index 338f851..e95bcdd 100644
--- a/src/rust/provers/mod.rs
+++ b/src/rust/provers/mod.rs
@@ -1355,9 +1355,13 @@ impl ProverFactory {
             // Note: .lean is shared between Lean 3 and Lean 4; default is Lean 4.
             // Use detect_from_file_content() for Lean 3 vs 4 disambiguation.
             "lean3" => Some(ProverKind::Lean3), // explicit extension
-            "thm" => Some(ProverKind::Abella), // Abella .thm files
-            "dk" | "lp" => Some(ProverKind::Dedukti), // Dedukti / λΠ
-            "bpl" => Some(ProverKind::Boogie), // Boogie intermediate language
+            "thm" => Some(ProverKind::Abella), // Abella .thm files
+            // Dedukti uses .dk; .lp (lambdapi dialect) is shadowed by GLPK above
+            // since LP/MIP files dominate that extension in the wild — a .lp
+            // input ambiguous between lambdapi and linear programming is
+            // resolved as GLPK. Use detect_from_file_content() to disambiguate.
+            "dk" => Some(ProverKind::Dedukti), // Dedukti / λΠ
+            "bpl" => Some(ProverKind::Boogie), // Boogie intermediate language
             "ftl" => Some(ProverKind::Naproche), // Naproche controlled-NL
             "ma" => Some(ProverKind::Matita), // Matita proof file
             "ard" => Some(ProverKind::Arend), // Arend cubical HoTT
diff --git a/src/rust/vcl_ut.rs b/src/rust/vcl_ut.rs
index 26dc9e0..c648e8a 100644
--- a/src/rust/vcl_ut.rs
+++ b/src/rust/vcl_ut.rs
@@ -22,7 +22,9 @@
 
 use anyhow::{Context, Result};
 use serde::{Deserialize, Serialize};
-use tracing::{debug, info, warn};
+use tracing::{debug, info};
+#[cfg(feature = "verisim")]
+use tracing::warn;
 
 use crate::provers::ProverKind;
 
@@ -493,13 +495,11 @@ impl QueryExecutor {
     /// Find a specific proof by theorem name and optional prover.
     async fn execute_find_proof(&self, query: &ProofQuery) -> Result {
-            type_info: None,
-        let theorem = query.theorem_name.as_deref().unwrap_or("");
-        #[cfg(feature = "verisim")]
         if let Some(prover) = query.prover {
             // Generate the octad key and look it up directly
             if let Some(ref goal_display) = query.goal_display {
+                let theorem = query.theorem_name.as_deref().unwrap_or("");
                 let goal = Goal {
                     id: "query".to_string(),
                     target: crate::core::Term::Var(goal_display.clone()),
diff --git a/src/rust/verification/mod.rs b/src/rust/verification/mod.rs
index 4adb299..2cad8f5 100644
--- a/src/rust/verification/mod.rs
+++ b/src/rust/verification/mod.rs
@@ -16,6 +16,8 @@ pub mod confidence;
 pub mod mutation;
 pub mod pareto;
 pub mod portfolio;
+#[cfg(feature = "verisim")]
+pub mod proof;
 pub mod statistics;
 
 pub use axiom_tracker::{AxiomPolicy, AxiomTracker, AxiomUsage, DangerLevel};
@@ -24,4 +26,8 @@ pub use confidence::TrustLevel;
 pub use mutation::{MutationKind, MutationResult, MutationTester};
 pub use pareto::{ParetoFrontier, ProofCandidate, ProofObjective};
 pub use portfolio::{PortfolioConfig, PortfolioResult, PortfolioSolver};
+#[cfg(feature = "verisim")]
+pub use proof::{
+    theorem_identity, Proof, ProofStateRecord, ProofVersion, TacticApplication, TacticStatus,
+};
 pub use statistics::StatisticsTracker;
diff --git a/src/rust/verisim_bridge.rs b/src/rust/verisim_bridge.rs
index 2ad22d9..687ce30 100644
--- a/src/rust/verisim_bridge.rs
+++ b/src/rust/verisim_bridge.rs
@@ -23,9 +23,9 @@ use anyhow::{Context, Result};
 use chrono::Utc;
 use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
-use tracing::{debug, info, warn};
+use tracing::{debug, warn};
 
-use crate::core::{Goal, ProofState, Tactic};
+use crate::core::{Goal, ProofState};
 use crate::proof_encoding;
 use crate::provers::ProverKind;
 
@@ -252,7 +252,7 @@ pub struct SpatialPayload {
 ///
 /// Usage:
 /// ```ignore
-/// let octad = ProofOctadBuilder::new("my_theorem", &goal, ProverKind::Lean4)
+/// let octad = ProofOctadBuilder::new("my_theorem", &goal, ProverKind::Lean)
 ///     .with_proof_state(&proof_state)
 ///     .with_axioms(vec!["Classical.em".to_string()])
 ///     .with_aspects(vec!["logic".to_string()])
@@ -879,7 +879,7 @@ fn base64_encode(bytes: &[u8]) -> String {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::core::{Context, Term};
+    use crate::core::{Context, Tactic, Term};
     use std::collections::HashMap;
 
     fn sample_goal() -> Goal {
@@ -921,7 +921,7 @@
         let goal = sample_goal();
         let proof = sample_proof_state();
 
-        let octad = ProofOctadBuilder::new("nat_add_zero", &goal, ProverKind::Lean4)
+        let octad = ProofOctadBuilder::new("nat_add_zero", &goal, ProverKind::Lean)
             .with_proof_state(&proof)
             .with_axioms(vec!["Nat.rec".to_string()])
             .with_aspects(vec!["arithmetic".to_string(), "induction".to_string()])
@@ -932,9 +932,10 @@
         // Key should be a 64-char hex digest
         assert_eq!(octad.key.len(), 64);
 
-        // Semantic modality
+        // Semantic modality (ProverKind::Lean is the Lean 4 variant; Lean 3
+        // is a sibling ProverKind::Lean3.)
         assert_eq!(octad.semantic.status, ProofStatus::Complete);
-        assert_eq!(octad.semantic.prover, "Lean4");
+        assert_eq!(octad.semantic.prover, "Lean");
         assert!(!octad.semantic.proof_blob_b64.is_empty());
         assert_eq!(octad.semantic.axioms_used, vec!["Nat.rec"]);
 
@@ -964,7 +965,7 @@
 
         // Graph modality
         assert_eq!(octad.graph.cross_prover_id.len(), 64);
-        assert!(octad.graph.prover_id.contains("Lean4"));
+        assert!(octad.graph.prover_id.contains("Lean"));
 
         // Tensor modality
         assert_eq!(*octad.tensor.metrics.get("time_ms").unwrap(), 42.0);
diff --git a/tests/live_prover_suite.rs b/tests/live_prover_suite.rs
index 936a14d..a31e280 100644
--- a/tests/live_prover_suite.rs
+++ b/tests/live_prover_suite.rs
@@ -111,6 +111,15 @@ fn kind_label(kind: ProverKind) -> &'static str {
         ProverKind::Dafny => "Dafny",
         ProverKind::FStar => "F*",
         ProverKind::TLAPS => "TLAPS",
+        ProverKind::Tamarin => "Tamarin",
+        ProverKind::ProVerif => "ProVerif",
+        ProverKind::Metamath => "Metamath",
+        ProverKind::Twelf => "Twelf",
+        ProverKind::ORTools => "OR-Tools",
+        ProverKind::HOL4 => "HOL4",
+        ProverKind::ACL2 => "ACL2",
+        ProverKind::SCIP => "SCIP",
+        ProverKind::Imandra => "Imandra",
         _ => "",
     }
 }
@@ -215,3 +224,65 @@ async fn live_tlaps_version() {
     // TLA+ Proof System's prover is `tlapm` (per provers/mod.rs).
     assert_version_reachable(ProverKind::TLAPS, "tlapm").await;
 }
+
+// ==========================================================================
+// Tier 3 — weekly. Upstream-tarball or heavier-build provers. Most of
+// these SKIP locally and in PR CI; the weekly tier3 matrix in
+// live-provers.yml provisions each binary in its own job before running
+// `cargo test ... `, so only the matching test runs per job.
+// ==========================================================================
+
+#[tokio::test]
+async fn live_tamarin_version() {
+    assert_version_reachable(ProverKind::Tamarin, "tamarin-prover").await;
+}
+
+#[tokio::test]
+async fn live_proverif_version() {
+    assert_version_reachable(ProverKind::ProVerif, "proverif").await;
+}
+
+#[tokio::test]
+async fn live_metamath_version() {
+    assert_version_reachable(ProverKind::Metamath, "metamath").await;
+}
+
+#[tokio::test]
+async fn live_twelf_version() {
+    // Twelf's CLI entry is `twelf-server` per provers/mod.rs.
+    assert_version_reachable(ProverKind::Twelf, "twelf-server").await;
+}
+
+#[tokio::test]
+async fn live_ortools_version() {
+    // Echidna's ORTools backend invokes `ortools_solve` (wrapper around the
+    // OR-Tools C++ solve CLI). Provisioned via upstream tarball.
+    assert_version_reachable(ProverKind::ORTools, "ortools_solve").await;
+}
+
+#[tokio::test]
+async fn live_hol4_version() {
+    // HOL4 requires Poly/ML + a tree build; provisioning is deferred to
+    // Containerfile. Test SKIPs on runners without `hol` on PATH.
+    assert_version_reachable(ProverKind::HOL4, "hol").await;
+}
+
+#[tokio::test]
+async fn live_acl2_version() {
+    // ACL2 requires a Common Lisp image; provisioning deferred to Containerfile.
+    assert_version_reachable(ProverKind::ACL2, "acl2").await;
+}
+
+#[tokio::test]
+async fn live_scip_version() {
+    // SCIP requires a cmake build of SCIP Optimization Suite; deferred to
+    // Containerfile. Test SKIPs until provisioned.
+    assert_version_reachable(ProverKind::SCIP, "scip").await;
+}
+
+#[tokio::test]
+async fn live_imandra_version() {
+    // Imandra is proprietary; handled via vendor-supplied container where a
+    // licence is available. Test SKIPs on public CI.
+    assert_version_reachable(ProverKind::Imandra, "imandra").await;
+}