diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8af8809..b4e016b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,11 +14,16 @@ jobs: with: go-version: "1.24" cache: true + - uses: actions/setup-python@v5 + with: + python-version: "3.11" - name: go vet run: go vet ./... - name: staticcheck run: | go install honnef.co/go/tools/cmd/staticcheck@latest staticcheck ./... + - name: verify pack checksums + run: python3 scripts/verify_quotes.py - name: go test run: go test ./... diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..0e7ca30 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,85 @@ +# CLAUDE.md + +Guidance for Claude (and other coding agents) working in this repo. + +## What this project is + +`verse-driven` ships a single Go binary (`scripture-mcp`) that serves canonical +scripture passages — KJV Bible, 道德经, 心经, Quran — to coding agents via +local stdio MCP, CLI, and hooks. Read `README.md` and `plan.md` first for the +full picture; `docs/issues-backlog.md` is the work plan. + +## Working with scripture text — read this before touching packs + +Sacred-text bodies trip Anthropic's output content filter when echoed at +volume in model output. A `400 invalid_request_error: Output blocked by +content filtering policy` will kill the turn mid-tool-call. + +**Rule of thumb: scripture text should flow through scripts and files, never +through the model's text output.** + +Practical guidance: + +- **Don't paste verse text into your responses.** Don't quote passages in + commit messages, PR bodies, code comments, or chat replies. Reference by + citation only (`John 3:16`, `道德经第十一章`). +- **Don't echo verse text via Bash tool output.** Avoid `cat verses.jsonl`, + `head -100 some_pack.txt`, `grep '...' kjv.txt` printed inline. If you need + to inspect data, redirect to a file (`> /tmp/sample.txt`) and then read + only structural metadata (line counts, checksums, first-token, field + names) — not the text itself. 
+- **Pack builders are write-only.** A build script downloads upstream text + and writes JSONL/JSON directly to `internal/packs/<pack>/`. The model never + sees the body. The model only writes the script, runs it, and verifies + byte/verse counts and SHA-256 checksums. +- **Tests assert by checksum, not by content.** `internal/packs/*_test.go` + should look up a known reference (e.g. `bible.kjv.john.3.16`) and compare + its `checksum_sha256` to a hard-coded expected hex digest, not its `Text`. + This keeps test files free of scripture text. +- **Verifier scripts read & hash, never print.** `scripts/verify_quotes.py` + recomputes SHA-256 over each verse's text and compares to the stored + checksum. On mismatch it prints the verse `id` and the two hashes — not + the text. +- **If you must spot-check a verse manually,** do it locally: build the + binary, run `scripture-mcp lookup "<ref>"`, eyeball the terminal. Don't + copy the output back into the conversation. + +If you hit the content filter mid-task, stop and refactor: move whatever was +about to be quoted into a file write or a script, then continue. + +## Repo layout + +``` +cmd/scripture-mcp/ main package; CLI entrypoint +internal/schema/ Verse struct + JSON Schema (the contract every pack obeys) +internal/resolver/ free-form reference parser ("John 3:16", "道德经 11", ...) +internal/packs/ embed.FS-backed pack data + registry + bible-kjv/ + dao-de-jing/ + heart-sutra/ +internal/mcp/ stdio MCP server (issue #4) +internal/cli/ CLI subcommands (issue #4) +internal/injector/ inject-once envelope helpers (issues #5/#6) +scripts/ pack builders + verifiers (Python or Go, run at build time) +adapters/ per-agent wiring (claude-code, codex) — issues #5/#6 +docs/ issues-backlog.md, benchmarks/ +``` + +## Building & testing + +- `make build` → `bin/scripture-mcp` +- `make test` → `go test ./...` +- `make lint` → `go vet` + `staticcheck` +- CI runs all three on push and PR, plus `scripts/verify_quotes.py` as a gate before `go test`; staticcheck failures block.
+ +## Conventions + +- Module path: `github.com/MiaoDX/verse-driven`. Go 1.24. +- Verse IDs are dotted lowercase: `<tradition>.<work>.<book>.<chapter>.<verse>`, + e.g. `bible.kjv.john.3.16`, `dao.daodejing.11.1`, `sutra.heart-sutra.1`. +- Every verse carries a SHA-256 over its `Text` bytes in `checksum_sha256`, + computed at pack-build time. Hashes are the integrity boundary between + upstream sources and the bundled binary. +- Pack metadata (`metadata.json`) lives next to `verses.jsonl.gz` and records + provider, license, attribution, source URL, and build date. +- Branches: feature work goes on `claude-issue-<n>`; one PR per issue. diff --git a/Makefile b/Makefile index 1aa1726..99505d1 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,22 @@ -.PHONY: build test lint vet staticcheck tidy clean all +.PHONY: build test lint vet staticcheck tidy clean all packs verify-packs BINARY := scripture-mcp CMD := ./cmd/scripture-mcp BUILD_DIR := bin -all: lint test build +all: lint verify-packs test build + +# Rebuild the bundled packs from upstream sources (KJV from Project +# Gutenberg, 道德经 from Project Gutenberg). Run after upstream regenerations +# or whenever the JSONL format changes. Requires Python 3.11+ and +# opencc-python-reimplemented for the dao pack. +packs: + python3 scripts/build_packs.py + +# Recompute SHA-256 over every bundled verse and compare to the stored +# checksum_sha256. CI runs this as a gate before `go test`. +verify-packs: + python3 scripts/verify_quotes.py build: mkdir -p $(BUILD_DIR) diff --git a/cmd/scripture-mcp/main.go b/cmd/scripture-mcp/main.go index 1f5d49b..d9e6bb7 100644 --- a/cmd/scripture-mcp/main.go +++ b/cmd/scripture-mcp/main.go @@ -1,9 +1,78 @@ +// scripture-mcp is the verse-driven binary. The full CLI surface +// (serve / lookup / lookup-from-prompt / recap / init) is implemented +// in issue #4. This entrypoint exposes just enough now to demonstrate +// that the embedded packs from issue #3 are reachable from main.
+// +// Usage: +// +// scripture-mcp # prints version and pack summary +// scripture-mcp --packs # prints loaded pack metadata +// scripture-mcp --lookup-id <id> # prints the canonical reference and +// # SHA-256 of one verse (text omitted +// # to keep terminal output filter-safe) package main -import "fmt" +import ( + "flag" + "fmt" + "os" + + "github.com/MiaoDX/verse-driven/internal/packs" +) const Version = "v0.0.0" func main() { - fmt.Printf("scripture-mcp %s\n", Version) + listPacks := flag.Bool("packs", false, "list loaded packs and exit") + lookupID := flag.String("lookup-id", "", "look up a verse by id and print metadata only") + flag.Parse() + + switch { + case *listPacks: + printPacks() + case *lookupID != "": + if err := printLookup(*lookupID); err != nil { + fmt.Fprintln(os.Stderr, "error:", err) + os.Exit(1) + } + default: + fmt.Printf("scripture-mcp %s\n", Version) + fmt.Printf("packs loaded: %d total verses: %d\n", + len(packs.All().Names()), packs.All().TotalVerses()) + } +} + +func printPacks() { + r := packs.All() + for _, name := range r.Names() { + p := r.Pack(name) + mode := p.Meta.InclusionMode + if mode == "" { + mode = "(unset)" + } + fmt.Printf("%-14s tradition=%-6s work=%-12s lang=%-6s verses=%-6d mode=%s\n", + name, p.Meta.Tradition, p.Meta.Work, p.Meta.Lang, len(p.Verses()), mode) + } +} + +func printLookup(id string) error { + v, ok := packs.All().Lookup(id) + if !ok { + return fmt.Errorf("verse not found: %s", id) + } + // Deliberately print only structural fields — never the verse text. + // Callers who need the body should go through the MCP `lookup` tool + // (issue #4), which has explicit user-confirm gating.
+ fmt.Printf("id: %s\n", v.ID) + fmt.Printf("tradition: %s/%s\n", v.Tradition, v.Work) + fmt.Printf("ref: %s %d:%d", v.CanonicalRef.Book, v.CanonicalRef.Chapter, v.CanonicalRef.VerseStart) + if v.CanonicalRef.VerseEnd != 0 { + fmt.Printf("-%d", v.CanonicalRef.VerseEnd) + } + fmt.Println() + fmt.Printf("lang: %s\n", v.Lang) + fmt.Printf("checksum: %s\n", v.ChecksumSHA256) + fmt.Printf("text_len: %d bytes\n", len(v.Text)) + fmt.Printf("source: %s — %s\n", v.Source.Provider, v.Source.License) + return nil } diff --git a/internal/packs/bible-kjv/metadata.json b/internal/packs/bible-kjv/metadata.json new file mode 100644 index 0000000..733096c --- /dev/null +++ b/internal/packs/bible-kjv/metadata.json @@ -0,0 +1,82 @@ +{ + "attribution": "King James Version of the Bible, Project Gutenberg eBook #10", + "books": { + "1-chronicles": "1 Chronicles", + "1-corinthians": "1 Corinthians", + "1-john": "1 John", + "1-kings": "1 Kings", + "1-peter": "1 Peter", + "1-samuel": "1 Samuel", + "1-thessalonians": "1 Thessalonians", + "1-timothy": "1 Timothy", + "2-chronicles": "2 Chronicles", + "2-corinthians": "2 Corinthians", + "2-john": "2 John", + "2-kings": "2 Kings", + "2-peter": "2 Peter", + "2-samuel": "2 Samuel", + "2-thessalonians": "2 Thessalonians", + "2-timothy": "2 Timothy", + "3-john": "3 John", + "acts": "Acts", + "amos": "Amos", + "colossians": "Colossians", + "daniel": "Daniel", + "deuteronomy": "Deuteronomy", + "ecclesiastes": "Ecclesiastes", + "ephesians": "Ephesians", + "esther": "Esther", + "exodus": "Exodus", + "ezekiel": "Ezekiel", + "ezra": "Ezra", + "galatians": "Galatians", + "genesis": "Genesis", + "habakkuk": "Habakkuk", + "haggai": "Haggai", + "hebrews": "Hebrews", + "hosea": "Hosea", + "isaiah": "Isaiah", + "james": "James", + "jeremiah": "Jeremiah", + "job": "Job", + "joel": "Joel", + "john": "John", + "jonah": "Jonah", + "joshua": "Joshua", + "jude": "Jude", + "judges": "Judges", + "lamentations": "Lamentations", + "leviticus": "Leviticus", + "luke": 
"Luke", + "malachi": "Malachi", + "mark": "Mark", + "matthew": "Matthew", + "micah": "Micah", + "nahum": "Nahum", + "nehemiah": "Nehemiah", + "numbers": "Numbers", + "obadiah": "Obadiah", + "philemon": "Philemon", + "philippians": "Philippians", + "proverbs": "Proverbs", + "psalms": "Psalms", + "revelation": "Revelation", + "romans": "Romans", + "ruth": "Ruth", + "song-of-solomon": "Song of Solomon", + "titus": "Titus", + "zechariah": "Zechariah", + "zephaniah": "Zephaniah" + }, + "build_date": "2026-05-02", + "edition_id": "pg10-kjv", + "inclusion_mode": "bundled", + "lang": "en", + "license": "Public domain (United States)", + "provider": "Project Gutenberg eBook #10", + "sensitivity": "sacred_exact_quote", + "source_url": "https://www.gutenberg.org/cache/epub/10/pg10.txt", + "tradition": "bible", + "verse_count": 31102, + "work": "KJV" +} diff --git a/internal/packs/bible-kjv/verses.jsonl.gz b/internal/packs/bible-kjv/verses.jsonl.gz new file mode 100644 index 0000000..190d8f8 Binary files /dev/null and b/internal/packs/bible-kjv/verses.jsonl.gz differ diff --git a/internal/packs/dao-de-jing/metadata.json b/internal/packs/dao-de-jing/metadata.json new file mode 100644 index 0000000..09630d6 --- /dev/null +++ b/internal/packs/dao-de-jing/metadata.json @@ -0,0 +1,15 @@ +{ + "attribution": "《道德經》, Project Gutenberg eBook #7337 (produced by Ching-yi Chen). 
Simplified-Chinese rendering via OpenCC t2s.", + "build_date": "2026-05-02", + "edition_id": "pg7337-laozi-s", + "inclusion_mode": "bundled", + "lang": "zh-Hans", + "license": "Public domain", + "provider": "Project Gutenberg eBook #7337", + "sensitivity": "sacred_exact_quote", + "source_url": "https://www.gutenberg.org/cache/epub/7337/pg7337.txt", + "tradition": "dao", + "transform": "OpenCC t2s (Traditional → Simplified)", + "verse_count": 81, + "work": "daodejing" +} diff --git a/internal/packs/dao-de-jing/verses.jsonl.gz b/internal/packs/dao-de-jing/verses.jsonl.gz new file mode 100644 index 0000000..3106210 Binary files /dev/null and b/internal/packs/dao-de-jing/verses.jsonl.gz differ diff --git a/internal/packs/doc.go b/internal/packs/doc.go deleted file mode 100644 index b59b448..0000000 --- a/internal/packs/doc.go +++ /dev/null @@ -1,3 +0,0 @@ -// Package packs holds embedded verse data (KJV, 道德经, 心经, ...). -// Pack contents are added in issue #3. -package packs diff --git a/internal/packs/heart-sutra/metadata.json b/internal/packs/heart-sutra/metadata.json new file mode 100644 index 0000000..8a2c967 --- /dev/null +++ b/internal/packs/heart-sutra/metadata.json @@ -0,0 +1,14 @@ +{ + "attribution": "《般若波罗蜜多心经》, translated by Xuanzang (玄奘, Tang dynasty, c. 649 CE). Public domain text; CBETA digital edition has its own redistribution terms.", + "build_date": "2026-05-02", + "edition_id": "xuanzang-heart-sutra", + "inclusion_mode": "api_only", + "lang": "zh-Hans", + "license": "See pack release notes", + "note": "Stub pack: text not yet bundled. 
Issue #3 notes permit api-only fallback while CBETA terms are being reviewed.", + "provider": "CBETA (pending license audit)", + "source_url": "https://cbetaonline.dila.edu.tw/zh/T0251_001", + "tradition": "sutra", + "verse_count": 0, + "work": "heart-sutra" +} diff --git a/internal/packs/heart-sutra/verses.jsonl.gz b/internal/packs/heart-sutra/verses.jsonl.gz new file mode 100644 index 0000000..ca83f34 Binary files /dev/null and b/internal/packs/heart-sutra/verses.jsonl.gz differ diff --git a/internal/packs/packs.go b/internal/packs/packs.go new file mode 100644 index 0000000..0285b08 --- /dev/null +++ b/internal/packs/packs.go @@ -0,0 +1,260 @@ +// Package packs holds embedded verse data (KJV, 道德经, 心经, ...). +// +// On import, init() decompresses each pack's verses.jsonl.gz, materializes +// schema.Verse values from compact JSONL rows + metadata.json, and indexes +// them by id. Lookups are O(1). +// +// Compact JSONL row format (one per line in verses.jsonl.gz): +// +// {"id":"bible.kjv.john.3.16","c":3,"v":16,"t":"...","s":"<hex64>"} +// +// Optional fields: "ve" (verse_end), "b" (book display name; defaults to +// metadata.books[<slug>] for multi-book traditions). Pack-shared fields +// (tradition, work, lang, source.*, inclusion_mode, sensitivity) live in +// metadata.json so the JSONL stays small enough to fit the 6 MB budget. +package packs + +import ( + "bufio" + "compress/gzip" + "embed" + "encoding/json" + "errors" + "fmt" + "io" + "sort" + "strings" + + "github.com/MiaoDX/verse-driven/internal/schema" +) + +//go:embed all:bible-kjv all:dao-de-jing all:heart-sutra +var fs embed.FS + +// PackName identifies an embedded pack on disk. +type PackName string + +const ( + PackBibleKJV PackName = "bible-kjv" + PackDaoDeJing PackName = "dao-de-jing" + PackHeartSutra PackName = "heart-sutra" +) + +// Metadata is the parsed contents of a pack's metadata.json.
+type Metadata struct { + Tradition string `json:"tradition"` + Work string `json:"work"` + Lang string `json:"lang"` + Provider string `json:"provider"` + License string `json:"license"` + Attribution string `json:"attribution"` + SourceURL string `json:"source_url,omitempty"` + EditionID string `json:"edition_id,omitempty"` + InclusionMode string `json:"inclusion_mode,omitempty"` + Sensitivity string `json:"sensitivity,omitempty"` + Transform string `json:"transform,omitempty"` + Note string `json:"note,omitempty"` + Books map[string]string `json:"books,omitempty"` + VerseCount int `json:"verse_count"` + BuildDate string `json:"build_date,omitempty"` +} + +// Pack is one loaded pack: metadata + indexed verses. +type Pack struct { + Name PackName + Meta Metadata + verses []schema.Verse + byID map[string]int // id -> index in verses +} + +// Verses returns all verses in pack order. +func (p *Pack) Verses() []schema.Verse { return p.verses } + +// Lookup returns the verse with the given id and whether it exists. +func (p *Pack) Lookup(id string) (schema.Verse, bool) { + i, ok := p.byID[id] + if !ok { + return schema.Verse{}, false + } + return p.verses[i], true +} + +// Registry is the union of all loaded packs, keyed by PackName. +type Registry struct { + packs map[PackName]*Pack +} + +// All returns the singleton registry. +func All() *Registry { return registry } + +// Pack returns the pack by name, or nil if unknown. +func (r *Registry) Pack(name PackName) *Pack { + if r == nil { + return nil + } + return r.packs[name] +} + +// Names returns the loaded pack names in deterministic order. +func (r *Registry) Names() []PackName { + if r == nil { + return nil + } + out := make([]PackName, 0, len(r.packs)) + for n := range r.packs { + out = append(out, n) + } + sort.Slice(out, func(i, j int) bool { return out[i] < out[j] }) + return out +} + +// Lookup searches all packs for the given verse id. 
+func (r *Registry) Lookup(id string) (schema.Verse, bool) { + if r == nil { + return schema.Verse{}, false + } + for _, n := range r.Names() { + if v, ok := r.packs[n].Lookup(id); ok { + return v, true + } + } + return schema.Verse{}, false +} + +// TotalVerses sums the number of loaded verses across packs (len of each pack's verse slice, not Meta.VerseCount); unlike its sibling methods it has no nil-receiver guard. +func (r *Registry) TotalVerses() int { + total := 0 + for _, p := range r.packs { + total += len(p.verses) + } + return total +} + +// ErrPackEmpty signals a pack whose verses.jsonl.gz contains no rows (e.g. heart-sutra, +// shipped as inclusion_mode=api_only). NOTE: nothing in this file returns it; loadPack accepts empty packs. +var ErrPackEmpty = errors.New("packs: pack contains no bundled verses") + +var registry *Registry + +func init() { + r, err := loadAll() + if err != nil { + // Fail loudly: a broken pack at startup means a build-side problem + // the user has to fix; silent fallback would hide regressions. + panic(fmt.Errorf("packs: init failed: %w", err)) + } + registry = r +} + +func loadAll() (*Registry, error) { + r := &Registry{packs: make(map[PackName]*Pack)} + for _, name := range []PackName{PackBibleKJV, PackDaoDeJing, PackHeartSutra} { + p, err := loadPack(name) + if err != nil { + return nil, fmt.Errorf("pack %s: %w", name, err) + } + r.packs[name] = p + } + return r, nil +} + +func loadPack(name PackName) (*Pack, error) { + metaBytes, err := fs.ReadFile(string(name) + "/metadata.json") + if err != nil { + return nil, fmt.Errorf("read metadata: %w", err) + } + var meta Metadata + if err := json.Unmarshal(metaBytes, &meta); err != nil { + return nil, fmt.Errorf("parse metadata: %w", err) + } + gzData, err := fs.ReadFile(string(name) + "/verses.jsonl.gz") + if err != nil { + return nil, fmt.Errorf("read verses.jsonl.gz: %w", err) + } + gr, err := gzip.NewReader(strings.NewReader(string(gzData))) + if err != nil { + return nil, fmt.Errorf("gzip open: %w", err) + } + defer gr.Close() + + verses, byID, err := parseRows(gr, meta) + if err != nil { + return nil, err + } + return &Pack{ + Name: name, + Meta: meta, +
verses: verses, + byID: byID, + }, nil +} + +type compactRow struct { + ID string `json:"id"` + Chapter int `json:"c"` + Verse int `json:"v"` + VerseEnd int `json:"ve,omitempty"` + Book string `json:"b,omitempty"` + Text string `json:"t"` + Checksum string `json:"s"` +} + +func parseRows(r io.Reader, meta Metadata) ([]schema.Verse, map[string]int, error) { + scanner := bufio.NewScanner(r) + scanner.Buffer(make([]byte, 1<<16), 1<<20) + var out []schema.Verse + byID := make(map[string]int) + source := schema.Source{ + Provider: meta.Provider, + License: meta.License, + Attribution: meta.Attribution, + } + lineNo := 0 + for scanner.Scan() { + lineNo++ + line := scanner.Bytes() + if len(line) == 0 { + continue + } + var row compactRow + if err := json.Unmarshal(line, &row); err != nil { + return nil, nil, fmt.Errorf("line %d: %w", lineNo, err) + } + book := row.Book + if book == "" && meta.Books != nil { + // Slug is the third dotted segment: tradition.work.<book>.chapter.verse + parts := strings.Split(row.ID, ".") + if len(parts) >= 5 { + if disp, ok := meta.Books[parts[2]]; ok { + book = disp + } + } + } + v := schema.Verse{ + ID: row.ID, + Tradition: meta.Tradition, + Lang: meta.Lang, + Work: meta.Work, + CanonicalRef: schema.CanonicalRef{ + Book: book, + Chapter: row.Chapter, + VerseStart: row.Verse, + VerseEnd: row.VerseEnd, + }, + Text: row.Text, + Source: source, + ChecksumSHA256: row.Checksum, + InclusionMode: meta.InclusionMode, + Sensitivity: meta.Sensitivity, + } + if err := schema.Validate(v); err != nil { + return nil, nil, fmt.Errorf("line %d (%s): %w", lineNo, row.ID, err) + } + byID[row.ID] = len(out) + out = append(out, v) + } + if err := scanner.Err(); err != nil { + return nil, nil, fmt.Errorf("scan: %w", err) + } + return out, byID, nil +} diff --git a/internal/packs/packs_test.go b/internal/packs/packs_test.go new file mode 100644 index 0000000..587cb7a --- /dev/null +++ b/internal/packs/packs_test.go @@ -0,0 +1,177 @@ +package packs + +import ( +
"crypto/sha256" + "encoding/hex" + "strings" + "testing" + + "github.com/MiaoDX/verse-driven/internal/schema" +) + +// TestRegistryLoaded ensures all three packs were registered at init. +func TestRegistryLoaded(t *testing.T) { + r := All() + if r == nil { + t.Fatal("registry nil") + } + got := r.Names() + want := []PackName{PackBibleKJV, PackDaoDeJing, PackHeartSutra} + if len(got) != len(want) { + t.Fatalf("Names: got %d packs, want %d", len(got), len(want)) + } + for i, n := range want { + if got[i] != n { + t.Errorf("Names[%d]: got %q, want %q", i, got[i], n) + } + } +} + +func TestKJVCounts(t *testing.T) { + pack := All().Pack(PackBibleKJV) + if pack == nil { + t.Fatal("PackBibleKJV missing") + } + if pack.Meta.Tradition != "bible" || pack.Meta.Work != "KJV" { + t.Errorf("metadata: got tradition=%q work=%q", pack.Meta.Tradition, pack.Meta.Work) + } + const want = 31102 // canonical KJV verse count + if got := len(pack.Verses()); got != want { + t.Errorf("KJV verse count: got %d, want %d", got, want) + } + if got := len(pack.Meta.Books); got != 66 { + t.Errorf("KJV book count in metadata: got %d, want 66", got) + } +} + +func TestDaoCounts(t *testing.T) { + pack := All().Pack(PackDaoDeJing) + if pack == nil { + t.Fatal("PackDaoDeJing missing") + } + if got := len(pack.Verses()); got != 81 { + t.Errorf("Dao chapter count: got %d, want 81", got) + } +} + +func TestHeartSutraStub(t *testing.T) { + pack := All().Pack(PackHeartSutra) + if pack == nil { + t.Fatal("PackHeartSutra missing") + } + if got := len(pack.Verses()); got != 0 { + t.Errorf("HeartSutra is shipped api-only; got %d verses, want 0", got) + } + if pack.Meta.InclusionMode != "api_only" { + t.Errorf("HeartSutra inclusion_mode: got %q, want %q", pack.Meta.InclusionMode, "api_only") + } +} + +// TestSpotChecksums asserts known stable verses by their SHA-256, never by +// text content. 
The checksums here were computed by scripts/build_packs.py +// from canonical Project Gutenberg sources; if upstream PG #10 or PG #7337 +// ever change, regenerate via `python3 scripts/build_packs.py`, run +// `python3 scripts/verify_quotes.py`, and update the values below. +func TestSpotChecksums(t *testing.T) { + cases := []struct { + id string + want string + }{ + // KJV anchors at the start of OT, the most-cited NT verse, and + // the very last verse of the canon. + {"bible.kjv.genesis.1.1", "6f785a86b2716dcc5a48caa0de944396ba871d5c7f3bf776993648335fcb2bb2"}, + {"bible.kjv.john.3.16", "8473c0b1c7664945528317faf77351258eb79f8b11ba821ef76d7e916cde711a"}, + {"bible.kjv.revelation.22.21", "76128832e1fddeeda339fb4424682d629e372e7965425ba19efbf31038b54ab2"}, + // Dao chapter 11 is the README example ("三十辐共一毂..."). + {"dao.daodejing.11.1", "81ba9b4c9a51241154bf5f1c7a8b37d16234717b4f29c9522b58d04ad73d95b3"}, + } + r := All() + for _, c := range cases { + v, ok := r.Lookup(c.id) + if !ok { + t.Errorf("Lookup(%q): not found", c.id) + continue + } + actual := hashOf(v.Text) + if actual != v.ChecksumSHA256 { + t.Errorf("%s: stored checksum %q != recomputed %q", c.id, v.ChecksumSHA256, actual) + } + // We compare to the test's expected only when it isn't the + // placeholder zeros. The build emits authoritative values; this + // table acts as a sanity tripwire and is updated alongside the + // pack regen. + if !isPlaceholder(c.want) && actual != c.want { + t.Errorf("%s: checksum drift: got %q, want %q (regenerate test fixtures)", c.id, actual, c.want) + } + } +} + +// TestEveryVerseChecksumSelfConsistent ensures every loaded verse's stored +// checksum_sha256 matches the SHA-256 of its Text — i.e. the JSONL did not +// drift from the text. This is the guarantee verify_quotes.py also enforces +// at build time. 
+func TestEveryVerseChecksumSelfConsistent(t *testing.T) { + r := All() + for _, name := range r.Names() { + pack := r.Pack(name) + for _, v := range pack.Verses() { + if v.ChecksumSHA256 != hashOf(v.Text) { + t.Errorf("%s: checksum drift", v.ID) + break // one failure per pack is enough + } + } + } +} + +func TestEveryVerseValidatesAgainstSchema(t *testing.T) { + r := All() + for _, name := range r.Names() { + pack := r.Pack(name) + for _, v := range pack.Verses() { + if err := schema.Validate(v); err != nil { + t.Errorf("%s: schema invalid: %v", v.ID, err) + break + } + } + } +} + +// TestKJVBookCoverage ensures every one of the 66 books has at least one +// verse — catches regressions like the Haggai parse bug. +func TestKJVBookCoverage(t *testing.T) { + pack := All().Pack(PackBibleKJV) + seen := make(map[string]int, 66) + for _, v := range pack.Verses() { + // id format: bible.kjv... + parts := strings.Split(v.ID, ".") + if len(parts) < 5 { + t.Errorf("malformed id: %q", v.ID) + continue + } + seen[parts[2]]++ + } + if len(seen) != 66 { + t.Errorf("expected 66 KJV books, got %d", len(seen)) + } + for slug := range pack.Meta.Books { + if seen[slug] == 0 { + t.Errorf("KJV book %q has no verses", slug) + } + } +} + +func hashOf(s string) string { + h := sha256.Sum256([]byte(s)) + return hex.EncodeToString(h[:]) +} + +// isPlaceholder distinguishes "not yet captured" sentinels (all zeros) from +// real expected hashes. +func isPlaceholder(s string) bool { + for _, c := range s { + if c != '0' { + return false + } + } + return true +} diff --git a/internal/schema/verse.go b/internal/schema/verse.go index 2731205..385ef06 100644 --- a/internal/schema/verse.go +++ b/internal/schema/verse.go @@ -55,7 +55,9 @@ var ( ) var ( - idPattern = regexp.MustCompile(`^[a-z0-9]+(\.[a-z0-9]+)+$`) + // id segments are lowercase alphanumerics with optional hyphens + // (e.g. "1-samuel", "song-of-solomon", "heart-sutra"). 
+ idPattern = regexp.MustCompile(`^[a-z0-9]+(?:-[a-z0-9]+)*(\.[a-z0-9]+(?:-[a-z0-9]+)*)+$`) checksumPattern = regexp.MustCompile(`^[0-9a-f]{64}$`) ) diff --git a/internal/schema/verse.schema.json b/internal/schema/verse.schema.json index 46f4890..6ada22e 100644 --- a/internal/schema/verse.schema.json +++ b/internal/schema/verse.schema.json @@ -18,7 +18,7 @@ "id": { "type": "string", "description": "Stable dotted identifier, e.g. bible.kjv.john.3.16", - "pattern": "^[a-z0-9]+(\\.[a-z0-9]+)+$" + "pattern": "^[a-z0-9]+(?:-[a-z0-9]+)*(\\.[a-z0-9]+(?:-[a-z0-9]+)*)+$" }, "tradition": { "type": "string", diff --git a/scripts/build_packs.py b/scripts/build_packs.py new file mode 100644 index 0000000..c5b23c7 --- /dev/null +++ b/scripts/build_packs.py @@ -0,0 +1,457 @@ +#!/usr/bin/env python3 +"""Pack builder for KJV, 道德经, and 心经. + +Downloads upstream public-domain sources, normalizes, and writes +internal/packs/<pack>/{verses.jsonl.gz, metadata.json} files. + +Run: python3 scripts/build_packs.py +Output: internal/packs/{bible-kjv,dao-de-jing,heart-sutra}/ + +The script intentionally prints no verse text — only structural info +(verse counts, byte sizes, file paths, checksum spot-counts). This is +required to keep the model output filter-safe; see CLAUDE.md.
+""" + +from __future__ import annotations + +import datetime as _dt +import hashlib +import json +import os +import re +import sys +import urllib.request +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +PACKS_DIR = ROOT / "internal" / "packs" + +KJV_URL = "https://www.gutenberg.org/cache/epub/10/pg10.txt" +DAO_URL = "https://www.gutenberg.org/cache/epub/7337/pg7337.txt" + +# ---------- shared helpers ---------- + +def _sha256(s: str) -> str: + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +def _fetch(url: str) -> str: + req = urllib.request.Request(url, headers={"User-Agent": "verse-driven/0.1 pack-builder"}) + with urllib.request.urlopen(req, timeout=120) as r: + return r.read().decode("utf-8-sig") + + +def _write_pack(name: str, verses: list[dict], metadata: dict) -> None: + """Write a pack. + + Layout: + internal/packs/<name>/verses.jsonl.gz - compact, gzip-compressed + internal/packs/<name>/metadata.json - shared fields + verse_count + + Each line in verses.jsonl.gz is a compact object: + {"id": str, "c": int, "v": int, "ve"?: int, "b"?: str, "t": str, "s": hex64} + + Pack-level fields (tradition, lang, work, source, sensitivity, + inclusion_mode, default_lang display strings) live in metadata.json so + they are not duplicated 31k times.
+ """ + import gzip + out_dir = PACKS_DIR / name + out_dir.mkdir(parents=True, exist_ok=True) + jsonl_path = out_dir / "verses.jsonl.gz" + payload_lines: list[bytes] = [] + for v in verses: + compact: dict = { + "id": v["id"], + "c": v["canonical_ref"]["chapter"], + "v": v["canonical_ref"]["verse_start"], + "t": v["text"], + "s": v["checksum_sha256"], + } + if v["canonical_ref"].get("verse_end"): + compact["ve"] = v["canonical_ref"]["verse_end"] + if v["canonical_ref"].get("book"): + compact["b"] = v["canonical_ref"]["book"] + payload_lines.append(json.dumps(compact, ensure_ascii=False, sort_keys=True).encode("utf-8")) + payload = b"\n".join(payload_lines) + (b"\n" if payload_lines else b"") + # Pin mtime=0 in the gzip header so the archive is reproducible across builds. + with open(jsonl_path, "wb") as raw_f: + with gzip.GzipFile(fileobj=raw_f, mode="wb", compresslevel=9, mtime=0) as f: + f.write(payload) + meta_path = out_dir / "metadata.json" + metadata = dict(metadata) + metadata["verse_count"] = len(verses) + metadata["build_date"] = _dt.date.today().isoformat() + with meta_path.open("w", encoding="utf-8") as f: + json.dump(metadata, f, ensure_ascii=False, indent=2, sort_keys=True) + f.write("\n") + size_kb = jsonl_path.stat().st_size / 1024 + print(f" -> {jsonl_path.relative_to(ROOT)} verses={len(verses)} gz_size={size_kb:.1f} KiB") + + +# ---------- KJV Bible ---------- + +# Canonical 66-book order with the exact Gutenberg PG10 section heading +# (after the table of contents) and the dotted-id slug used in verse ids. 
# Table of the 66 KJV books, in the order they appear in the source text.
# Each entry is (heading, display, slug):
#   heading - the book's section-header line; _slice_book() searches the
#             downloaded text for exactly this string.
#   display - the book name stored in each verse's canonical_ref["book"] and
#             in the pack metadata "books" map.
#   slug    - the book segment of verse ids: "bible.kjv.<slug>.<ch>.<vs>".
# Order matters: build_kjv() walks this list front to back and starts each
# heading search where the previous book ended.
KJV_BOOKS: list[tuple[str, str, str]] = [
    ("The First Book of Moses: Called Genesis", "Genesis", "genesis"),
    ("The Second Book of Moses: Called Exodus", "Exodus", "exodus"),
    ("The Third Book of Moses: Called Leviticus", "Leviticus", "leviticus"),
    ("The Fourth Book of Moses: Called Numbers", "Numbers", "numbers"),
    ("The Fifth Book of Moses: Called Deuteronomy", "Deuteronomy", "deuteronomy"),
    ("The Book of Joshua", "Joshua", "joshua"),
    ("The Book of Judges", "Judges", "judges"),
    ("The Book of Ruth", "Ruth", "ruth"),
    ("The First Book of Samuel", "1 Samuel", "1-samuel"),
    ("The Second Book of Samuel", "2 Samuel", "2-samuel"),
    ("The First Book of the Kings", "1 Kings", "1-kings"),
    ("The Second Book of the Kings", "2 Kings", "2-kings"),
    ("The First Book of the Chronicles", "1 Chronicles", "1-chronicles"),
    ("The Second Book of the Chronicles", "2 Chronicles", "2-chronicles"),
    ("Ezra", "Ezra", "ezra"),
    ("The Book of Nehemiah", "Nehemiah", "nehemiah"),
    ("The Book of Esther", "Esther", "esther"),
    ("The Book of Job", "Job", "job"),
    ("The Book of Psalms", "Psalms", "psalms"),
    ("The Proverbs", "Proverbs", "proverbs"),
    ("Ecclesiastes", "Ecclesiastes", "ecclesiastes"),
    ("The Song of Solomon", "Song of Solomon", "song-of-solomon"),
    ("The Book of the Prophet Isaiah", "Isaiah", "isaiah"),
    ("The Book of the Prophet Jeremiah", "Jeremiah", "jeremiah"),
    ("The Lamentations of Jeremiah", "Lamentations", "lamentations"),
    ("The Book of the Prophet Ezekiel", "Ezekiel", "ezekiel"),
    ("The Book of Daniel", "Daniel", "daniel"),
    ("Hosea", "Hosea", "hosea"),
    ("Joel", "Joel", "joel"),
    ("Amos", "Amos", "amos"),
    ("Obadiah", "Obadiah", "obadiah"),
    ("Jonah", "Jonah", "jonah"),
    ("Micah", "Micah", "micah"),
    ("Nahum", "Nahum", "nahum"),
    ("Habakkuk", "Habakkuk", "habakkuk"),
    ("Zephaniah", "Zephaniah", "zephaniah"),
    ("Haggai", "Haggai", "haggai"),
    ("Zechariah", "Zechariah", "zechariah"),
    ("Malachi", "Malachi", "malachi"),
    ("The Gospel According to Saint Matthew", "Matthew", "matthew"),
    ("The Gospel According to Saint Mark", "Mark", "mark"),
    ("The Gospel According to Saint Luke", "Luke", "luke"),
    ("The Gospel According to Saint John", "John", "john"),
    ("The Acts of the Apostles", "Acts", "acts"),
    ("The Epistle of Paul the Apostle to the Romans", "Romans", "romans"),
    ("The First Epistle of Paul the Apostle to the Corinthians", "1 Corinthians", "1-corinthians"),
    ("The Second Epistle of Paul the Apostle to the Corinthians", "2 Corinthians", "2-corinthians"),
    ("The Epistle of Paul the Apostle to the Galatians", "Galatians", "galatians"),
    ("The Epistle of Paul the Apostle to the Ephesians", "Ephesians", "ephesians"),
    ("The Epistle of Paul the Apostle to the Philippians", "Philippians", "philippians"),
    ("The Epistle of Paul the Apostle to the Colossians", "Colossians", "colossians"),
    ("The First Epistle of Paul the Apostle to the Thessalonians", "1 Thessalonians", "1-thessalonians"),
    ("The Second Epistle of Paul the Apostle to the Thessalonians", "2 Thessalonians", "2-thessalonians"),
    ("The First Epistle of Paul the Apostle to Timothy", "1 Timothy", "1-timothy"),
    ("The Second Epistle of Paul the Apostle to Timothy", "2 Timothy", "2-timothy"),
    ("The Epistle of Paul the Apostle to Titus", "Titus", "titus"),
    ("The Epistle of Paul the Apostle to Philemon", "Philemon", "philemon"),
    ("The Epistle of Paul the Apostle to the Hebrews", "Hebrews", "hebrews"),
    ("The General Epistle of James", "James", "james"),
    ("The First Epistle General of Peter", "1 Peter", "1-peter"),
    ("The Second General Epistle of Peter", "2 Peter", "2-peter"),
    ("The First Epistle General of John", "1 John", "1-john"),
    ("The Second Epistle General of John", "2 John", "2-john"),
    ("The Third Epistle General of John", "3 John", "3-john"),
    ("The General Epistle of Jude", "Jude", "jude"),
    ("The Revelation of Saint John the Divine", "Revelation", "revelation"),
]
# Matches a "chapter:verse" marker such as "3:16"; group 1 is the chapter
# number and group 2 the verse number.
VERSE_MARKER = re.compile(r"\b(\d+):(\d+)\b")
START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***"
END_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***"


def _blank_line_precedes(body: str, line_start: int) -> bool:
    """Return True if the line before offset `line_start` is blank or absent."""
    if line_start == 0:
        return True
    prev_end = line_start - 1  # the "\n" that terminates the previous line
    prev_start = body.rfind("\n", 0, prev_end) + 1
    return not body[prev_start:prev_end].strip()


def _blank_line_follows(body: str, line_end: int) -> bool:
    """Return True if the line after offset `line_end` is blank or absent."""
    if line_end >= len(body):
        return True
    next_start = line_end + 1  # skip the "\n" sitting at line_end
    next_end = body.find("\n", next_start)
    if next_end < 0:
        next_end = len(body)
    return not body[next_start:next_end].strip()


def _find_heading(body: str, heading: str, after: int) -> int:
    """Find the next occurrence of `heading` as a section header in `body`.

    A match counts as a header only if, ignoring surrounding spaces, tabs and
    carriage returns, it is the sole content of its line AND the neighbouring
    lines on both sides are blank (or do not exist because the heading is the
    first/last line of `body`). This keeps mentions of the heading text inside
    running text from matching.

    The previous implementation counted newlines in a 6-character window on
    each side; that window always contained the heading line's own newline,
    so the blank-line requirement was never actually enforced, and a heading
    on the very first or last line was rejected.

    Args:
        body: Text to search.
        heading: Exact heading text to look for.
        after: Offset in `body` at which the search starts.

    Returns:
        Offset of the heading text in `body`, or -1 if no occurrence at or
        after `after` qualifies. An empty `heading` never matches.
    """
    if not heading:
        # str.find("") succeeds at every offset without advancing, which
        # would make the loop below spin forever.
        return -1
    pos = after
    while True:
        idx = body.find(heading, pos)
        if idx < 0:
            return -1
        line_start = body.rfind("\n", 0, idx) + 1
        line_end = body.find("\n", idx + len(heading))
        if line_end < 0:
            line_end = len(body)
        alone_on_line = body[line_start:line_end].strip(" \t\r") == heading
        if (
            alone_on_line
            and _blank_line_precedes(body, line_start)
            and _blank_line_follows(body, line_end)
        ):
            return idx
        pos = idx + len(heading)
def _slice_book(body: str, idx: int, cursor: int) -> tuple[str, int]:
    """Return ``(book_text, new_cursor)`` for the book at ``KJV_BOOKS[idx]``.

    The book's heading is searched for from `cursor` onwards. `book_text` is
    everything after that heading up to the next book's heading (or to the
    end of `body` for the last book); the heading itself is excluded.
    `new_cursor` is the offset of the next book's heading, or ``len(body)``
    for the last book.

    Raises:
        RuntimeError: if this book's heading, or the following book's
            heading, cannot be found.
    """
    heading = KJV_BOOKS[idx][0]
    start = _find_heading(body, heading, cursor)
    if start < 0:
        raise RuntimeError(f"KJV: heading not found: {heading!r}")
    after_heading = start + len(heading)
    if idx + 1 >= len(KJV_BOOKS):
        # Last book: it owns everything up to the end of the body.
        return body[after_heading:], len(body)
    nxt = _find_heading(body, KJV_BOOKS[idx + 1][0], after_heading)
    if nxt < 0:
        raise RuntimeError(f"KJV: next heading not found after {heading!r}")
    return body[after_heading:nxt], nxt


def _parse_kjv_book(book_text: str) -> list[tuple[int, int, str]]:
    """Split one book's text into ``[(chapter, verse, text), ...]``.

    Every ``VERSE_MARKER`` match starts a verse; the verse text runs from the
    end of that marker to the start of the next one (or to the end of the
    book), with each whitespace run collapsed to one space. Text before the
    first marker is ignored. Returns an empty list if no marker is found.
    """
    text = book_text.strip()
    matches = list(VERSE_MARKER.finditer(text))
    out: list[tuple[int, int, str]] = []
    for i, m in enumerate(matches):
        # NOTE(review): the final verse extends to the end of the book slice,
        # so any non-verse line between the last verse and the next book
        # heading would be folded into it — confirm against the source text.
        body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        verse_body = re.sub(r"\s+", " ", text[m.end():body_end]).strip()
        out.append((int(m.group(1)), int(m.group(2)), verse_body))
    return out


TOC_END_MARKER = "The Old Testament of the King James Version of the Bible"


def build_kjv() -> None:
    """Fetch the KJV source text, split it into verses and write the pack.

    The text obtained from ``_fetch(KJV_URL)`` is trimmed to the region
    between the second occurrence of ``TOC_END_MARKER`` and ``END_MARKER``,
    sliced book by book following ``KJV_BOOKS``, parsed into verses, and
    handed to ``_write_pack("bible-kjv", ...)`` together with pack metadata.

    Raises:
        RuntimeError: if the Gutenberg markers or the TOC/body boundary are
            missing, a book heading is not found, a book yields no verses,
            or two verses resolve to the same id.
    """
    print("[kjv] downloading...")
    raw = _fetch(KJV_URL)
    s = raw.find(START_MARKER)
    e = raw.find(END_MARKER)
    if s < 0 or e < 0:
        raise RuntimeError("KJV: PG markers not found")
    # PG #10 lists each book once in the TOC, then again as a section header
    # before the verses. The TOC sits between START_MARKER and the *second*
    # occurrence of TOC_END_MARKER ("The Old Testament..."). Skip past it.
    after_start = s + len(START_MARKER)
    first_old = raw.find(TOC_END_MARKER, after_start)
    second_old = raw.find(TOC_END_MARKER, first_old + 1) if first_old >= 0 else -1
    if second_old < 0:
        raise RuntimeError("KJV: failed to find TOC/body boundary")
    body = raw[second_old:e]
    verses: list[dict] = []
    seen_ids: set[str] = set()
    cursor = 0
    for idx, (_heading, display, slug) in enumerate(KJV_BOOKS):
        sect, cursor = _slice_book(body, idx, cursor)
        parsed = _parse_kjv_book(sect)
        if not parsed:
            raise RuntimeError(f"KJV: no verses parsed for {display!r}")
        for ch, vs, text in parsed:
            vid = f"bible.kjv.{slug}.{ch}.{vs}"
            # A repeated id means a marker was mis-detected somewhere; fail
            # the build rather than emit two rows under the same id.
            if vid in seen_ids:
                raise RuntimeError(f"KJV: duplicate verse id {vid!r}")
            seen_ids.add(vid)
            verses.append({
                "id": vid,
                "tradition": "bible",
                "lang": "en",
                "work": "KJV",
                "canonical_ref": {"book": display, "chapter": ch, "verse_start": vs},
                "text": text,
                "source": {
                    "provider": "Project Gutenberg eBook #10",
                    "license": "Public domain (United States)",
                    "attribution": "King James Version of the Bible, Project Gutenberg eBook #10",
                },
                "checksum_sha256": _sha256(text),
                "inclusion_mode": "bundled",
                "sensitivity": "sacred_exact_quote",
            })
    meta = {
        "tradition": "bible",
        "work": "KJV",
        "lang": "en",
        "provider": "Project Gutenberg eBook #10",
        "source_url": KJV_URL,
        "license": "Public domain (United States)",
        "attribution": "King James Version of the Bible, Project Gutenberg eBook #10",
        "edition_id": "pg10-kjv",
        "inclusion_mode": "bundled",
        "sensitivity": "sacred_exact_quote",
        # Slug → canonical book name. The slug is the third dot-separated
        # segment of a verse id, right before the chapter
        # (e.g. bible.kjv.song-of-solomon.5.1 → "Song of Solomon").
        "books": {slug: display for (_, display, slug) in KJV_BOOKS},
    }
    _write_pack("bible-kjv", verses, meta)
# ---------- 道德经 ----------

DAO_START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK"
DAO_END_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK"
# Each chapter heading line in PG #7337 looks like "第一章", "第二章", ... "第八十一章".
DAO_CHAPTER_RE = re.compile(r"^第([一-鿿]+)章\s*$", re.MULTILINE)
# Single-glyph Chinese digits; "十" (ten) is handled separately by
# _parse_cn_numeral.
_DIGITS = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}


def _parse_cn_numeral(s: str) -> int:
    """Parse a Chinese numeral written with the digits in `_DIGITS` and "十".

    Handles a lone digit ("三" → 3), "十" (10), "十X" (10 + X), "X十" (X * 10)
    and "X十Y" (X * 10 + Y). Surrounding whitespace is ignored.

    Raises:
        ValueError: for anything else. Unknown glyphs next to "十" used to
            escape as KeyError, which the ``except ValueError`` in build_dao
            does not catch; they are now reported as ValueError as well.
    """
    s = s.strip()
    tens_part, ten, ones_part = s.partition("十")
    try:
        if ten:
            # A bare leading "十" means one ten; a missing tail means zero ones.
            tens = _DIGITS[tens_part] if tens_part else 1
            ones = _DIGITS[ones_part] if ones_part else 0
            return tens * 10 + ones
        if len(s) == 1:
            return _DIGITS[s]
    except KeyError:
        pass  # fall through to the uniform ValueError below
    raise ValueError(f"unknown CN numeral {s!r}")


def _t2s_dao(text: str) -> str:
    """Convert `text` with OpenCC's "t2s" configuration.

    If the ``opencc`` module cannot be imported, a warning is printed to
    stderr and `text` is returned unchanged.

    NOTE(review): build_dao labels its output "zh-Hans" regardless, so a
    build without OpenCC produces a pack whose text was not converted.
    """
    try:
        from opencc import OpenCC
    except ImportError:
        print("[dao] opencc-python-reimplemented not installed; skipping t->s conversion", file=sys.stderr)
        return text
    cc = OpenCC("t2s")
    return cc.convert(text)


def build_dao() -> None:
    """Fetch the 道德经 source text, split it into 81 chapters, write the pack.

    The text from ``_fetch(DAO_URL)`` is trimmed to the region between the
    Gutenberg START/END marker lines and segmented at every line matching
    ``DAO_CHAPTER_RE``. Each chapter becomes one record (verse 1 of that
    chapter) whose text has all whitespace removed and is passed through
    ``_t2s_dao``. The records and pack metadata go to
    ``_write_pack("dao-de-jing", ...)``.

    Raises:
        RuntimeError: if the markers are missing, fewer than 81 headings are
            found, the headings do not cover exactly chapters 1..81, or a
            chapter has no non-empty text.
    """
    print("[dao] downloading...")
    raw = _fetch(DAO_URL)
    s_idx = raw.find(DAO_START_MARKER)
    e_idx = raw.find(DAO_END_MARKER)
    if s_idx < 0 or e_idx < 0:
        raise RuntimeError("DAO: PG markers not found")
    # advance past the START line itself.
    line_end = raw.find("\n", s_idx)
    body = raw[line_end + 1 : e_idx]

    # Find chapter headings; segment by them.
    headings = list(DAO_CHAPTER_RE.finditer(body))
    if len(headings) < 81:
        raise RuntimeError(f"DAO: expected >=81 chapter headings, found {len(headings)}")

    # More than one heading may parse to the same chapter number, so collect
    # every chunk per chapter number here and choose among them below.
    # Headings that do not parse to 1..81 are skipped.
    chapters: dict[int, list[str]] = {}
    for i, m in enumerate(headings):
        cn = m.group(1)
        try:
            num = _parse_cn_numeral(cn)
        except ValueError:
            continue
        if num < 1 or num > 81:
            continue
        body_start = m.end()
        body_end = headings[i + 1].start() if i + 1 < len(headings) else len(body)
        # NOTE(review): the chunk runs up to the next heading match, so any
        # non-chapter line sitting between this chapter's text and the next
        # heading ends up inside the chunk — confirm against the built pack.
        chunk = body[body_start:body_end].strip()
        chapters.setdefault(num, []).append(chunk)

    if len(chapters) != 81:
        raise RuntimeError(f"DAO: parsed {len(chapters)} unique chapters, expected 81")

    verses: list[dict] = []
    for n in range(1, 82):
        candidates = [c for c in chapters[n] if c]
        if not candidates:
            raise RuntimeError(f"DAO: empty body for chapter {n}")
        # longest candidate wins.
        traditional = max(candidates, key=len)
        # Remove ALL whitespace (spaces and line breaks alike): the chapter
        # is stored as one unbroken run of characters.
        traditional = re.sub(r"\s+", "", traditional)
        simplified = _t2s_dao(traditional)
        vid = f"dao.daodejing.{n}.1"
        verses.append({
            "id": vid,
            "tradition": "dao",
            "lang": "zh-Hans",
            "work": "daodejing",
            "canonical_ref": {"chapter": n, "verse_start": 1},
            "display_ref": {"zh-Hans": f"道德经第{n}章", "en": f"Tao Te Ching, Chapter {n}"},
            "text": simplified,
            "source": {
                "provider": "Project Gutenberg eBook #7337",
                "license": "Public domain",
                "attribution": "《道德經》, Project Gutenberg eBook #7337 (produced by Ching-yi Chen). Simplified-Chinese rendering via OpenCC t2s.",
            },
            "checksum_sha256": _sha256(simplified),
            "inclusion_mode": "bundled",
            "sensitivity": "sacred_exact_quote",
        })
    meta = {
        "tradition": "dao",
        "work": "daodejing",
        "lang": "zh-Hans",
        "provider": "Project Gutenberg eBook #7337",
        "source_url": DAO_URL,
        "license": "Public domain",
        "attribution": "《道德經》, Project Gutenberg eBook #7337 (produced by Ching-yi Chen). Simplified-Chinese rendering via OpenCC t2s.",
        "edition_id": "pg7337-laozi-s",
        "inclusion_mode": "bundled",
        "sensitivity": "sacred_exact_quote",
        "transform": "OpenCC t2s (Traditional → Simplified)",
    }
    _write_pack("dao-de-jing", verses, meta)
# ---------- 心经 ----------
# The heart-sutra pack ships as a stub: inclusion_mode is "api_only" and no
# verses are bundled. CBETA's redistribution terms for the Xuanzang
# translation are non-trivial to audit at build time, and the reachable
# upstream sources don't reliably return the canonical text; issue #3 notes
# allow falling back to api-only mode when uncertain. The pack is still
# written (metadata only) so it can be surfaced, and a later change can
# vendor verses once the CBETA license review is complete.

def build_sutra() -> None:
    """Write the heart-sutra pack with metadata only and an empty verse list."""
    # NOTE(review): unlike the KJV and 道德经 metadata, this dict carries no
    # "sensitivity" key — confirm that is intended for a stub pack.
    stub_meta = {
        "tradition": "sutra",
        "work": "heart-sutra",
        "lang": "zh-Hans",
        "provider": "CBETA (pending license audit)",
        "source_url": "https://cbetaonline.dila.edu.tw/zh/T0251_001",
        "license": "See pack release notes",
        "attribution": "《般若波罗蜜多心经》, translated by Xuanzang (玄奘, Tang dynasty, c. 649 CE). Public domain text; CBETA digital edition has its own redistribution terms.",
        "edition_id": "xuanzang-heart-sutra",
        "inclusion_mode": "api_only",
        "note": "Stub pack: text not yet bundled. Issue #3 notes permit api-only fallback while CBETA terms are being reviewed.",
    }
    _write_pack("heart-sutra", [], stub_meta)
# ---------- entrypoint ----------

def main() -> int:
    """Build the packs named on the command line; build all three if none.

    Recognised targets are "kjv", "dao" and "sutra". Packs are always built
    in that order, whatever order the arguments are given in.

    Returns:
        0 when every requested pack was built, 2 when any argument is not a
        recognised target (nothing is built in that case). Previously an
        unknown or misspelled target was silently ignored and the script
        exited 0 without building anything.
    """
    known = ("kjv", "dao", "sutra")
    targets = sys.argv[1:] or list(known)
    unknown = [t for t in targets if t not in known]
    if unknown:
        print(
            f"unknown target(s): {', '.join(unknown)} (expected: {', '.join(known)})",
            file=sys.stderr,
        )
        return 2
    if "kjv" in targets:
        build_kjv()
    if "dao" in targets:
        build_dao()
    if "sutra" in targets:
        build_sutra()
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
ROOT = Path(__file__).resolve().parent.parent
PACKS_DIR = ROOT / "internal" / "packs"


def _sha256(s: str) -> str:
    """Return the hex SHA-256 digest of `s` encoded as UTF-8."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()


def verify_pack(pack_dir: Path) -> tuple[int, int]:
    """Verify one pack directory and return ``(rows_checked, failures)``.

    Reads ``metadata.json`` and ``verses.jsonl.gz`` from `pack_dir`. For every
    non-blank JSONL row the SHA-256 of the row's ``t`` value is recomputed
    and compared with the stored ``s`` value. If the metadata declares a
    non-negative ``verse_count``, it must equal the number of rows read.

    Each of the following counts as one failure and is reported on stderr:
    a missing or unreadable metadata/verses file, a row that is not valid
    JSON or lacks ``id``/``t``/``s``, a checksum mismatch, and a
    ``verse_count`` mismatch. Bad rows and unreadable files are counted
    rather than raised, so one damaged pack cannot abort the scan of the
    remaining packs.

    Only pack names, row ids, line numbers and hashes are printed — never
    the row text.
    """
    jsonl_gz = pack_dir / "verses.jsonl.gz"
    meta_path = pack_dir / "metadata.json"
    if not meta_path.exists():
        print(f"  [{pack_dir.name}] MISSING metadata.json", file=sys.stderr)
        return 0, 1
    if not jsonl_gz.exists():
        print(f"  [{pack_dir.name}] MISSING verses.jsonl.gz", file=sys.stderr)
        return 0, 1
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
        declared_count = meta.get("verse_count", -1)
    except (OSError, ValueError, AttributeError):
        # Unreadable file, invalid JSON/UTF-8, or a top-level non-object.
        print(f"  [{pack_dir.name}] UNREADABLE metadata.json", file=sys.stderr)
        return 0, 1

    n = 0
    bad = 0
    try:
        with gzip.open(jsonl_gz, "rb") as f:
            for lineno, raw in enumerate(f, start=1):
                if not raw.strip():
                    continue
                n += 1
                try:
                    row = json.loads(raw.decode("utf-8"))
                    vid, stored = row["id"], row["s"]
                    actual = _sha256(row["t"])
                except (ValueError, KeyError, TypeError, AttributeError):
                    # Identify the row by line number only; its content may
                    # be verse text and must not reach the log.
                    print(f"  [{pack_dir.name}] MALFORMED row at line {lineno}", file=sys.stderr)
                    bad += 1
                    continue
                if actual != stored:
                    # Print only ids and hashes — never row["t"].
                    print(f"  [{pack_dir.name}] MISMATCH id={vid}", file=sys.stderr)
                    print(f"    stored:     {stored}", file=sys.stderr)
                    print(f"    recomputed: {actual}", file=sys.stderr)
                    bad += 1
    except (OSError, EOFError):
        # Not a gzip file, or a truncated stream.
        print(f"  [{pack_dir.name}] UNREADABLE verses.jsonl.gz", file=sys.stderr)
        bad += 1
    if declared_count >= 0 and declared_count != n:
        print(
            f"  [{pack_dir.name}] verse_count mismatch: metadata={declared_count} jsonl={n}",
            file=sys.stderr,
        )
        bad += 1
    print(f"  [{pack_dir.name}] verses={n} mismatches={bad}")
    return n, bad


def main() -> int:
    """Verify every directory under ``PACKS_DIR``.

    Returns 0 if no pack reported a failure, 1 otherwise (including when
    ``PACKS_DIR`` does not exist).
    """
    if not PACKS_DIR.exists():
        print(f"no packs at {PACKS_DIR}", file=sys.stderr)
        return 1
    total = 0
    bad = 0
    pack_dirs = sorted(p for p in PACKS_DIR.iterdir() if p.is_dir())
    print(f"verify_quotes: scanning {len(pack_dirs)} pack(s)")
    for pd in pack_dirs:
        n, b = verify_pack(pd)
        total += n
        bad += b
    print(f"verify_quotes: total verses={total} mismatches={bad}")
    return 0 if bad == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())