diff --git a/CHANGELOG.md b/CHANGELOG.md index 24429e1..bbf0eb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## Unreleased + +- Adds bilingual bundled packs for Bible, Dao, Quran, and Heart Sutra. +- Infers English versus Chinese edition from the matched reference alias. +- Preserves per-language display references in compact pack rows. + ## v0.1.0 - 2026-05-03 Initial release of Verse-Driven Development. diff --git a/CLAUDE.md b/CLAUDE.md index 2e96e4d..a8e8cf8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -55,8 +55,13 @@ internal/schema/ Verse struct + JSON Schema (the contract every pack obey internal/resolver/ free-form reference parser ("John 3:16", "道德经 11", ...) internal/packs/ embed.FS-backed pack data + registry bible-kjv/ + bible-cuv-s/ dao-de-jing/ + dao-legge/ heart-sutra/ + heart-sutra-en/ + quran-pickthall/ + quran-majian/ internal/mcp/ stdio MCP server (issue #4) internal/cli/ CLI subcommands (issue #4) internal/injector/ inject-once envelope helpers (issues #5/#6) diff --git a/Makefile b/Makefile index b70b661..3c92df7 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,9 @@ STATICCHECK_VERSION := v0.6.1 all: lint verify-packs test build -# Rebuild the bundled packs from upstream sources (KJV from Project -# Gutenberg, 道德经 from Project Gutenberg). Run after upstream regenerations -# or whenever the JSONL format changes. Requires Python 3.11+ and -# opencc-python-reimplemented for the dao pack. +# Rebuild bundled packs from upstream sources. Run after upstream +# regenerations or whenever the JSONL format changes. Requires Python 3.11+; +# opencc-python-reimplemented is required for the zh-Hans Dao/Sutra targets. packs: python3 scripts/build_packs.py diff --git a/README.md b/README.md index e231c50..d5fb001 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,8 @@ stdio MCP, CLI commands, and agent hooks. Current v0.1.0 status: - Released binaries for macOS arm64, macOS x86_64, and Linux x86_64. 
-- Bundled packs: KJV Bible, 道德经, and 心经. +- Bundled packs: Bible (KJV + CUV-S), 道德经 (Chinese + Legge English), + 心经 (Chinese + English), and Quran (Pickthall English + Ma Jian Chinese). - Adapter support: Claude Code and Codex. - Safety gate: one-turn injection lifecycle tests and coding-quality benchmark passed for v0.1.0. @@ -80,8 +81,12 @@ Claude Code slash commands: ```text /bible John 3:13 /bible 约翰福音 3:16 +/dao 11 /dao 第十一章 /sutra 心经 +/sutra Heart Sutra +/quran 2:255 +/quran 古兰经 2:255 ``` Codex inline markers: @@ -96,7 +101,11 @@ Direct CLI lookup: ```bash scripture-mcp lookup "John 3:13" --format=json +scripture-mcp lookup "约翰福音 3:16" --format=json +scripture-mcp lookup "dao 11" --format=text scripture-mcp lookup "道德经第十一章" --format=text +scripture-mcp lookup "Quran 2:255" --format=text +scripture-mcp lookup "古兰经 2:255" --format=text scripture-mcp recap --terminal scripture-mcp recap --learning --first-letter ``` @@ -229,10 +238,13 @@ Codex transcript. | Pack | Source | State | |---|---|---| | KJV Bible | [Project Gutenberg eBook #10](https://www.gutenberg.org/ebooks/10) | Bundled, 31,102 verses | +| Chinese Union Version, Simplified | [open-bibles](https://github.com/seven1m/open-bibles) | Bundled, 31,100 verses | | 道德经 | [Project Gutenberg eBook #7337](https://www.gutenberg.org/ebooks/7337) | Bundled, 81 chapters | +| Tao Te Ching, Legge English | [Internet Classics Archive](https://classics.mit.edu/Lao/taote.html) | Bundled, 81 chapters | | 心经 | [CBETA XML P5 T0251](https://cbetaonline.dila.edu.tw/zh/T0251_001) | Bundled, 1 complete text | -| Quran | planned | Resolver only; no bundled text | -| 中文圣经 | planned | Needs licensing work | +| Heart Sutra, English | [Wikisource](https://en.wikisource.org/wiki/Translation:Shorter_Praj%C3%B1%C4%81p%C4%81ramit%C4%81_H%E1%B9%9Bdaya_S%C5%ABtra) | Bundled, 1 complete text | +| Quran, Pickthall English | [Tanzil](https://tanzil.net/trans/) | Bundled, 6,236 ayat; non-commercial translation terms | +| Quran, Ma Jian 
Chinese | [Tanzil](https://tanzil.net/trans/) | Bundled, 6,236 ayat; non-commercial translation terms | Each bundled entry stores a SHA-256 checksum over the text bytes. CI verifies that pack text and checksum metadata stay in sync. @@ -278,8 +290,8 @@ See [`CHANGELOG.md`](./CHANGELOG.md) for release notes. Useful next work: -- Quran pack with clear source provenance and attribution. -- Chinese Bible pack research. +- Broader translation-source licensing review. +- Optional explicit `--lang` selection for recap/random workflows. - Homebrew formula. - Release workflow automation. - Additional lifecycle probes for other MCP-compatible agents. diff --git a/docs/sources.md b/docs/sources.md new file mode 100644 index 0000000..5b015d8 --- /dev/null +++ b/docs/sources.md @@ -0,0 +1,40 @@ +# Bundled Text Sources + +This file records the upstream sources, terms, and attribution metadata for +the scripture packs bundled into `scripture-mcp`. It intentionally contains no +scripture passage bodies. + +Each pack also stores the same source metadata in +`internal/packs/<pack>/metadata.json`, and every bundled row stores a +SHA-256 checksum over its text bytes. 
+ +## Source Matrix + +| Pack | Tradition | Work | Language | Source | Terms | Attribution | +|---|---|---|---|---|---|---| +| `bible-kjv` | Bible | KJV | English | [Project Gutenberg eBook #10](https://www.gutenberg.org/cache/epub/10/pg10.txt) | Public domain (United States) | King James Version of the Bible, Project Gutenberg eBook #10 | +| `bible-cuv-s` | Bible | CUV-S | Simplified Chinese | [open-bibles USFX](https://raw.githubusercontent.com/seven1m/open-bibles/master/chi-cuv-simp.usfx.xml) | Public domain | Chinese Union Version (Simplified), open-bibles USFX | +| `dao-de-jing` | Dao | daodejing | Simplified Chinese | [Project Gutenberg eBook #7337](https://www.gutenberg.org/cache/epub/7337/pg7337.txt) | Public domain | `道德經`, Project Gutenberg eBook #7337, produced by Ching-yi Chen | +| `dao-legge` | Dao | legge | English | [Internet Classics Archive](https://classics.mit.edu/Lao/taote.mb.txt) | Public domain source text | Tao Te Ching, translated by James Legge (1891), Internet Classics Archive text | +| `heart-sutra` | Sutra | heart-sutra | Simplified Chinese | [CBETA XML P5 T0251](https://cbetaonline.dila.edu.tw/zh/T0251_001) | Ancient source text; CBETA digital edition terms apply | `般若波罗蜜多心经`, translated by Xuanzang, CBETA XML P5 T0251 | +| `heart-sutra-en` | Sutra | heart-sutra-en | English | [Wikisource raw page](https://en.wikisource.org/w/index.php?title=Translation:Shorter_Praj%C3%B1%C4%81p%C4%81ramit%C4%81_H%E1%B9%9Bdaya_S%C5%ABtra&action=raw) | Creative Commons Attribution-ShareAlike | Shorter Prajnaparamita Hrdaya Sutra, Wikisource translation | +| `quran-pickthall` | Quran | pickthall | English | [Tanzil `en.pickthall`](https://tanzil.net/trans/en.pickthall) | Tanzil translation terms: non-commercial use with attribution | Quran English translation by Mohammed Marmaduke William Pickthall, Tanzil | +| `quran-majian` | Quran | majian | Simplified Chinese | [Tanzil `zh.jian`](https://tanzil.net/trans/zh.jian) | Tanzil translation terms: 
non-commercial use with attribution | Quran Chinese translation by Ma Jian, Tanzil | + +## Transform Notes + +| Pack | Transform | +|---|---| +| `dao-de-jing` | Traditional Chinese source normalized to Simplified Chinese with OpenCC `t2s`. | +| `heart-sutra` | CBETA XML P5 body extraction, then OpenCC `t2s`. | +| `heart-sutra-en` | Wikisource raw wiki markup cleaned to the translation body. | +| `bible-cuv-s` | USFX verse markers parsed into compact JSONL rows. | +| `quran-pickthall` / `quran-majian` | Tanzil pipe-delimited translation rows parsed into compact JSONL rows. | + +## Cautions + +- Quran translation packs are not public-domain packs; retain Tanzil + attribution and non-commercial translation terms in releases. +- CBETA-derived text should retain the CBETA attribution and terms note. +- Do not paste passage bodies into docs, logs, PR descriptions, or chat output; + cite pack IDs, references, checksums, and source metadata instead. diff --git a/internal/cli/lookup.go b/internal/cli/lookup.go index 94a8aee..85941ef 100644 --- a/internal/cli/lookup.go +++ b/internal/cli/lookup.go @@ -227,28 +227,18 @@ func resolveTrailing(tradition, rest string) (schema.Verse, error) { return schema.Verse{}, lastErr } cur := strings.TrimSpace(rest) - candidate := func() string { - switch tradition { - case "bible": - return cur - default: // dao, quran - if cur == "" { - return tradition - } - return tradition + " " + cur - } - } var lastErr error for { - c := candidate() - if c == "" { + if cur == "" { break } - v, err := resolveAndLookup(c) - if err == nil { - return v, nil + for _, c := range markerCandidates(tradition, cur) { + v, err := resolveAndLookup(c) + if err == nil { + return v, nil + } + lastErr = err } - lastErr = err idx := strings.LastIndexAny(cur, " \t") if idx < 0 { break @@ -261,6 +251,19 @@ func resolveTrailing(tradition, rest string) (schema.Verse, error) { return schema.Verse{}, lastErr } +func markerCandidates(tradition, cur string) []string { + 
switch tradition { + case "bible": + return []string{cur} + case "dao", "quran": + // First try the raw ref so slash markers can carry a language-specific + // alias such as "/dao 道德经第十一章" or "/quran 古兰经 2:255". + return []string{cur, tradition + " " + cur} + default: + return []string{cur} + } +} + // scanMarker extracts (tradition, ref) from the leftmost marker in prompt. // Returns (_, _, false) when no marker is present. func scanMarker(prompt string) (tradition, ref string, ok bool) { diff --git a/internal/cli/lookup_test.go b/internal/cli/lookup_test.go index 7dd58e3..9ac249b 100644 --- a/internal/cli/lookup_test.go +++ b/internal/cli/lookup_test.go @@ -100,6 +100,37 @@ func TestRunLookupSutraBundled(t *testing.T) { } } +func TestRunLookupBilingualPacks(t *testing.T) { + cases := []struct { + name string + ref string + id string + lang string + }{ + {"bible_zh", "约翰福音 3:16", "bible.cuv-s.john.3.16", "zh-Hans"}, + {"dao_en", "dao 11", "dao.legge.11.1", "en"}, + {"dao_zh", "道德经 11", "dao.daodejing.11.1", "zh-Hans"}, + {"sutra_en", "Heart Sutra", "sutra.heart-sutra-en.1", "en"}, + {"quran_en", "Quran 2:255", "quran.pickthall.2.255", "en"}, + {"quran_zh", "古兰经 2:255", "quran.majian.2.255", "zh-Hans"}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var out, errBuf bytes.Buffer + if code := runLookup([]string{tc.ref}, Streams{Out: &out, Err: &errBuf}); code != 0 { + t.Fatalf("exit %d, stderr=%q", code, errBuf.String()) + } + var v schema.Verse + if err := json.Unmarshal(out.Bytes(), &v); err != nil { + t.Fatalf("output is not JSON Verse: %v", err) + } + if v.ID != tc.id || v.Lang != tc.lang { + t.Errorf("lookup %q got id=%q lang=%q; want id=%q lang=%q", tc.ref, v.ID, v.Lang, tc.id, tc.lang) + } + }) + } +} + func TestLookupFromPromptSlashMarker(t *testing.T) { in := strings.NewReader("/bible John 3:16 Refactor the cron-string scheduler.") var out, errBuf bytes.Buffer @@ -141,8 +172,27 @@ func TestLookupFromPromptInlineMarker(t *testing.T) 
{ if err := json.Unmarshal(out.Bytes(), &resp); err != nil { t.Fatalf("output not JSON: %v\n%s", err, out.String()) } + if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "Tao Te Ching") { + t.Errorf("dao envelope missing English display ref") + } +} + +func TestLookupFromPromptInlineMarkerChineseDao(t *testing.T) { + in := strings.NewReader("Please [[dao:道德经第十一章]] keep going on the helper.") + var out bytes.Buffer + if code := runLookupFromPrompt(nil, Streams{In: in, Out: &out, Err: &bytes.Buffer{}}); code != 0 { + t.Fatalf("exit %d", code) + } + var resp struct { + HookSpecificOutput struct { + AdditionalContext string `json:"additionalContext"` + } `json:"hookSpecificOutput"` + } + if err := json.Unmarshal(out.Bytes(), &resp); err != nil { + t.Fatalf("output not JSON: %v\n%s", err, out.String()) + } if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "道德经") { - t.Errorf("dao envelope missing display ref:\n%s", resp.HookSpecificOutput.AdditionalContext) + t.Errorf("dao envelope missing Chinese display ref") } } @@ -165,8 +215,8 @@ func TestLookupFromPromptDollarDaoAlias(t *testing.T) { t.Errorf("hookSpecificOutput hookEventName = %q, want UserPromptSubmit", resp.HookSpecificOutput.HookEventName) } - if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "道德经") { - t.Errorf("dao envelope missing display ref:\n%s", resp.HookSpecificOutput.AdditionalContext) + if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "Tao Te Ching") { + t.Errorf("dao envelope missing English display ref") } } @@ -184,8 +234,8 @@ func TestLookupFromPromptDollarDaoDotAlias(t *testing.T) { if err := json.Unmarshal(out.Bytes(), &resp); err != nil { t.Fatalf("output not JSON: %v\n%s", err, out.String()) } - if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "道德经") { - t.Errorf("dao envelope missing display ref:\n%s", resp.HookSpecificOutput.AdditionalContext) + if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "Tao Te 
Ching") { + t.Errorf("dao envelope missing English display ref") } } @@ -338,8 +388,8 @@ func TestLookupFromPromptHookEventFlag(t *testing.T) { t.Errorf("hookSpecificOutput hookEventName = %q, want UserPromptSubmit", resp.HookSpecificOutput.HookEventName) } - if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "道德经") { - t.Errorf("dao envelope missing display ref:\n%s", resp.HookSpecificOutput.AdditionalContext) + if !strings.Contains(resp.HookSpecificOutput.AdditionalContext, "Tao Te Ching") { + t.Errorf("dao envelope missing English display ref") } var raw map[string]any if err := json.Unmarshal(out.Bytes(), &raw); err != nil { diff --git a/internal/cli/recap_test.go b/internal/cli/recap_test.go index 13c9a18..f23a0ce 100644 --- a/internal/cli/recap_test.go +++ b/internal/cli/recap_test.go @@ -54,8 +54,8 @@ func TestRecapBibleHasAttribution(t *testing.T) { if !strings.Contains(s, "📖") { t.Errorf("recap output missing scripture marker: %s", s) } - if !strings.Contains(s, "King James Version") { - t.Errorf("bible recap missing KJV attribution: %s", s) + if !strings.Contains(s, "King James Version") && !strings.Contains(s, "Chinese Union Version") { + t.Errorf("bible recap missing known Bible attribution") } } diff --git a/internal/injector/envelope.go b/internal/injector/envelope.go index ed1c3aa..c3b243e 100644 --- a/internal/injector/envelope.go +++ b/internal/injector/envelope.go @@ -45,13 +45,21 @@ func Envelope(v schema.Verse) string { } // DisplayRef formats a verse's canonical reference for human display. -// Prefers DisplayRef["en"] when set; falls back to a tradition-specific -// rendering otherwise. +// Prefers the verse language's DisplayRef when set, then English, then +// Simplified Chinese, and finally a tradition-specific rendering. 
func DisplayRef(v schema.Verse) string { if v.DisplayRef != nil { + if v.Lang != "" { + if s, ok := v.DisplayRef[v.Lang]; ok && s != "" { + return s + } + } if s, ok := v.DisplayRef["en"]; ok && s != "" { return s } + if s, ok := v.DisplayRef["zh-Hans"]; ok && s != "" { + return s + } } switch v.Tradition { case "bible": diff --git a/internal/lifecycle/lifecycle_test.go b/internal/lifecycle/lifecycle_test.go index 8995511..315d4de 100644 --- a/internal/lifecycle/lifecycle_test.go +++ b/internal/lifecycle/lifecycle_test.go @@ -220,10 +220,13 @@ func TestSlashMarkerTraditionsLifecycle(t *testing.T) { ref string }{ {"/bible John 3:16", "John 3:16"}, - {"/dao 11", "道德经 11"}, - // sutra and quran ship api_only in v0.1, so the hook soft-fails - // to no-envelope; the lifecycle invariant is still preserved - // (no envelope = no leak possible) but there's nothing to leak. + {"/bible 约翰福音 3:16", "约翰福音 3:16"}, + {"/dao 11", "dao 11"}, + {"/dao 道德经第十一章", "道德经 11"}, + {"/sutra Heart Sutra", "Heart Sutra"}, + {"/sutra 心经", "心经"}, + {"/quran 2:255", "Quran 2:255"}, + {"/quran 古兰经 2:255", "古兰经 2:255"}, } for _, tc := range cases { t.Run(tc.marker, func(t *testing.T) { diff --git a/internal/mcp/server.go b/internal/mcp/server.go index 954c178..07433a9 100644 --- a/internal/mcp/server.go +++ b/internal/mcp/server.go @@ -294,9 +294,6 @@ func (s *Server) lookupByRef(ref string) (schema.Verse, error) { } v, ok := s.registry.Lookup(id) if !ok { - if r.Tradition == resolver.TraditionSutra || r.Tradition == resolver.TraditionQuran { - return schema.Verse{}, fmt.Errorf("%w: %s", packs.ErrNotBundled, r.Tradition) - } return schema.Verse{}, fmt.Errorf("verse not found: %s", id) } return v, nil @@ -444,4 +441,3 @@ func textContent(s string) map[string]any { "content": []map[string]any{{"type": "text", "text": s}}, } } - diff --git a/internal/packs/bible-cuv-s/metadata.json b/internal/packs/bible-cuv-s/metadata.json new file mode 100644 index 0000000..b60fa08 --- /dev/null +++ 
b/internal/packs/bible-cuv-s/metadata.json @@ -0,0 +1,82 @@ +{ + "attribution": "Chinese Union Version (Simplified), open-bibles USFX", + "books": { + "1-chronicles": "1 Chronicles", + "1-corinthians": "1 Corinthians", + "1-john": "1 John", + "1-kings": "1 Kings", + "1-peter": "1 Peter", + "1-samuel": "1 Samuel", + "1-thessalonians": "1 Thessalonians", + "1-timothy": "1 Timothy", + "2-chronicles": "2 Chronicles", + "2-corinthians": "2 Corinthians", + "2-john": "2 John", + "2-kings": "2 Kings", + "2-peter": "2 Peter", + "2-samuel": "2 Samuel", + "2-thessalonians": "2 Thessalonians", + "2-timothy": "2 Timothy", + "3-john": "3 John", + "acts": "Acts", + "amos": "Amos", + "colossians": "Colossians", + "daniel": "Daniel", + "deuteronomy": "Deuteronomy", + "ecclesiastes": "Ecclesiastes", + "ephesians": "Ephesians", + "esther": "Esther", + "exodus": "Exodus", + "ezekiel": "Ezekiel", + "ezra": "Ezra", + "galatians": "Galatians", + "genesis": "Genesis", + "habakkuk": "Habakkuk", + "haggai": "Haggai", + "hebrews": "Hebrews", + "hosea": "Hosea", + "isaiah": "Isaiah", + "james": "James", + "jeremiah": "Jeremiah", + "job": "Job", + "joel": "Joel", + "john": "John", + "jonah": "Jonah", + "joshua": "Joshua", + "jude": "Jude", + "judges": "Judges", + "lamentations": "Lamentations", + "leviticus": "Leviticus", + "luke": "Luke", + "malachi": "Malachi", + "mark": "Mark", + "matthew": "Matthew", + "micah": "Micah", + "nahum": "Nahum", + "nehemiah": "Nehemiah", + "numbers": "Numbers", + "obadiah": "Obadiah", + "philemon": "Philemon", + "philippians": "Philippians", + "proverbs": "Proverbs", + "psalms": "Psalms", + "revelation": "Revelation", + "romans": "Romans", + "ruth": "Ruth", + "song-of-solomon": "Song of Solomon", + "titus": "Titus", + "zechariah": "Zechariah", + "zephaniah": "Zephaniah" + }, + "build_date": "2026-05-04", + "edition_id": "open-bibles-cuv-simp", + "inclusion_mode": "bundled", + "lang": "zh-Hans", + "license": "Public domain", + "provider": "open-bibles", + 
"sensitivity": "sacred_exact_quote", + "source_url": "https://raw.githubusercontent.com/seven1m/open-bibles/master/chi-cuv-simp.usfx.xml", + "tradition": "bible", + "verse_count": 31100, + "work": "CUV-S" +} diff --git a/internal/packs/bible-cuv-s/verses.jsonl.gz b/internal/packs/bible-cuv-s/verses.jsonl.gz new file mode 100644 index 0000000..7f4a3a4 Binary files /dev/null and b/internal/packs/bible-cuv-s/verses.jsonl.gz differ diff --git a/internal/packs/dao-legge/metadata.json b/internal/packs/dao-legge/metadata.json new file mode 100644 index 0000000..adaff91 --- /dev/null +++ b/internal/packs/dao-legge/metadata.json @@ -0,0 +1,14 @@ +{ + "attribution": "Tao Te Ching, translated by James Legge (1891), Internet Classics Archive text", + "build_date": "2026-05-04", + "edition_id": "legge-1891", + "inclusion_mode": "bundled", + "lang": "en", + "license": "Public domain source text", + "provider": "Internet Classics Archive", + "sensitivity": "sacred_exact_quote", + "source_url": "https://classics.mit.edu/Lao/taote.mb.txt", + "tradition": "dao", + "verse_count": 81, + "work": "legge" +} diff --git a/internal/packs/dao-legge/verses.jsonl.gz b/internal/packs/dao-legge/verses.jsonl.gz new file mode 100644 index 0000000..47624b7 Binary files /dev/null and b/internal/packs/dao-legge/verses.jsonl.gz differ diff --git a/internal/packs/heart-sutra-en/metadata.json b/internal/packs/heart-sutra-en/metadata.json new file mode 100644 index 0000000..82643e9 --- /dev/null +++ b/internal/packs/heart-sutra-en/metadata.json @@ -0,0 +1,15 @@ +{ + "attribution": "Shorter Prajnaparamita Hrdaya Sutra, Wikisource translation", + "build_date": "2026-05-04", + "edition_id": "wikisource-heart-sutra-en", + "inclusion_mode": "bundled", + "lang": "en", + "license": "Creative Commons Attribution-ShareAlike", + "note": "Bundled with source attribution; this is an English translation, not the Xuanzang Chinese text.", + "provider": "Wikisource", + "sensitivity": "sacred_exact_quote", + 
"source_url": "https://en.wikisource.org/w/index.php?title=Translation:Shorter_Praj%C3%B1%C4%81p%C4%81ramit%C4%81_H%E1%B9%9Bdaya_S%C5%ABtra&action=raw", + "tradition": "sutra", + "verse_count": 1, + "work": "heart-sutra-en" +} diff --git a/internal/packs/heart-sutra-en/verses.jsonl.gz b/internal/packs/heart-sutra-en/verses.jsonl.gz new file mode 100644 index 0000000..e748e9b Binary files /dev/null and b/internal/packs/heart-sutra-en/verses.jsonl.gz differ diff --git a/internal/packs/lookup.go b/internal/packs/lookup.go index c4063bf..a06a77d 100644 --- a/internal/packs/lookup.go +++ b/internal/packs/lookup.go @@ -10,8 +10,8 @@ import ( ) // ErrNotBundled signals the requested reference resolved to a tradition -// that is shipped api-only in this build (heart-sutra, quran). Callers -// can distinguish this from a hard "id not in pack" miss with errors.Is. +// that is shipped api-only in this build. Callers can distinguish this +// from a hard "id not in pack" miss with errors.Is. var ErrNotBundled = errors.New("packs: verse not bundled in this build") // LookupReference maps a parsed resolver.Reference to a pack verse id and @@ -24,9 +24,6 @@ func LookupReference(r resolver.Reference) (schema.Verse, error) { } v, ok := All().Lookup(id) if !ok { - if r.Tradition == resolver.TraditionSutra || r.Tradition == resolver.TraditionQuran { - return schema.Verse{}, fmt.Errorf("%w: %s", ErrNotBundled, r.Tradition) - } return schema.Verse{}, fmt.Errorf("verse not found: %s", id) } return v, nil @@ -46,7 +43,11 @@ func ReferenceID(r resolver.Reference) (string, error) { if r.Chapter < 1 || verse < 1 { return "", fmt.Errorf("bible reference must include chapter and verse") } - return fmt.Sprintf("bible.kjv.%s.%d.%d", bookSlug(r.Book), r.Chapter, verse), nil + work := "kjv" + if r.Work == resolver.WorkCUVS || r.Lang == "zh-Hans" { + work = "cuv-s" + } + return fmt.Sprintf("bible.%s.%s.%d.%d", work, bookSlug(r.Book), r.Chapter, verse), nil case resolver.TraditionDao: if r.Chapter < 
1 { return "", fmt.Errorf("dao reference missing chapter") @@ -54,17 +55,29 @@ func ReferenceID(r resolver.Reference) (string, error) { if verse < 1 { verse = 1 } - return fmt.Sprintf("dao.daodejing.%d.%d", r.Chapter, verse), nil + work := "daodejing" + if r.Work == resolver.WorkDaoLegge || r.Lang == "en" { + work = "legge" + } + return fmt.Sprintf("dao.%s.%d.%d", work, r.Chapter, verse), nil case resolver.TraditionSutra: if verse < 1 { verse = 1 } - return fmt.Sprintf("sutra.heart-sutra.%d", verse), nil + work := "heart-sutra" + if r.Work == resolver.WorkHeartSutraEn || r.Lang == "en" { + work = "heart-sutra-en" + } + return fmt.Sprintf("sutra.%s.%d", work, verse), nil case resolver.TraditionQuran: if r.Chapter < 1 || verse < 1 { return "", fmt.Errorf("quran reference must include surah and verse") } - return fmt.Sprintf("quran.quran.%d.%d", r.Chapter, verse), nil + work := "pickthall" + if r.Work == resolver.WorkQuranMajian || r.Lang == "zh-Hans" { + work = "majian" + } + return fmt.Sprintf("quran.%s.%d.%d", work, r.Chapter, verse), nil } return "", fmt.Errorf("unsupported tradition: %s", r.Tradition) } diff --git a/internal/packs/lookup_test.go b/internal/packs/lookup_test.go index d68e244..8d93150 100644 --- a/internal/packs/lookup_test.go +++ b/internal/packs/lookup_test.go @@ -39,9 +39,14 @@ func TestReferenceID(t *testing.T) { want: "sutra.heart-sutra.1", }, { - name: "quran follows surah:verse", - ref: resolver.Reference{Tradition: "quran", Work: "quran", Chapter: 2, VerseStart: 255}, - want: "quran.quran.2.255", + name: "quran English follows surah:verse", + ref: resolver.Reference{Tradition: "quran", Work: resolver.WorkQuranPickthall, Chapter: 2, VerseStart: 255}, + want: "quran.pickthall.2.255", + }, + { + name: "quran Chinese follows surah:verse", + ref: resolver.Reference{Tradition: "quran", Work: resolver.WorkQuranMajian, Chapter: 2, VerseStart: 255}, + want: "quran.majian.2.255", }, } for _, c := range cases { @@ -118,10 +123,18 @@ func 
TestLookupReferenceSutraFound(t *testing.T) { } } -func TestLookupReferenceQuranNotBundled(t *testing.T) { - _, err := LookupReference(resolver.Reference{Tradition: "quran", Work: "quran", Chapter: 2, VerseStart: 255}) - if !errors.Is(err, ErrNotBundled) { - t.Errorf("got %v, want ErrNotBundled", err) +func TestLookupReferenceQuranFound(t *testing.T) { + v, err := LookupReference(resolver.Reference{ + Tradition: "quran", + Work: resolver.WorkQuranPickthall, + Chapter: 2, + VerseStart: 255, + }) + if err != nil { + t.Fatalf("LookupReference err: %v", err) + } + if v.ID != "quran.pickthall.2.255" { + t.Errorf("got id %q", v.ID) } } diff --git a/internal/packs/packs.go b/internal/packs/packs.go index 0285b08..a168085 100644 --- a/internal/packs/packs.go +++ b/internal/packs/packs.go @@ -1,4 +1,4 @@ -// Package packs holds embedded verse data (KJV, 道德经, 心经, ...). +// Package packs holds embedded verse data (Bible, Dao, Quran, 心经, ...). // // On import, init() decompresses each pack's verses.jsonl.gz, materializes // schema.Verse values from compact JSONL rows + metadata.json, and indexes @@ -9,7 +9,8 @@ // {"id":"bible.kjv.john.3.16","c":3,"v":16,"t":"...","s":""} // // Optional fields: "ve" (verse_end), "b" (book display name; defaults to -// metadata.books[] for multi-book traditions). Pack-shared fields +// metadata.books[] for multi-book traditions), and "d" (per-language +// display_ref strings). Pack-shared fields // (tradition, work, lang, source.*, inclusion_mode, sensitivity) live in // metadata.json so the JSONL stays small enough to fit the 6 MB budget. package packs @@ -28,16 +29,21 @@ import ( "github.com/MiaoDX/verse-driven/internal/schema" ) -//go:embed all:bible-kjv all:dao-de-jing all:heart-sutra +//go:embed all:bible-kjv all:bible-cuv-s all:dao-de-jing all:dao-legge all:heart-sutra all:heart-sutra-en all:quran-pickthall all:quran-majian var fs embed.FS // PackName identifies an embedded pack on disk. 
type PackName string const ( - PackBibleKJV PackName = "bible-kjv" - PackDaoDeJing PackName = "dao-de-jing" - PackHeartSutra PackName = "heart-sutra" + PackBibleKJV PackName = "bible-kjv" + PackBibleCUVS PackName = "bible-cuv-s" + PackDaoDeJing PackName = "dao-de-jing" + PackDaoLegge PackName = "dao-legge" + PackHeartSutra PackName = "heart-sutra" + PackHeartSutraEn PackName = "heart-sutra-en" + PackQuranPickthall PackName = "quran-pickthall" + PackQuranMajian PackName = "quran-majian" ) // Metadata is the parsed contents of a pack's metadata.json. @@ -61,10 +67,10 @@ type Metadata struct { // Pack is one loaded pack: metadata + indexed verses. type Pack struct { - Name PackName - Meta Metadata - verses []schema.Verse - byID map[string]int // id -> index in verses + Name PackName + Meta Metadata + verses []schema.Verse + byID map[string]int // id -> index in verses } // Verses returns all verses in pack order. @@ -148,7 +154,16 @@ func init() { func loadAll() (*Registry, error) { r := &Registry{packs: make(map[PackName]*Pack)} - for _, name := range []PackName{PackBibleKJV, PackDaoDeJing, PackHeartSutra} { + for _, name := range []PackName{ + PackBibleKJV, + PackBibleCUVS, + PackDaoDeJing, + PackDaoLegge, + PackHeartSutra, + PackHeartSutraEn, + PackQuranPickthall, + PackQuranMajian, + } { p, err := loadPack(name) if err != nil { return nil, fmt.Errorf("pack %s: %w", name, err) @@ -190,13 +205,14 @@ func loadPack(name PackName) (*Pack, error) { } type compactRow struct { - ID string `json:"id"` - Chapter int `json:"c"` - Verse int `json:"v"` - VerseEnd int `json:"ve,omitempty"` - Book string `json:"b,omitempty"` - Text string `json:"t"` - Checksum string `json:"s"` + ID string `json:"id"` + Chapter int `json:"c"` + Verse int `json:"v"` + VerseEnd int `json:"ve,omitempty"` + Book string `json:"b,omitempty"` + DisplayRef map[string]string `json:"d,omitempty"` + Text string `json:"t"` + Checksum string `json:"s"` } func parseRows(r io.Reader, meta Metadata) 
([]schema.Verse, map[string]int, error) { @@ -241,6 +257,7 @@ func parseRows(r io.Reader, meta Metadata) ([]schema.Verse, map[string]int, erro VerseStart: row.Verse, VerseEnd: row.VerseEnd, }, + DisplayRef: row.DisplayRef, Text: row.Text, Source: source, ChecksumSHA256: row.Checksum, diff --git a/internal/packs/packs_test.go b/internal/packs/packs_test.go index 2944ccc..097708b 100644 --- a/internal/packs/packs_test.go +++ b/internal/packs/packs_test.go @@ -9,14 +9,23 @@ import ( "github.com/MiaoDX/verse-driven/internal/schema" ) -// TestRegistryLoaded ensures all three packs were registered at init. +// TestRegistryLoaded ensures all bilingual packs were registered at init. func TestRegistryLoaded(t *testing.T) { r := All() if r == nil { t.Fatal("registry nil") } got := r.Names() - want := []PackName{PackBibleKJV, PackDaoDeJing, PackHeartSutra} + want := []PackName{ + PackBibleCUVS, + PackBibleKJV, + PackDaoDeJing, + PackDaoLegge, + PackHeartSutra, + PackHeartSutraEn, + PackQuranMajian, + PackQuranPickthall, + } if len(got) != len(want) { t.Fatalf("Names: got %d packs, want %d", len(got), len(want)) } @@ -44,6 +53,23 @@ func TestKJVCounts(t *testing.T) { } } +func TestCUVSCounts(t *testing.T) { + pack := All().Pack(PackBibleCUVS) + if pack == nil { + t.Fatal("PackBibleCUVS missing") + } + if pack.Meta.Tradition != "bible" || pack.Meta.Work != "CUV-S" || pack.Meta.Lang != "zh-Hans" { + t.Errorf("metadata: got tradition=%q work=%q lang=%q", pack.Meta.Tradition, pack.Meta.Work, pack.Meta.Lang) + } + const want = 31100 + if got := len(pack.Verses()); got != want { + t.Errorf("CUV-S verse count: got %d, want %d", got, want) + } + if got := len(pack.Meta.Books); got != 66 { + t.Errorf("CUV-S book count in metadata: got %d, want 66", got) + } +} + func TestDaoCounts(t *testing.T) { pack := All().Pack(PackDaoDeJing) if pack == nil { @@ -54,6 +80,19 @@ func TestDaoCounts(t *testing.T) { } } +func TestDaoLeggeCounts(t *testing.T) { + pack := All().Pack(PackDaoLegge) + if 
pack == nil { + t.Fatal("PackDaoLegge missing") + } + if got := len(pack.Verses()); got != 81 { + t.Errorf("Dao Legge chapter count: got %d, want 81", got) + } + if pack.Meta.Lang != "en" { + t.Errorf("Dao Legge lang: got %q, want en", pack.Meta.Lang) + } +} + func TestHeartSutraBundled(t *testing.T) { pack := All().Pack(PackHeartSutra) if pack == nil { @@ -67,6 +106,41 @@ func TestHeartSutraBundled(t *testing.T) { } } +func TestHeartSutraEnBundled(t *testing.T) { + pack := All().Pack(PackHeartSutraEn) + if pack == nil { + t.Fatal("PackHeartSutraEn missing") + } + if got := len(pack.Verses()); got != 1 { + t.Errorf("HeartSutraEn verse count: got %d, want 1", got) + } + if pack.Meta.Lang != "en" { + t.Errorf("HeartSutraEn lang: got %q, want en", pack.Meta.Lang) + } +} + +func TestQuranTranslationCounts(t *testing.T) { + cases := []struct { + name PackName + lang string + }{ + {PackQuranPickthall, "en"}, + {PackQuranMajian, "zh-Hans"}, + } + for _, c := range cases { + pack := All().Pack(c.name) + if pack == nil { + t.Fatalf("%s missing", c.name) + } + if got := len(pack.Verses()); got != 6236 { + t.Errorf("%s verse count: got %d, want 6236", c.name, got) + } + if pack.Meta.Lang != c.lang { + t.Errorf("%s lang: got %q, want %q", c.name, pack.Meta.Lang, c.lang) + } + } +} + // TestSpotChecksums asserts known stable verses by their SHA-256, never by // text content. The checksums here were computed by scripts/build_packs.py // from canonical Project Gutenberg sources; if upstream PG #10 or PG #7337 @@ -81,10 +155,15 @@ func TestSpotChecksums(t *testing.T) { // the very last verse of the canon. 
{"bible.kjv.genesis.1.1", "6f785a86b2716dcc5a48caa0de944396ba871d5c7f3bf776993648335fcb2bb2"}, {"bible.kjv.john.3.16", "8473c0b1c7664945528317faf77351258eb79f8b11ba821ef76d7e916cde711a"}, + {"bible.cuv-s.john.3.16", "eb9377734c9de55a5fb47ff21405fbd9fc7aac478ac88339491348fa117f9509"}, {"bible.kjv.revelation.22.21", "76128832e1fddeeda339fb4424682d629e372e7965425ba19efbf31038b54ab2"}, // Dao chapter 11 is the README example ("三十辐共一毂..."). {"dao.daodejing.11.1", "81ba9b4c9a51241154bf5f1c7a8b37d16234717b4f29c9522b58d04ad73d95b3"}, + {"dao.legge.11.1", "8ef8185a229525015081a5d3bcb5150bdcc0cd13d9a58d70efffb9141f2546f9"}, {"sutra.heart-sutra.1", "08cd20f4996c4b7f44b5978fbc65f6d82e738a3f3a01b2715303d7d94852fff2"}, + {"sutra.heart-sutra-en.1", "b3f7511dba60c53a7a8f536d8b749bceea7cb1123218f37af8145bbf1fef2e95"}, + {"quran.pickthall.2.255", "f174338173480fd74890bc2dcc0d605c6f418a9923abb17d4e76bee129a5cd64"}, + {"quran.majian.2.255", "afe17706c0849792de579c9382c285c073fb09e4102a5791414b4d08456e5475"}, } r := All() for _, c := range cases { diff --git a/internal/packs/quran-majian/metadata.json b/internal/packs/quran-majian/metadata.json new file mode 100644 index 0000000..056d472 --- /dev/null +++ b/internal/packs/quran-majian/metadata.json @@ -0,0 +1,15 @@ +{ + "attribution": "古兰经中文译本(马坚),Tanzil", + "build_date": "2026-05-04", + "edition_id": "tanzil-zh.jian", + "inclusion_mode": "bundled", + "lang": "zh-Hans", + "license": "Tanzil translations terms: non-commercial use with attribution", + "note": "Bundled translation data retains Tanzil attribution and non-commercial translation terms.", + "provider": "Tanzil Quran Translations", + "sensitivity": "sacred_exact_quote", + "source_url": "https://tanzil.net/trans/zh.jian", + "tradition": "quran", + "verse_count": 6236, + "work": "majian" +} diff --git a/internal/packs/quran-majian/verses.jsonl.gz b/internal/packs/quran-majian/verses.jsonl.gz new file mode 100644 index 0000000..3778cbc Binary files /dev/null and 
b/internal/packs/quran-majian/verses.jsonl.gz differ diff --git a/internal/packs/quran-pickthall/metadata.json b/internal/packs/quran-pickthall/metadata.json new file mode 100644 index 0000000..9adccda --- /dev/null +++ b/internal/packs/quran-pickthall/metadata.json @@ -0,0 +1,15 @@ +{ + "attribution": "Quran English translation by Mohammed Marmaduke William Pickthall, Tanzil", + "build_date": "2026-05-04", + "edition_id": "tanzil-en.pickthall", + "inclusion_mode": "bundled", + "lang": "en", + "license": "Tanzil translations terms: non-commercial use with attribution", + "note": "Bundled translation data retains Tanzil attribution and non-commercial translation terms.", + "provider": "Tanzil Quran Translations", + "sensitivity": "sacred_exact_quote", + "source_url": "https://tanzil.net/trans/en.pickthall", + "tradition": "quran", + "verse_count": 6236, + "work": "pickthall" +} diff --git a/internal/packs/quran-pickthall/verses.jsonl.gz b/internal/packs/quran-pickthall/verses.jsonl.gz new file mode 100644 index 0000000..79649b2 Binary files /dev/null and b/internal/packs/quran-pickthall/verses.jsonl.gz differ diff --git a/internal/resolver/reference.go b/internal/resolver/reference.go index d8ec0cd..fef9759 100644 --- a/internal/resolver/reference.go +++ b/internal/resolver/reference.go @@ -18,10 +18,15 @@ const ( // Work identifiers used in canonical references. const ( - WorkKJV = "KJV" - WorkDaodejing = "daodejing" - WorkHeartSutra = "heart-sutra" - WorkQuran = "quran" + WorkKJV = "KJV" + WorkCUVS = "CUV-S" + WorkDaodejing = "daodejing" + WorkDaoLegge = "legge" + WorkHeartSutra = "heart-sutra" + WorkHeartSutraEn = "heart-sutra-en" + WorkQuran = "quran" + WorkQuranPickthall = "pickthall" + WorkQuranMajian = "majian" ) // Reference is a parsed scripture reference. 
@@ -34,6 +39,7 @@ const ( type Reference struct { Tradition string Work string + Lang string Book string Chapter int VerseStart int diff --git a/internal/resolver/resolve.go b/internal/resolver/resolve.go index 458e7d4..77ada7e 100644 --- a/internal/resolver/resolve.go +++ b/internal/resolver/resolve.go @@ -70,27 +70,31 @@ func Resolve(input string) (Reference, error) { } norm := strings.ToLower(trimmed) - if rest, ok := matchAlias(norm, sutraAliases); ok { + if rest, alias, ok := matchAlias(norm, sutraAliases); ok { if strings.TrimSpace(rest) != "" { return Reference{}, fmt.Errorf("%w: heart sutra takes no chapter/verse", ErrUnrecognized) } - return Reference{Tradition: TraditionSutra, Work: WorkHeartSutra}, nil + work, lang := workLangForSutra(alias) + return Reference{Tradition: TraditionSutra, Work: work, Lang: lang}, nil } - if rest, ok := matchAlias(norm, daoAliases); ok { - return parseDao(rest) + if rest, alias, ok := matchAlias(norm, daoAliases); ok { + work, lang := workLangForDao(alias) + return parseDao(rest, work, lang) } - if rest, ok := matchAlias(norm, quranTextAliases); ok { - return parseQuranNumeric(rest) + if rest, alias, ok := matchAlias(norm, quranTextAliases); ok { + work, lang := workLangForQuran(alias) + return parseQuranNumeric(rest, work, lang) } - if rest, ok := matchAlias(norm, surahKeywords); ok { + if rest, _, ok := matchAlias(norm, surahKeywords); ok { return parseQuranSurahByName(rest) } for _, e := range bibleAliasIndex { if rest, ok := tryMatchPrefix(norm, e.alias); ok { - return parseBible(e.canonical, rest) + work, lang := workLangForBible(e.alias) + return parseBible(e.canonical, rest, work, lang) } } @@ -98,8 +102,8 @@ func Resolve(input string) (Reference, error) { return Reference{}, &AmbiguousError{ Input: input, Candidates: []Reference{ - {Tradition: TraditionQuran, Work: WorkQuran, Chapter: c, VerseStart: v}, - {Tradition: TraditionBible, Work: WorkKJV, Chapter: c, VerseStart: v}, + {Tradition: TraditionQuran, Work: 
WorkQuranPickthall, Lang: "en", Chapter: c, VerseStart: v}, + {Tradition: TraditionBible, Work: WorkKJV, Lang: "en", Chapter: c, VerseStart: v}, }, } } @@ -107,7 +111,7 @@ func Resolve(input string) (Reference, error) { return Reference{}, fmt.Errorf("%w: %q", ErrUnrecognized, input) } -func parseDao(rest string) (Reference, error) { +func parseDao(rest, work, lang string) (Reference, error) { rest = trimAliasSeparator(rest) if r, ok := tryMatchPrefix(rest, "chapter"); ok { rest = strings.TrimSpace(r) @@ -119,10 +123,10 @@ func parseDao(rest string) (Reference, error) { if !ok || n < 1 { return Reference{}, fmt.Errorf("%w: dao chapter %q", ErrInvalidNumber, rest) } - return Reference{Tradition: TraditionDao, Work: WorkDaodejing, Chapter: n}, nil + return Reference{Tradition: TraditionDao, Work: work, Lang: lang, Chapter: n}, nil } -func parseQuranNumeric(rest string) (Reference, error) { +func parseQuranNumeric(rest, work, lang string) (Reference, error) { rest = trimAliasSeparator(rest) if rest == "" { return Reference{}, fmt.Errorf("%w: quran reference missing surah/verse", ErrUnrecognized) @@ -135,10 +139,10 @@ func parseQuranNumeric(rest string) (Reference, error) { if ve != 0 && ve < v { return Reference{}, ErrInvalidRange } - return Reference{Tradition: TraditionQuran, Work: WorkQuran, Chapter: c, VerseStart: v, VerseEnd: ve}, nil + return Reference{Tradition: TraditionQuran, Work: work, Lang: lang, Chapter: c, VerseStart: v, VerseEnd: ve}, nil } if n, ok := parseNumber(rest); ok && n >= 1 { - return Reference{Tradition: TraditionQuran, Work: WorkQuran, Chapter: n}, nil + return Reference{Tradition: TraditionQuran, Work: work, Lang: lang, Chapter: n}, nil } return Reference{}, fmt.Errorf("%w: quran reference %q", ErrInvalidNumber, rest) } @@ -199,7 +203,7 @@ func parseQuranSurahByName(rest string) (Reference, error) { continue } r = strings.TrimSpace(r) - ref := Reference{Tradition: TraditionQuran, Work: WorkQuran, Chapter: e.number} + ref := 
Reference{Tradition: TraditionQuran, Work: WorkQuranPickthall, Lang: "en", Chapter: e.number} if r == "" { return ref, nil } @@ -217,7 +221,7 @@ func parseQuranSurahByName(rest string) (Reference, error) { return Reference{}, fmt.Errorf("%w: unknown surah %q", ErrUnrecognized, rest) } -func parseBible(canonical, rest string) (Reference, error) { +func parseBible(canonical, rest, work, lang string) (Reference, error) { rest = strings.TrimSpace(rest) if rest == "" { return Reference{}, fmt.Errorf("%w: %s reference missing chapter", ErrUnrecognized, canonical) @@ -231,7 +235,8 @@ func parseBible(canonical, rest string) (Reference, error) { } return Reference{ Tradition: TraditionBible, - Work: WorkKJV, + Work: work, + Lang: lang, Book: canonical, Chapter: c, VerseStart: v, @@ -241,7 +246,8 @@ func parseBible(canonical, rest string) (Reference, error) { if n, ok := parseNumber(rest); ok && n >= 1 { return Reference{ Tradition: TraditionBible, - Work: WorkKJV, + Work: work, + Lang: lang, Book: canonical, Chapter: n, }, nil @@ -249,13 +255,50 @@ func parseBible(canonical, rest string) (Reference, error) { return Reference{}, fmt.Errorf("%w: invalid bible reference %q", ErrInvalidNumber, rest) } -func matchAlias(input string, aliases []string) (string, bool) { +func matchAlias(input string, aliases []string) (rest, alias string, ok bool) { for _, a := range aliases { if rest, ok := tryMatchPrefix(input, a); ok { - return rest, true + return rest, a, true } } - return "", false + return "", "", false +} + +func workLangForBible(alias string) (work, lang string) { + if containsHan(alias) { + return WorkCUVS, "zh-Hans" + } + return WorkKJV, "en" +} + +func workLangForDao(alias string) (work, lang string) { + if containsHan(alias) { + return WorkDaodejing, "zh-Hans" + } + return WorkDaoLegge, "en" +} + +func workLangForSutra(alias string) (work, lang string) { + if containsHan(alias) { + return WorkHeartSutra, "zh-Hans" + } + return WorkHeartSutraEn, "en" +} + +func 
workLangForQuran(alias string) (work, lang string) { + if containsHan(alias) { + return WorkQuranMajian, "zh-Hans" + } + return WorkQuranPickthall, "en" +} + +func containsHan(s string) bool { + for _, r := range s { + if r >= '\u4e00' && r <= '\u9fff' { + return true + } + } + return false } // tryMatchPrefix returns input[len(alias):] if input starts with alias and diff --git a/internal/resolver/resolve_test.go b/internal/resolver/resolve_test.go index b48130c..4c59755 100644 --- a/internal/resolver/resolve_test.go +++ b/internal/resolver/resolve_test.go @@ -9,6 +9,7 @@ func TestResolve(t *testing.T) { type want struct { tradition string work string + lang string book string chapter int verseStart int @@ -21,77 +22,77 @@ func TestResolve(t *testing.T) { want want }{ // Bible — English canonical - {"bible/john_full", "John 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_lower", "john 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_upper", "JOHN 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_abbrev_jn", "Jn 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_abbrev_joh", "Joh 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/genesis", "Genesis 1:1", want{TraditionBible, WorkKJV, "Genesis", 1, 1, 0}}, - {"bible/genesis_abbrev", "Gen 1:1", want{TraditionBible, WorkKJV, "Genesis", 1, 1, 0}}, - {"bible/exodus", "Exodus 20:3", want{TraditionBible, WorkKJV, "Exodus", 20, 3, 0}}, - {"bible/psalms", "Psalms 23:1", want{TraditionBible, WorkKJV, "Psalms", 23, 1, 0}}, - {"bible/psalms_chapter_only", "Psalms 23", want{TraditionBible, WorkKJV, "Psalms", 23, 0, 0}}, - {"bible/matthew_range", "Matthew 5:3-12", want{TraditionBible, WorkKJV, "Matthew", 5, 3, 12}}, - {"bible/romans", "Romans 8:28", want{TraditionBible, WorkKJV, "Romans", 8, 28, 0}}, - {"bible/revelation", "Revelation 22:21", want{TraditionBible, WorkKJV, "Revelation", 22, 21, 0}}, - {"bible/jonah_vs_jn", 
"Jonah 1:1", want{TraditionBible, WorkKJV, "Jonah", 1, 1, 0}}, - {"bible/joel_vs_john", "Joel 2:28", want{TraditionBible, WorkKJV, "Joel", 2, 28, 0}}, + {"bible/john_full", "John 3:16", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, + {"bible/john_lower", "john 3:16", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, + {"bible/john_upper", "JOHN 3:16", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, + {"bible/john_abbrev_jn", "Jn 3:16", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, + {"bible/john_abbrev_joh", "Joh 3:16", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, + {"bible/genesis", "Genesis 1:1", want{TraditionBible, WorkKJV, "en", "Genesis", 1, 1, 0}}, + {"bible/genesis_abbrev", "Gen 1:1", want{TraditionBible, WorkKJV, "en", "Genesis", 1, 1, 0}}, + {"bible/exodus", "Exodus 20:3", want{TraditionBible, WorkKJV, "en", "Exodus", 20, 3, 0}}, + {"bible/psalms", "Psalms 23:1", want{TraditionBible, WorkKJV, "en", "Psalms", 23, 1, 0}}, + {"bible/psalms_chapter_only", "Psalms 23", want{TraditionBible, WorkKJV, "en", "Psalms", 23, 0, 0}}, + {"bible/matthew_range", "Matthew 5:3-12", want{TraditionBible, WorkKJV, "en", "Matthew", 5, 3, 12}}, + {"bible/romans", "Romans 8:28", want{TraditionBible, WorkKJV, "en", "Romans", 8, 28, 0}}, + {"bible/revelation", "Revelation 22:21", want{TraditionBible, WorkKJV, "en", "Revelation", 22, 21, 0}}, + {"bible/jonah_vs_jn", "Jonah 1:1", want{TraditionBible, WorkKJV, "en", "Jonah", 1, 1, 0}}, + {"bible/joel_vs_john", "Joel 2:28", want{TraditionBible, WorkKJV, "en", "Joel", 2, 28, 0}}, // Bible — numbered books, English variants - {"bible/1john_spaced", "1 John 3:16", want{TraditionBible, WorkKJV, "1 John", 3, 16, 0}}, - {"bible/1john_unspaced", "1john 3:16", want{TraditionBible, WorkKJV, "1 John", 3, 16, 0}}, - {"bible/1john_abbrev", "1 Jn 3:16", want{TraditionBible, WorkKJV, "1 John", 3, 16, 0}}, - {"bible/1john_roman", "I John 3:16", want{TraditionBible, WorkKJV, "1 John", 3, 16, 0}}, - 
{"bible/2john", "2 John 1:1", want{TraditionBible, WorkKJV, "2 John", 1, 1, 0}}, - {"bible/3john", "3 John 1:4", want{TraditionBible, WorkKJV, "3 John", 1, 4, 0}}, - {"bible/1corinthians", "1 Corinthians 13:4", want{TraditionBible, WorkKJV, "1 Corinthians", 13, 4, 0}}, - {"bible/2samuel_abbrev", "2 Sam 7:12", want{TraditionBible, WorkKJV, "2 Samuel", 7, 12, 0}}, + {"bible/1john_spaced", "1 John 3:16", want{TraditionBible, WorkKJV, "en", "1 John", 3, 16, 0}}, + {"bible/1john_unspaced", "1john 3:16", want{TraditionBible, WorkKJV, "en", "1 John", 3, 16, 0}}, + {"bible/1john_abbrev", "1 Jn 3:16", want{TraditionBible, WorkKJV, "en", "1 John", 3, 16, 0}}, + {"bible/1john_roman", "I John 3:16", want{TraditionBible, WorkKJV, "en", "1 John", 3, 16, 0}}, + {"bible/2john", "2 John 1:1", want{TraditionBible, WorkKJV, "en", "2 John", 1, 1, 0}}, + {"bible/3john", "3 John 1:4", want{TraditionBible, WorkKJV, "en", "3 John", 1, 4, 0}}, + {"bible/1corinthians", "1 Corinthians 13:4", want{TraditionBible, WorkKJV, "en", "1 Corinthians", 13, 4, 0}}, + {"bible/2samuel_abbrev", "2 Sam 7:12", want{TraditionBible, WorkKJV, "en", "2 Samuel", 7, 12, 0}}, // Bible — Simplified Chinese - {"bible/john_zh", "约翰福音 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_zh_no_space", "约翰福音3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/john_zh_fullwidth_colon", "约翰福音 3:16", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, - {"bible/1john_zh", "约翰一书 3:16", want{TraditionBible, WorkKJV, "1 John", 3, 16, 0}}, - {"bible/1john_zh_range", "约翰一书 3:16-17", want{TraditionBible, WorkKJV, "1 John", 3, 16, 17}}, - {"bible/2john_zh", "约翰二书 1:1", want{TraditionBible, WorkKJV, "2 John", 1, 1, 0}}, - {"bible/3john_zh", "约翰三书 1:4", want{TraditionBible, WorkKJV, "3 John", 1, 4, 0}}, - {"bible/genesis_zh", "创世记 1:1", want{TraditionBible, WorkKJV, "Genesis", 1, 1, 0}}, - {"bible/psalms_zh", "诗篇 23:1", want{TraditionBible, WorkKJV, "Psalms", 23, 1, 0}}, - {"bible/matthew_zh", "马太福音 
5:3", want{TraditionBible, WorkKJV, "Matthew", 5, 3, 0}}, - {"bible/whitespace_padding", " John 3:16 ", want{TraditionBible, WorkKJV, "John", 3, 16, 0}}, + {"bible/john_zh", "约翰福音 3:16", want{TraditionBible, WorkCUVS, "zh-Hans", "John", 3, 16, 0}}, + {"bible/john_zh_no_space", "约翰福音3:16", want{TraditionBible, WorkCUVS, "zh-Hans", "John", 3, 16, 0}}, + {"bible/john_zh_fullwidth_colon", "约翰福音 3:16", want{TraditionBible, WorkCUVS, "zh-Hans", "John", 3, 16, 0}}, + {"bible/1john_zh", "约翰一书 3:16", want{TraditionBible, WorkCUVS, "zh-Hans", "1 John", 3, 16, 0}}, + {"bible/1john_zh_range", "约翰一书 3:16-17", want{TraditionBible, WorkCUVS, "zh-Hans", "1 John", 3, 16, 17}}, + {"bible/2john_zh", "约翰二书 1:1", want{TraditionBible, WorkCUVS, "zh-Hans", "2 John", 1, 1, 0}}, + {"bible/3john_zh", "约翰三书 1:4", want{TraditionBible, WorkCUVS, "zh-Hans", "3 John", 1, 4, 0}}, + {"bible/genesis_zh", "创世记 1:1", want{TraditionBible, WorkCUVS, "zh-Hans", "Genesis", 1, 1, 0}}, + {"bible/psalms_zh", "诗篇 23:1", want{TraditionBible, WorkCUVS, "zh-Hans", "Psalms", 23, 1, 0}}, + {"bible/matthew_zh", "马太福音 5:3", want{TraditionBible, WorkCUVS, "zh-Hans", "Matthew", 5, 3, 0}}, + {"bible/whitespace_padding", " John 3:16 ", want{TraditionBible, WorkKJV, "en", "John", 3, 16, 0}}, // Tao Te Ching - {"dao/zh_short", "道德经 11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/zh_di_zhang", "道德经第十一章", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/zh_di_arabic", "道德经第11章", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/zh_chapter_one", "道德经第一章", want{TraditionDao, WorkDaodejing, "", 1, 0, 0}}, - {"dao/zh_chapter_eighty_one", "道德经第八十一章", want{TraditionDao, WorkDaodejing, "", 81, 0, 0}}, - {"dao/en_dao", "dao 11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/en_dao_colon", "dao:11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/en_dao_dot", "dao.11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/en_dao_upper", "DAO 11", want{TraditionDao, 
WorkDaodejing, "", 11, 0, 0}}, - {"dao/en_daodejing_chapter", "daodejing chapter 11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, - {"dao/en_tao_te_ching", "Tao Te Ching 11", want{TraditionDao, WorkDaodejing, "", 11, 0, 0}}, + {"dao/zh_short", "道德经 11", want{TraditionDao, WorkDaodejing, "zh-Hans", "", 11, 0, 0}}, + {"dao/zh_di_zhang", "道德经第十一章", want{TraditionDao, WorkDaodejing, "zh-Hans", "", 11, 0, 0}}, + {"dao/zh_di_arabic", "道德经第11章", want{TraditionDao, WorkDaodejing, "zh-Hans", "", 11, 0, 0}}, + {"dao/zh_chapter_one", "道德经第一章", want{TraditionDao, WorkDaodejing, "zh-Hans", "", 1, 0, 0}}, + {"dao/zh_chapter_eighty_one", "道德经第八十一章", want{TraditionDao, WorkDaodejing, "zh-Hans", "", 81, 0, 0}}, + {"dao/en_dao", "dao 11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, + {"dao/en_dao_colon", "dao:11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, + {"dao/en_dao_dot", "dao.11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, + {"dao/en_dao_upper", "DAO 11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, + {"dao/en_daodejing_chapter", "daodejing chapter 11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, + {"dao/en_tao_te_ching", "Tao Te Ching 11", want{TraditionDao, WorkDaoLegge, "en", "", 11, 0, 0}}, // Heart Sutra - {"sutra/zh_short", "心经", want{TraditionSutra, WorkHeartSutra, "", 0, 0, 0}}, - {"sutra/zh_full", "般若波罗蜜多心经", want{TraditionSutra, WorkHeartSutra, "", 0, 0, 0}}, - {"sutra/en_heart_sutra", "Heart Sutra", want{TraditionSutra, WorkHeartSutra, "", 0, 0, 0}}, - {"sutra/en_sutra_heart", "sutra heart", want{TraditionSutra, WorkHeartSutra, "", 0, 0, 0}}, + {"sutra/zh_short", "心经", want{TraditionSutra, WorkHeartSutra, "zh-Hans", "", 0, 0, 0}}, + {"sutra/zh_full", "般若波罗蜜多心经", want{TraditionSutra, WorkHeartSutra, "zh-Hans", "", 0, 0, 0}}, + {"sutra/en_heart_sutra", "Heart Sutra", want{TraditionSutra, WorkHeartSutraEn, "en", "", 0, 0, 0}}, + {"sutra/en_sutra_heart", "sutra heart", want{TraditionSutra, WorkHeartSutraEn, 
"en", "", 0, 0, 0}}, // Quran - {"quran/numeric", "Quran 2:255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/numeric_colon_after_alias", "quran:2:255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/numeric_dot_after_alias", "quran.2.255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/numeric_lower", "quran 2:255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/zh", "古兰经 2:255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/zh_first_surah", "古兰经 1:1", want{TraditionQuran, WorkQuran, "", 1, 1, 0}}, - {"quran/last_surah", "Quran 114:6", want{TraditionQuran, WorkQuran, "", 114, 6, 0}}, - {"quran/range", "Quran 2:1-5", want{TraditionQuran, WorkQuran, "", 2, 1, 5}}, - {"quran/surah_by_name", "Surah Al-Baqarah 255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/surah_by_name_lower", "surah al-fatihah 1", want{TraditionQuran, WorkQuran, "", 1, 1, 0}}, - {"quran/surah_alt_spelling", "Surah Al Baqarah 255", want{TraditionQuran, WorkQuran, "", 2, 255, 0}}, - {"quran/sura_keyword", "Sura Yasin 1", want{TraditionQuran, WorkQuran, "", 36, 1, 0}}, + {"quran/numeric", "Quran 2:255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/numeric_colon_after_alias", "quran:2:255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/numeric_dot_after_alias", "quran.2.255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/numeric_lower", "quran 2:255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/zh", "古兰经 2:255", want{TraditionQuran, WorkQuranMajian, "zh-Hans", "", 2, 255, 0}}, + {"quran/zh_first_surah", "古兰经 1:1", want{TraditionQuran, WorkQuranMajian, "zh-Hans", "", 1, 1, 0}}, + {"quran/last_surah", "Quran 114:6", want{TraditionQuran, WorkQuranPickthall, "en", "", 114, 6, 0}}, + {"quran/range", "Quran 2:1-5", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 1, 5}}, + {"quran/surah_by_name", "Surah 
Al-Baqarah 255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/surah_by_name_lower", "surah al-fatihah 1", want{TraditionQuran, WorkQuranPickthall, "en", "", 1, 1, 0}}, + {"quran/surah_alt_spelling", "Surah Al Baqarah 255", want{TraditionQuran, WorkQuranPickthall, "en", "", 2, 255, 0}}, + {"quran/sura_keyword", "Sura Yasin 1", want{TraditionQuran, WorkQuranPickthall, "en", "", 36, 1, 0}}, } for _, tc := range cases { @@ -101,7 +102,7 @@ func TestResolve(t *testing.T) { t.Fatalf("Resolve(%q) returned error: %v", tc.input, err) } w := tc.want - if got.Tradition != w.tradition || got.Work != w.work || got.Book != w.book || + if got.Tradition != w.tradition || got.Work != w.work || got.Lang != w.lang || got.Book != w.book || got.Chapter != w.chapter || got.VerseStart != w.verseStart || got.VerseEnd != w.verseEnd { t.Errorf("Resolve(%q) = %+v; want %+v", tc.input, got, w) } diff --git a/scripts/build_packs.py b/scripts/build_packs.py index 64672a5..eba4d68 100644 --- a/scripts/build_packs.py +++ b/scripts/build_packs.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -"""Pack builder for KJV, 道德经, and 心经. +"""Pack builder for bilingual Bible, Dao, Quran, and Heart Sutra packs. Downloads upstream public-domain sources, normalizes, and writes internal/packs//{verses.jsonl, metadata.json} files. Run: python3 scripts/build_packs.py -Output: internal/packs/{bible-kjv,dao-de-jing,heart-sutra}/ +Output: internal/packs// The script intentionally prints no verse text — only structural info (verse counts, byte sizes, file paths, checksum spot-counts). 
This is @@ -28,9 +28,14 @@ PACKS_DIR = ROOT / "internal" / "packs" KJV_URL = "https://www.gutenberg.org/cache/epub/10/pg10.txt" +CUV_S_URL = "https://raw.githubusercontent.com/seven1m/open-bibles/master/chi-cuv-simp.usfx.xml" DAO_URL = "https://www.gutenberg.org/cache/epub/7337/pg7337.txt" +DAO_LEGGE_URL = "https://classics.mit.edu/Lao/taote.mb.txt" SUTRA_XML_URL = "https://raw.githubusercontent.com/cbeta-org/xml-p5/master/T/T08/T08n0251.xml" SUTRA_SOURCE_URL = "https://cbetaonline.dila.edu.tw/zh/T0251_001" +SUTRA_EN_RAW_URL = "https://en.wikisource.org/w/index.php?title=Translation:Shorter_Praj%C3%B1%C4%81p%C4%81ramit%C4%81_H%E1%B9%9Bdaya_S%C5%ABtra&action=raw" +QURAN_PICKTHALL_URL = "https://tanzil.net/trans/en.pickthall" +QURAN_MAJIAN_URL = "https://tanzil.net/trans/zh.jian" # ---------- shared helpers ---------- @@ -64,7 +69,7 @@ def _write_pack(name: str, verses: list[dict], metadata: dict) -> None: internal/packs//metadata.json - shared fields + verse_count Each line in verses.jsonl.gz is a compact object: - {"id": str, "c": int, "v": int, "ve"?: int, "b"?: str, "t": str, "s": hex64} + {"id": str, "c": int, "v": int, "ve"?: int, "b"?: str, "d"?: map, "t": str, "s": hex64} Pack-level fields (tradition, lang, work, source, sensitivity, inclusion_mode, default_lang display strings) live in metadata.json so @@ -87,6 +92,8 @@ def _write_pack(name: str, verses: list[dict], metadata: dict) -> None: compact["ve"] = v["canonical_ref"]["verse_end"] if v["canonical_ref"].get("book"): compact["b"] = v["canonical_ref"]["book"] + if v.get("display_ref"): + compact["d"] = v["display_ref"] payload_lines.append(json.dumps(compact, ensure_ascii=False, sort_keys=True).encode("utf-8")) payload = b"\n".join(payload_lines) + (b"\n" if payload_lines else b"") # Pin mtime=0 in the gzip header so the archive is reproducible across builds. 
@@ -177,6 +184,37 @@ def _write_pack(name: str, verses: list[dict], metadata: dict) -> None: ("The Revelation of Saint John the Divine", "Revelation", "revelation"), ] +USFX_BOOK_CODES = [ + "GEN", "EXO", "LEV", "NUM", "DEU", "JOS", "JDG", "RUT", + "1SA", "2SA", "1KI", "2KI", "1CH", "2CH", "EZR", "NEH", "EST", + "JOB", "PSA", "PRO", "ECC", "SNG", "ISA", "JER", "LAM", "EZK", + "DAN", "HOS", "JOL", "AMO", "OBA", "JON", "MIC", "NAM", "HAB", + "ZEP", "HAG", "ZEC", "MAL", "MAT", "MRK", "LUK", "JHN", "ACT", + "ROM", "1CO", "2CO", "GAL", "EPH", "PHP", "COL", "1TH", "2TH", + "1TI", "2TI", "TIT", "PHM", "HEB", "JAS", "1PE", "2PE", "1JN", + "2JN", "3JN", "JUD", "REV", +] + +CUV_BOOK_NAMES = [ + "创世记", "出埃及记", "利未记", "民数记", "申命记", "约书亚记", "士师记", "路得记", + "撒母耳记上", "撒母耳记下", "列王纪上", "列王纪下", "历代志上", "历代志下", "以斯拉记", "尼希米记", "以斯帖记", + "约伯记", "诗篇", "箴言", "传道书", "雅歌", "以赛亚书", "耶利米书", "耶利米哀歌", "以西结书", + "但以理书", "何西阿书", "约珥书", "阿摩司书", "俄巴底亚书", "约拿书", "弥迦书", "那鸿书", "哈巴谷书", + "西番雅书", "哈该书", "撒迦利亚书", "玛拉基书", "马太福音", "马可福音", "路加福音", "约翰福音", "使徒行传", + "罗马书", "哥林多前书", "哥林多后书", "加拉太书", "以弗所书", "腓立比书", "歌罗西书", "帖撒罗尼迦前书", "帖撒罗尼迦后书", + "提摩太前书", "提摩太后书", "提多书", "腓利门书", "希伯来书", "雅各书", "彼得前书", "彼得后书", "约翰一书", + "约翰二书", "约翰三书", "犹大书", "启示录", +] + +CUV_BOOKS = { + code: { + "display": KJV_BOOKS[i][1], + "slug": KJV_BOOKS[i][2], + "zh": CUV_BOOK_NAMES[i], + } + for i, code in enumerate(USFX_BOOK_CODES) +} + VERSE_MARKER = re.compile(r"\b(\d+):(\d+)\b") START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***" END_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK THE KING JAMES VERSION OF THE BIBLE ***" @@ -310,6 +348,79 @@ def build_kjv() -> None: _write_pack("bible-kjv", verses, meta) +# ---------- Chinese Union Version Bible (Simplified) ---------- + +def build_cuv_s() -> None: + print("[cuv-s] downloading...") + raw = _fetch(CUV_S_URL) + root = ET.fromstring(raw) + verses: list[dict] = [] + seen_books: set[str] = set() + for book_el in root: + if 
_local_name(book_el.tag) != "book": + continue + code = book_el.attrib.get("id", "") + info = CUV_BOOKS.get(code) + if info is None: + raise RuntimeError(f"CUV-S: unknown USFX book code {code!r}") + seen_books.add(code) + chapter = 0 + for child in book_el: + tag = _local_name(child.tag) + if tag == "c": + chapter = int(child.attrib["id"]) + continue + if tag != "v": + continue + if chapter < 1: + raise RuntimeError(f"CUV-S: verse before chapter in {code}") + verse_id = child.attrib.get("id", "") + if not verse_id.isdigit(): + raise RuntimeError(f"CUV-S: unsupported verse id {verse_id!r} in {code} {chapter}") + verse = int(verse_id) + text = re.sub(r"\s+", "", child.tail or "") + if not text: + raise RuntimeError(f"CUV-S: empty verse in {code} {chapter}:{verse}") + vid = f"bible.cuv-s.{info['slug']}.{chapter}.{verse}" + verses.append({ + "id": vid, + "tradition": "bible", + "lang": "zh-Hans", + "work": "CUV-S", + "canonical_ref": {"book": info["display"], "chapter": chapter, "verse_start": verse}, + "display_ref": { + "zh-Hans": f"{info['zh']} {chapter}:{verse}", + "en": f"{info['display']} {chapter}:{verse}", + }, + "text": text, + "source": { + "provider": "open-bibles", + "license": "Public domain", + "attribution": "Chinese Union Version (Simplified), open-bibles USFX", + }, + "checksum_sha256": _sha256(text), + "inclusion_mode": "bundled", + "sensitivity": "sacred_exact_quote", + }) + if seen_books != set(USFX_BOOK_CODES): + missing = sorted(set(USFX_BOOK_CODES) - seen_books) + raise RuntimeError(f"CUV-S: missing book codes: {missing}") + meta = { + "tradition": "bible", + "work": "CUV-S", + "lang": "zh-Hans", + "provider": "open-bibles", + "source_url": CUV_S_URL, + "license": "Public domain", + "attribution": "Chinese Union Version (Simplified), open-bibles USFX", + "edition_id": "open-bibles-cuv-simp", + "inclusion_mode": "bundled", + "sensitivity": "sacred_exact_quote", + "books": {KJV_BOOKS[i][2]: KJV_BOOKS[i][1] for i in range(len(KJV_BOOKS))}, + } + 
_write_pack("bible-cuv-s", verses, meta) + + # ---------- 道德经 ---------- DAO_START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK" @@ -423,6 +534,61 @@ def build_dao() -> None: _write_pack("dao-de-jing", verses, meta) +def build_dao_legge() -> None: + print("[dao-legge] downloading...") + raw = _fetch(DAO_LEGGE_URL) + markers = list(re.finditer(r"\bChapter\s+(\d{1,2})\b", raw)) + if len(markers) != 81: + raise RuntimeError(f"DAO-LEGGE: expected 81 chapter markers, found {len(markers)}") + + verses: list[dict] = [] + for i, m in enumerate(markers): + chapter = int(m.group(1)) + if chapter != i + 1: + raise RuntimeError(f"DAO-LEGGE: chapter sequence broke at {chapter}, want {i + 1}") + start = m.end() + if i + 1 < len(markers): + end = markers[i + 1].start() + else: + end = raw.find("----------------------------------------------------------------------", start) + if end < 0: + end = len(raw) + text = re.sub(r"\s+", " ", raw[start:end]).strip() + if not text: + raise RuntimeError(f"DAO-LEGGE: empty body for chapter {chapter}") + vid = f"dao.legge.{chapter}.1" + verses.append({ + "id": vid, + "tradition": "dao", + "lang": "en", + "work": "legge", + "canonical_ref": {"chapter": chapter, "verse_start": 1}, + "display_ref": {"en": f"Tao Te Ching, Chapter {chapter}", "zh-Hans": f"道德经第{chapter}章"}, + "text": text, + "source": { + "provider": "Internet Classics Archive", + "license": "Public domain source text", + "attribution": "Tao Te Ching, translated by James Legge (1891), Internet Classics Archive text", + }, + "checksum_sha256": _sha256(text), + "inclusion_mode": "bundled", + "sensitivity": "sacred_exact_quote", + }) + meta = { + "tradition": "dao", + "work": "legge", + "lang": "en", + "provider": "Internet Classics Archive", + "source_url": DAO_LEGGE_URL, + "license": "Public domain source text", + "attribution": "Tao Te Ching, translated by James Legge (1891), Internet Classics Archive text", + "edition_id": "legge-1891", + "inclusion_mode": "bundled", + 
"sensitivity": "sacred_exact_quote", + } + _write_pack("dao-legge", verses, meta) + + # ---------- 心经 ---------- def _local_name(tag: str) -> str: @@ -518,16 +684,175 @@ def build_sutra() -> None: _write_pack("heart-sutra", verses, meta) +# ---------- Heart Sutra (English Wikisource translation) ---------- + +def _clean_wikisource_translation(raw: str) -> str: + body: list[str] = [] + for line in raw.splitlines(): + s = line.strip() + if not s: + continue + if s.startswith(("{{", "}}", "|", "[[", "