diff --git a/.github/scripts/smoke-test.sh b/.github/scripts/smoke-test.sh index 3d9a9b02f..a940fe79e 100755 --- a/.github/scripts/smoke-test.sh +++ b/.github/scripts/smoke-test.sh @@ -171,11 +171,49 @@ echo "── robots.txt + sitemap.xml integrity ────────── file_contains "/robots.txt" "Content-Signal" "Content-Signal directive present" file_contains "/robots.txt" "Sitemap: https://panta-rhei.site" "sitemap reference present" -loc_count=$(grep -c '' "$SITE/sitemap.xml" 2>/dev/null || echo "0") -if [ "$loc_count" -ge 100 ]; then - pass "sitemap has ${loc_count} URLs (expected ≥100)" +# /sitemap.xml is a sitemap INDEX referencing six child sitemaps (see +# _includes/sitemap-bucket.liquid). Validate: +# 1. /sitemap.xml is a (not a ) +# 2. All six child sitemap files exist +# 3. Each child contains a non-trivial number of URLs +# 4. Total URLs across children ≥ 5000 (canonical ~8,864 on prod) +if grep -q '/dev/null; then + pass "sitemap.xml is a " else - fail "sitemap has only ${loc_count} URLs (expected ≥100)" + fail "sitemap.xml is not a — expected sitemap index format" +fi +CHECK_COUNT=$((CHECK_COUNT+1)) + +total_locs=0 +declare -A child_min=( + ["sitemap-core.xml"]=500 + ["sitemap-registry.xml"]=4000 + ["sitemap-bibliography.xml"]=1000 + ["sitemap-corpus-bulk.xml"]=1000 + ["sitemap-results-bulk.xml"]=500 + ["sitemap-predictions.xml"]=20 +) +for child in sitemap-core.xml sitemap-registry.xml sitemap-bibliography.xml sitemap-corpus-bulk.xml sitemap-results-bulk.xml sitemap-predictions.xml; do + if [ ! -f "$SITE/$child" ]; then + fail "MISSING /$child" + CHECK_COUNT=$((CHECK_COUNT+1)) + continue + fi + child_locs=$(grep -c '' "$SITE/$child" 2>/dev/null || echo "0") + total_locs=$((total_locs + child_locs)) + min_expected=${child_min[$child]} + if [ "$child_locs" -ge "$min_expected" ]; then + pass "/$child has ${child_locs} URLs (≥${min_expected} expected)" + else + fail "/$child has only ${child_locs} URLs (expected ≥${min_expected})" + fi + CHECK_COUNT=$((CHECK_COUNT+1)) +done + +if [ "$total_locs" -ge 5000 ]; then + pass "sitemap total URLs across children: ${total_locs} (≥5000 expected)" +else + fail "sitemap total URLs across children: only ${total_locs} (expected ≥5000)" fi CHECK_COUNT=$((CHECK_COUNT+1)) diff --git a/_includes/sitemap-bucket.liquid b/_includes/sitemap-bucket.liquid new file mode 100644 index 000000000..f8d91d575 --- /dev/null +++ b/_includes/sitemap-bucket.liquid @@ -0,0 +1,36 @@ +{%- comment -%} +sitemap-bucket.liquid — single source of truth for child-sitemap classification. + +Sets `_bucket` based on `include.url`. Buckets (mutually exclusive): + + * core — human-authored L0-L4 pages (default) + * registry — /registry/* (4,570 auto-generated registry objects) + * bibliography — /bibliography/* (1,149 auto-generated references) + * corpus-bulk — /corpus/monographs/*, /corpus/taulib/* (1,129 auto-generated) + * results-bulk — /results/{additional-noteworthy-results,problem,physics,life, + metaphysics,mathematics,calibration-cascade,falsifications, + predictions}/* (925 auto-generated facet/path pages) + * predictions — /predictions/* (~67 auto-generated) + +The split is by URL prefix (deterministic, no frontmatter inspection needed) so +it works identically across all six child sitemaps and the sitemap index. + +Usage: + {% include sitemap-bucket.liquid url=item.url %} + {% if _bucket == "core" %} ... {% endif %} +{%- endcomment -%} +{%- assign _parts = include.url | split: "/" -%} +{%- assign _t = _parts[1] -%} +{%- assign _s = _parts[2] -%} +{%- assign _bucket = "core" -%} +{%- if _t == "registry" -%}{%- assign _bucket = "registry" -%}{%- endif -%} +{%- if _t == "bibliography" -%}{%- assign _bucket = "bibliography" -%}{%- endif -%} +{%- if _t == "predictions" -%}{%- assign _bucket = "predictions" -%}{%- endif -%} +{%- if _t == "corpus" -%} + {%- if _s == "monographs" -%}{%- assign _bucket = "corpus-bulk" -%}{%- endif -%} + {%- if _s == "taulib" -%}{%- assign _bucket = "corpus-bulk" -%}{%- endif -%} +{%- endif -%} +{%- if _t == "results" -%} + {%- assign _rb_list = "additional-noteworthy-results,problem,physics,life,metaphysics,mathematics,calibration-cascade,falsifications,predictions" | split: "," -%} + {%- if _rb_list contains _s -%}{%- assign _bucket = "results-bulk" -%}{%- endif -%} +{%- endif -%} diff --git a/sitemap-bibliography.xml b/sitemap-bibliography.xml new file mode 100644 index 000000000..9fcf7eff2 --- /dev/null +++ b/sitemap-bibliography.xml @@ -0,0 +1,27 @@ +--- +# sitemap-bibliography.xml — /bibliography/* references (~1,149) +# +# Auto-generated bibliography entries. +# +# Inclusion rule: classifier `_bucket == "bibliography"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-bibliography.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "bibliography" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap-core.xml b/sitemap-core.xml new file mode 100644 index 000000000..56dbe7f88 --- /dev/null +++ b/sitemap-core.xml @@ -0,0 +1,31 @@ +--- +# sitemap-core.xml — human-authored L0-L4 pages +# +# Highest crawl-priority sitemap: ~1,047 manually-authored pages including +# all top-level lanes (program, agenda, corpus root + construction-spine + +# foundational-hinges, verify, publications, impact, engage, discover, +# media, results overview pages, etc.) but excluding the 7,800+ auto- +# generated programmatic pages routed via the other five child sitemaps. +# +# Inclusion rule: classifier `_bucket == "core"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-core.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "core" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap-corpus-bulk.xml b/sitemap-corpus-bulk.xml new file mode 100644 index 000000000..a30d4fd1e --- /dev/null +++ b/sitemap-corpus-bulk.xml @@ -0,0 +1,27 @@ +--- +# sitemap-corpus-bulk.xml — /corpus/monographs/* + /corpus/taulib/* (~1,129) +# +# Auto-generated TauLib-derived monograph and library reference pages. +# +# Inclusion rule: classifier `_bucket == "corpus-bulk"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-corpus-bulk.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "corpus-bulk" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap-predictions.xml b/sitemap-predictions.xml new file mode 100644 index 000000000..0ea408a02 --- /dev/null +++ b/sitemap-predictions.xml @@ -0,0 +1,29 @@ +--- +# sitemap-predictions.xml — /predictions/* prediction pages (~67) +# +# Auto-generated prediction record pages (separate from the +# /results/predictions/ facet aggregator pages, which live in +# sitemap-results-bulk.xml). +# +# Inclusion rule: classifier `_bucket == "predictions"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-predictions.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "predictions" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap-registry.xml b/sitemap-registry.xml new file mode 100644 index 000000000..7c1bd32b2 --- /dev/null +++ b/sitemap-registry.xml @@ -0,0 +1,29 @@ +--- +# sitemap-registry.xml — /registry/* auto-generated registry objects (~4,570) +# +# These are programmatically generated from the canonical registry corpus. +# Pulled into a separate sitemap so GSC can report indexing progress on +# this bulk content type independently from human-authored core pages. +# +# Inclusion rule: classifier `_bucket == "registry"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-registry.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "registry" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap-results-bulk.xml b/sitemap-results-bulk.xml new file mode 100644 index 000000000..c1ab9cb98 --- /dev/null +++ b/sitemap-results-bulk.xml @@ -0,0 +1,32 @@ +--- +# sitemap-results-bulk.xml — auto-generated /results/* facet/path pages (~925) +# +# Covers /results/{additional-noteworthy-results, problem, physics, life, +# metaphysics, mathematics, calibration-cascade, falsifications, +# predictions}/* — the templated facet structure under the Results lane. +# +# Curated /results/ overview pages (e.g. /results/, /results/predictions/browse/, +# /results/falsifications/browse/) stay in sitemap-core.xml. +# +# Inclusion rule: classifier `_bucket == "results-bulk"` from +# `_includes/sitemap-bucket.liquid`. +layout: null +permalink: /sitemap-results-bulk.xml +sitemap: false +--- +{%- assign all_items = site.html_pages | concat: site.documents -%} + + +{%- for item in all_items -%} + {%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} + {%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} + {%- include sitemap-bucket.liquid url=item.url -%} + {%- unless _bucket == "results-bulk" -%}{%- continue -%}{%- endunless -%} + {%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} + + {{ item.url | absolute_url | xml_escape }} + {{ _mod | date_to_xmlschema }} + +{%- endfor -%} + + diff --git a/sitemap.xml b/sitemap.xml index 3e06c42f5..0ea5ae265 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -1,44 +1,54 @@ --- -# sitemap.xml — custom Jekyll sitemap that overrides jekyll-sitemap +# sitemap.xml — sitemap INDEX # -# When a sitemap.xml file exists at site source root, jekyll-sitemap defers to -# it and skips its own generation (per the plugin's `Site#generate` guard). This -# template emits one per published page/document with a non-empty -# for every URL — including the 86% of pages (registry objects, prediction -# pages, falsification packs, results, ...) where the upstream plugin emitted -# no because they have no `last_modified_at` frontmatter. +# As of May 2026, the sitemap is split into six child sitemaps so Google +# Search Console can report indexing progress per content type, and so the +# crawler can prioritize human-authored canonical pages over the 7,800+ +# auto-generated registry/bibliography/monograph/results-facet pages. # -# lastmod fallback chain (in order): -# 1. page.last_modified_at — jekyll-last-modified-at convention (not used yet) -# 2. page.last_updated — this project's canonical "page version" field -# 3. page.date — Jekyll-auto for posts; explicit on some pages -# 4. site.time — build time (always available; never null) +# Child sitemaps (mutually exclusive, classification by URL prefix in +# `_includes/sitemap-bucket.liquid`): # -# Exclusion rules: -# - sitemap: false — page-level opt-out -# - layout: redirect — legacy redirect stubs (already noindex'd; the -# jekyll-sitemap convention is to exclude these) -# - .html files with no `output: true` — Jekyll handles this automatically; -# unpublished docs do not appear in -# site.documents. +# /sitemap-core.xml ~1,047 human-authored L0-L4 pages +# /sitemap-registry.xml ~4,570 /registry/* registry objects +# /sitemap-bibliography.xml ~1,149 /bibliography/* references +# /sitemap-corpus-bulk.xml ~1,129 /corpus/monographs/* + /corpus/taulib/* +# /sitemap-results-bulk.xml ~925 /results/* facet + predictions/falsifications +# /sitemap-predictions.xml ~67 /predictions/* prediction pages # -# This file uses no Jekyll layout (`layout: null`) and is itself opted out of -# its own sitemap (`sitemap: false`). +# Total URLs across children equal the single-file v1 count (~8,875). +# +# robots.txt references /sitemap.xml — unchanged. Google auto-discovers +# child sitemaps from the index. Last-modified for each child is the build +# time, which is correct: any change in source triggers a full rebuild. layout: null permalink: /sitemap.xml sitemap: false --- -{%- assign all_items = site.html_pages | concat: site.documents -%} - -{%- for item in all_items -%} -{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%} -{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%} -{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%} - - {{ item.url | absolute_url | xml_escape }} - {{ _mod | date_to_xmlschema }} - -{%- endfor -%} - - + + + {{ "/sitemap-core.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + + + {{ "/sitemap-registry.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + + + {{ "/sitemap-bibliography.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + + + {{ "/sitemap-corpus-bulk.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + + + {{ "/sitemap-results-bulk.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + + + {{ "/sitemap-predictions.xml" | absolute_url }} + {{ site.time | date_to_xmlschema }} + +