Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 42 additions & 4 deletions .github/scripts/smoke-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,49 @@ echo "── robots.txt + sitemap.xml integrity ──────────
file_contains "/robots.txt" "Content-Signal" "Content-Signal directive present"
file_contains "/robots.txt" "Sitemap: https://panta-rhei.site" "sitemap reference present"

loc_count=$(grep -c '<loc>' "$SITE/sitemap.xml" 2>/dev/null || echo "0")
if [ "$loc_count" -ge 100 ]; then
pass "sitemap has ${loc_count} URLs (expected ≥100)"
# /sitemap.xml is a sitemap INDEX referencing six child sitemaps (see
# _includes/sitemap-bucket.liquid). Validate:
# 1. /sitemap.xml is a <sitemapindex> (not a <urlset>)
# 2. All six child sitemap files exist
# 3. Each child contains a non-trivial number of <loc> URLs
# 4. Total URLs across children ≥ 5000 (canonical ~8,864 on prod)
if grep -q '<sitemapindex' "$SITE/sitemap.xml" 2>/dev/null; then
pass "sitemap.xml is a <sitemapindex>"
else
fail "sitemap has only ${loc_count} URLs (expected ≥100)"
fail "sitemap.xml is not a <sitemapindex> — expected sitemap index format"
fi
CHECK_COUNT=$((CHECK_COUNT+1))

#######################################
# Print the number of lines in a file that contain a <loc> element.
# Always prints exactly one integer. grep -c already prints "0" (and exits
# with status 1) when nothing matches, so a `|| echo "0"` fallback inside the
# command substitution would produce the two-line value "0\n0", which is not
# a valid operand for the arithmetic sum or the `-ge` tests below.
# Arguments: $1 - path of the sitemap file to inspect
# Outputs:   the count on stdout; 0 when grep prints nothing (e.g. the file
#            cannot be read)
#######################################
sitemap_loc_count() {
  local count
  count=$(grep -c '<loc>' "$1" 2>/dev/null) || true
  printf '%s\n' "${count:-0}"
}

# Running sum of <loc> lines across all child sitemaps.
total_locs=0

# Minimum number of <loc> lines each child sitemap must contain.
declare -A child_min=(
  ["sitemap-core.xml"]=500
  ["sitemap-registry.xml"]=4000
  ["sitemap-bibliography.xml"]=1000
  ["sitemap-corpus-bulk.xml"]=1000
  ["sitemap-results-bulk.xml"]=500
  ["sitemap-predictions.xml"]=20
)

# The children are listed explicitly so the report order is fixed; the key
# order of an associative array is unspecified.
for child in sitemap-core.xml sitemap-registry.xml sitemap-bibliography.xml sitemap-corpus-bulk.xml sitemap-results-bulk.xml sitemap-predictions.xml; do
  # A missing child counts as one failed check and contributes no URLs.
  if [ ! -f "$SITE/$child" ]; then
    fail "MISSING /$child"
    CHECK_COUNT=$((CHECK_COUNT+1))
    continue
  fi
  child_locs=$(sitemap_loc_count "$SITE/$child")
  total_locs=$((total_locs + child_locs))
  min_expected=${child_min[$child]}
  if [ "$child_locs" -ge "$min_expected" ]; then
    pass "/$child has ${child_locs} URLs (≥${min_expected} expected)"
  else
    fail "/$child has only ${child_locs} URLs (expected ≥${min_expected})"
  fi
  CHECK_COUNT=$((CHECK_COUNT+1))
done

# Aggregate floor across all children, checked after the per-child minimums.
if [ "$total_locs" -ge 5000 ]; then
  pass "sitemap total URLs across children: ${total_locs} (≥5000 expected)"
else
  fail "sitemap total URLs across children: only ${total_locs} (expected ≥5000)"
fi
CHECK_COUNT=$((CHECK_COUNT+1))

Expand Down
36 changes: 36 additions & 0 deletions _includes/sitemap-bucket.liquid
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{%- comment -%}
  sitemap-bucket.liquid — the one place that decides which child sitemap a URL
  belongs to.

  Input:  include.url
  Output: `_bucket`, always exactly one of:

    core          default; anything not matched below (human-authored L0-L4 pages)
    registry      first path segment is "registry"      (4,570 auto-generated registry objects)
    bibliography  first path segment is "bibliography"  (1,149 auto-generated references)
    predictions   first path segment is "predictions"   (~67 auto-generated)
    corpus-bulk   /corpus/monographs/* and /corpus/taulib/* (1,129 auto-generated)
    results-bulk  /results/<facet>/* where <facet> is one of
                  additional-noteworthy-results, problem, physics, life,
                  metaphysics, mathematics, calibration-cascade, falsifications,
                  predictions (925 auto-generated facet/path pages)

  Classification looks only at the first two path segments of the URL, never at
  front matter, so every child sitemap and the sitemap index agree.

  Usage:
    {% include sitemap-bucket.liquid url=item.url %}
    {% if _bucket == "core" %} ... {% endif %}
{%- endcomment -%}
{%- assign _parts = include.url | split: "/" -%}
{%- assign _t = _parts[1] -%}
{%- assign _s = _parts[2] -%}
{%- assign _bucket = "core" -%}
{%- case _t -%}
  {%- when "registry", "bibliography", "predictions" -%}
    {%- assign _bucket = _t -%}
  {%- when "corpus" -%}
    {%- if _s == "monographs" or _s == "taulib" -%}
      {%- assign _bucket = "corpus-bulk" -%}
    {%- endif -%}
  {%- when "results" -%}
    {%- assign _rb_list = "additional-noteworthy-results,problem,physics,life,metaphysics,mathematics,calibration-cascade,falsifications,predictions" | split: "," -%}
    {%- if _rb_list contains _s -%}
      {%- assign _bucket = "results-bulk" -%}
    {%- endif -%}
{%- endcase -%}
27 changes: 27 additions & 0 deletions sitemap-bibliography.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# sitemap-bibliography.xml — /bibliography/* references (~1,149)
#
# Auto-generated bibliography entries.
#
# Inclusion rule: classifier `_bucket == "bibliography"` from
# `_includes/sitemap-bucket.liquid`.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-bibliography.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "bibliography" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
31 changes: 31 additions & 0 deletions sitemap-core.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
# sitemap-core.xml — human-authored L0-L4 pages
#
# Highest crawl-priority sitemap: ~1,047 manually-authored pages including
# all top-level lanes (program, agenda, corpus root + construction-spine +
# foundational-hinges, verify, publications, impact, engage, discover,
# media, results overview pages, etc.) but excluding the 7,800+ auto-
# generated programmatic pages routed via the other five child sitemaps.
#
# Inclusion rule: classifier `_bucket == "core"` from
# `_includes/sitemap-bucket.liquid`. "core" is the classifier's default, so
# any URL that matches none of the other bucket prefixes ends up here.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-core.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "core" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
27 changes: 27 additions & 0 deletions sitemap-corpus-bulk.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# sitemap-corpus-bulk.xml — /corpus/monographs/* + /corpus/taulib/* (~1,129)
#
# Auto-generated TauLib-derived monograph and library reference pages.
#
# Inclusion rule: classifier `_bucket == "corpus-bulk"` from
# `_includes/sitemap-bucket.liquid`.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-corpus-bulk.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "corpus-bulk" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
29 changes: 29 additions & 0 deletions sitemap-predictions.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# sitemap-predictions.xml — /predictions/* prediction pages (~67)
#
# Auto-generated prediction record pages (separate from the
# /results/predictions/ facet aggregator pages, which live in
# sitemap-results-bulk.xml).
#
# Inclusion rule: classifier `_bucket == "predictions"` from
# `_includes/sitemap-bucket.liquid`.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-predictions.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "predictions" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
29 changes: 29 additions & 0 deletions sitemap-registry.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# sitemap-registry.xml — /registry/* auto-generated registry objects (~4,570)
#
# These are programmatically generated from the canonical registry corpus.
# Pulled into a separate sitemap so GSC can report indexing progress on
# this bulk content type independently from human-authored core pages.
#
# Inclusion rule: classifier `_bucket == "registry"` from
# `_includes/sitemap-bucket.liquid`.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-registry.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "registry" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
32 changes: 32 additions & 0 deletions sitemap-results-bulk.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# sitemap-results-bulk.xml — auto-generated /results/* facet/path pages (~925)
#
# Covers /results/{additional-noteworthy-results, problem, physics, life,
# metaphysics, mathematics, calibration-cascade, falsifications,
# predictions}/* — the templated facet structure under the Results lane.
#
# /results/ URLs whose second path segment is not in that list (e.g. /results/
# itself) fall through to sitemap-core.xml.
# NOTE(review): this comment previously also named /results/predictions/browse/
# and /results/falsifications/browse/ as pages that stay in sitemap-core.xml,
# but the classifier matches on the second path segment only, so those URLs
# are classified "results-bulk" and appear here — confirm which is intended.
#
# Inclusion rule: classifier `_bucket == "results-bulk"` from
# `_includes/sitemap-bucket.liquid`.
#
# Items are drawn from site.html_pages + site.documents. An item is skipped
# when it has `sitemap: false`, uses `layout: redirect`, or is classified
# into any other bucket.
#
# <lastmod> is the first non-empty value of: last_modified_at, last_updated,
# date, site.time (build time).
#
# Keep each <loc> on its own line: .github/scripts/smoke-test.sh counts URLs
# with `grep -c '<loc>'`, which counts matching lines, not occurrences.
layout: null
permalink: /sitemap-results-bulk.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- include sitemap-bucket.liquid url=item.url -%}
{%- unless _bucket == "results-bulk" -%}{%- continue -%}{%- endunless -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
78 changes: 44 additions & 34 deletions sitemap.xml
Original file line number Diff line number Diff line change
@@ -1,44 +1,54 @@
---
# sitemap.xml — custom Jekyll sitemap that overrides jekyll-sitemap
# sitemap.xml — sitemap INDEX
#
# When a sitemap.xml file exists at site source root, jekyll-sitemap defers to
# it and skips its own generation (per the plugin's `Site#generate` guard). This
# template emits one <url> per published page/document with a non-empty <lastmod>
# for every URL — including the 86% of pages (registry objects, prediction
# pages, falsification packs, results, ...) where the upstream plugin emitted
# no <lastmod> because they have no `last_modified_at` frontmatter.
# As of May 2026, the sitemap is split into six child sitemaps so Google
# Search Console can report indexing progress per content type, and so the
# crawler can prioritize human-authored canonical pages over the 7,800+
# auto-generated registry/bibliography/monograph/results-facet pages.
#
# lastmod fallback chain (in order):
# 1. page.last_modified_at — jekyll-last-modified-at convention (not used yet)
# 2. page.last_updated — this project's canonical "page version" field
# 3. page.date — Jekyll-auto for posts; explicit on some pages
# 4. site.time — build time (always available; never null)
# Child sitemaps (mutually exclusive, classification by URL prefix in
# `_includes/sitemap-bucket.liquid`):
#
# Exclusion rules:
# - sitemap: false — page-level opt-out
# - layout: redirect — legacy redirect stubs (already noindex'd; the
# jekyll-sitemap convention is to exclude these)
# - .html files with no `output: true` — Jekyll handles this automatically;
# unpublished docs do not appear in
# site.documents.
# /sitemap-core.xml ~1,047 human-authored L0-L4 pages
# /sitemap-registry.xml ~4,570 /registry/* registry objects
# /sitemap-bibliography.xml ~1,149 /bibliography/* references
# /sitemap-corpus-bulk.xml ~1,129 /corpus/monographs/* + /corpus/taulib/*
# /sitemap-results-bulk.xml ~925 /results/* facet + predictions/falsifications
# /sitemap-predictions.xml ~67 /predictions/* prediction pages
#
# This file uses no Jekyll layout (`layout: null`) and is itself opted out of
# its own sitemap (`sitemap: false`).
# Total URLs across children equal the single-file v1 count (~8,875).
#
# robots.txt references /sitemap.xml — unchanged. Google auto-discovers
# child sitemaps from the index. Last-modified for each child is the build
# time, which is correct: any change in source triggers a full rebuild.
layout: null
permalink: /sitemap.xml
sitemap: false
---
{%- assign all_items = site.html_pages | concat: site.documents -%}
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{%- for item in all_items -%}
{%- if item.sitemap == false -%}{%- continue -%}{%- endif -%}
{%- if item.layout == "redirect" -%}{%- continue -%}{%- endif -%}
{%- assign _mod = item.last_modified_at | default: item.last_updated | default: item.date | default: site.time -%}
<url>
<loc>{{ item.url | absolute_url | xml_escape }}</loc>
<lastmod>{{ _mod | date_to_xmlschema }}</lastmod>
</url>
{%- endfor -%}

</urlset>
{% comment %}
  Sitemap index: one <sitemap> entry per child sitemap. Each <loc> is passed
  through xml_escape, the same way the child sitemaps treat their <loc>
  values. <lastmod> is the build time (site.time) for every child. This
  comment renders no output; the trailing "-" trims the newline after it.
{% endcomment -%}
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>{{ "/sitemap-core.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
<sitemap>
<loc>{{ "/sitemap-registry.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
<sitemap>
<loc>{{ "/sitemap-bibliography.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
<sitemap>
<loc>{{ "/sitemap-corpus-bulk.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
<sitemap>
<loc>{{ "/sitemap-results-bulk.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
<sitemap>
<loc>{{ "/sitemap-predictions.xml" | absolute_url | xml_escape }}</loc>
<lastmod>{{ site.time | date_to_xmlschema }}</lastmod>
</sitemap>
</sitemapindex>