From bdf1c0caf167abfba6420d47b6c217d95663de4a Mon Sep 17 00:00:00 2001
From: straeter <mail@christoph-straeter.com>
Date: Fri, 13 Feb 2026 15:51:04 +0200
Subject: [PATCH 1/5] check repo readme for broken links

---
 docs-site/scripts/check-links.py | 321 ++++++++++++++++++++++---------
 1 file changed, 227 insertions(+), 94 deletions(-)
diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py
index a5661d6e..59ddadf2 100644
--- a/docs-site/scripts/check-links.py
+++ b/docs-site/scripts/check-links.py
@@ -1,13 +1,14 @@
 #!/usr/bin/env python3
-"""Check for broken links in the static docs build output.
+"""Check for broken links in the static docs build output and README files.
 
-Parses all HTML files in the build output, extracts <a href> links, and:
+Parses the HTML files in the build output and the README markdown files, extracts links, and:
 - Verifies internal /docs links resolve to existing pages in the build output
 - HTTP-checks links to CHECKED_DOMAINS (matched by domain, checks all pages)
 - Skips links in SKIPPED_URLS (matched by exact URL, must be explicitly listed)
 - Errors on any external link not covered by either list
 """
 
+import re
 import sys
 import urllib.request
 from html.parser import HTMLParser
@@ -41,6 +42,7 @@
 # Each URL must be listed explicitly — new links to the same domain will
 # error until added here, so broken links don't slip through unnoticed.
 SKIPPED_URLS: set[str] = {
+    "https://arxiv.org/abs/2506.06287",
     "https://code.claude.com/docs/en/discover-plugins",
     "https://code.claude.com/docs/en/mcp",
     "https://cursor.com/deeplink/mcp-install-dark.svg",
@@ -56,9 +58,17 @@
     "https://github.com/anthropics/claude-code/issues/20377",
     "https://github.com/futuresearch/everyrow-sdk",
     "https://github.com/futuresearch/everyrow-sdk/releases",
+    "https://github.com/user-attachments/assets/254fa2ed-c1f3-4ee8-b93d-d169edf32f27",
     "https://huggingface.co/datasets/fancyzhx/dbpedia_14",
     "https://hugovk.github.io/top-pypi-packages/",
+    "https://img.shields.io/badge/Claude_Code-plugin-D97757?logo=claude&logoColor=fff",
+    "https://img.shields.io/badge/License-MIT-yellow.svg",
+    "https://img.shields.io/badge/python-3.12+-blue.svg",
+    "https://img.shields.io/pypi/v/everyrow.svg",
     "https://jqlang.org/",
+    "https://opensource.org/licenses/MIT",
+    "https://pypi.org/project/everyrow/",
+    "https://python.org/downloads/",
     "https://www.kaggle.com/code/rafaelpoyiadzi/active-learning-with-an-llm-oracle",
 }
 
@@ -80,6 +90,37 @@ def handle_starttag(self, tag, attrs):
             self.links.append(d["content"])
 
 
+def extract_markdown_links(content: str) -> list[str]:
+    """Extract link targets from markdown text, in pattern order (may repeat).
+
+    Finds:
+    - [text](url) inline links and ![alt](url) images (titles are dropped)
+    - [text](<url>) angle-bracketed destinations (brackets are stripped)
+    - <url> autolinks (http/https only)
+    - [text]: url reference-style definitions
+    """
+    links = []
+
+    # Inline links: [text](url) and images: ![alt](url)
+    inline_pattern = r'!?\[([^\]]*)\]\(([^)]+)\)'
+    for match in re.finditer(inline_pattern, content):
+        parts = match.group(2).split()  # Split to handle optional titles
+        if parts:  # "[text]( )" has no destination; skip, don't IndexError
+            links.append(parts[0].strip("<>"))
+
+    # Autolinks: <url>
+    autolink_pattern = r'<(https?://[^>]+)>'
+    for match in re.finditer(autolink_pattern, content):
+        links.append(match.group(1))
+
+    # Reference-style links: [text]: url
+    reference_pattern = r'^\[([^\]]+)\]:\s*(.+?)(?:\s|$)'
+    for match in re.finditer(reference_pattern, content, re.MULTILINE):
+        links.append(match.group(2).strip())
+
+    return links
+
+
 def get_valid_paths(out_dir: Path) -> set[str]:
     """Build a set of valid URL paths from the static build output."""
     valid = set()
@@ -115,6 +156,119 @@ def check_url(url: str, cache: dict[str, int | str]) -> int | str:
         return str(e)
 
 
+def validate_link(
+    href: str,
+    file_label: str,
+    page_url: str | None,
+    valid_paths: set[str],
+    url_cache: dict[str, int | str],
+    file_path: Path,
+) -> tuple[str | None, str | None]:
+    """Validate a single link from an HTML page or a markdown file.
+
+    For HTML, page_url is the page URL: internal links are resolved against
+    it and looked up in valid_paths. Markdown callers pass page_url=None, and
+    relative links are checked on disk from file_path's directory instead.
+
+    Returns (error, unknown_url) where at most one is non-None.
+    """
+    if href.startswith(("#", "mailto:", "tel:", "javascript:")):
+        return None, None
+
+    parsed = urlparse(href)
+
+    # External link
+    if parsed.scheme in ("http", "https") and parsed.netloc and parsed.netloc != "site":
+        domain = parsed.netloc
+        # Strip fragment for matching against skip list
+        url_without_fragment = href.split("#")[0]
+
+        # GitHub blob links to this repo: check the file exists locally
+        if url_without_fragment.startswith(REPO_BLOB_PREFIX):
+            rel_path = url_without_fragment[len(REPO_BLOB_PREFIX) :]
+            if rel_path.endswith(".csv"):
+                lfs_url = REPO_LFS_PREFIX + rel_path
+                return (
+                    f"  {file_label}: {href!r} is a blob URL for an"
+                    f" LFS-tracked CSV; use {lfs_url} instead",
+                    None,
+                )
+            elif not (REPO_ROOT / rel_path).exists():
+                return (
+                    f"  {file_label}: file not found for {href!r}"
+                    f" (expected {rel_path})",
+                    None,
+                )
+            return None, None
+
+        # Git LFS media URLs: verify the file exists locally
+        if url_without_fragment.startswith(REPO_LFS_PREFIX):
+            rel_path = url_without_fragment[len(REPO_LFS_PREFIX) :]
+            if not (REPO_ROOT / rel_path).exists():
+                return (
+                    f"  {file_label}: file not found for {href!r}"
+                    f" (expected {rel_path})",
+                    None,
+                )
+            return None, None
+
+        if url_without_fragment in SKIPPED_URLS:
+            return None, None
+
+        if domain in CHECKED_DOMAINS:
+            # For everyrow.io docs links, check against the build
+            # output first if it exists
+            if domain == "everyrow.io" and parsed.path.startswith("/docs"):
+                local_path = parsed.path.rstrip("/") or "/docs"
+                if local_path in valid_paths or local_path + "/" in valid_paths:
+                    return None, None
+                # If build output doesn't exist, skip checking these
+                if not OUT_DIR.exists():
+                    return None, None
+            result = check_url(href, url_cache)
+            if isinstance(result, int) and 200 <= result < 400:
+                return None, None
+            return f"  {file_label}: {href} -> {result}", None
+
+        # Unknown URL — not in either list
+        return None, f"  {file_label}: unrecognized external link {href!r}"
+
+    # Internal links (relative or absolute paths)
+    if page_url:
+        # HTML file: resolve relative links against the page URL
+        if not parsed.scheme and not href.startswith("/"):
+            resolved = urlparse(urljoin(page_url, href))
+        elif href.startswith("/"):
+            resolved = urlparse(f"https://site{href}")
+        else:
+            resolved = parsed
+
+        path = resolved.path.rstrip("/") or BASE_PATH
+
+        if not path.startswith(BASE_PATH):
+            return None, None
+
+        # Skip static assets
+        if "/_next/" in path or path.endswith(
+            (".css", ".js", ".png", ".jpg", ".svg", ".ico")
+        ):
+            return None, None
+
+        if path not in valid_paths and path + "/" not in valid_paths:
+            return f"  {file_label}: broken link {href!r} -> {path}", None
+    else:
+        # Markdown file: check if relative file exists
+        if not parsed.scheme:
+            # Remove fragment/query
+            clean_path = href.split("#")[0].split("?")[0]
+            # GitHub resolves root-relative links ("/x") against the repo root
+            base = REPO_ROOT if clean_path.startswith("/") else file_path.parent
+            if clean_path and not (base / clean_path.lstrip("/")).resolve().exists():
+                return f"  {file_label}: file not found for relative link {href!r}", None
+
+    return None, None
+
+
 def check_file(
     html_file: Path,
     valid_paths: set[str],
@@ -142,119 +296,91 @@ def check_file(
         if href in seen_hrefs:
             continue
         seen_hrefs.add(href)
-        if href.startswith(("#", "mailto:", "tel:", "javascript:")):
-            continue
+        
+        error, unknown = validate_link(
+            href, page_label, page_url, valid_paths, url_cache, html_file
+        )
+        if error:
+            errors.append(error)
+        if unknown:
+            unknown_urls.append(unknown)
 
-        parsed = urlparse(href)
+    return errors, unknown_urls
 
-        # External link
-        if (
-            parsed.scheme in ("http", "https")
-            and parsed.netloc
-            and parsed.netloc != "site"
-        ):
-            domain = parsed.netloc
-            # Strip fragment for matching against skip list
-            url_without_fragment = href.split("#")[0]
-
-            # GitHub blob links to this repo: check the file exists locally.
-            # CSV files (LFS-tracked) must use the media.githubusercontent.com
-            # URL so readers get the real data, not the LFS pointer.
-            if url_without_fragment.startswith(REPO_BLOB_PREFIX):
-                rel_path = url_without_fragment[len(REPO_BLOB_PREFIX) :]
-                if rel_path.endswith(".csv"):
-                    lfs_url = REPO_LFS_PREFIX + rel_path
-                    errors.append(
-                        f"  {page_label}: {href!r} is a blob URL for an"
-                        f" LFS-tracked CSV; use {lfs_url} instead"
-                    )
-                elif not (REPO_ROOT / rel_path).exists():
-                    errors.append(
-                        f"  {page_label}: file not found for {href!r}"
-                        f" (expected {rel_path})"
-                    )
-                continue
-
-            # Git LFS media URLs: verify the file exists locally instead
-            # of fetching from GitHub.
-            if url_without_fragment.startswith(REPO_LFS_PREFIX):
-                rel_path = url_without_fragment[len(REPO_LFS_PREFIX) :]
-                if not (REPO_ROOT / rel_path).exists():
-                    errors.append(
-                        f"  {page_label}: file not found for {href!r}"
-                        f" (expected {rel_path})"
-                    )
-                continue
-
-            if url_without_fragment in SKIPPED_URLS:
-                continue
-
-            if domain in CHECKED_DOMAINS:
-                # For everyrow.io docs links, check against the build
-                # output first. This avoids a chicken-and-egg problem
-                # where new pages can't deploy because their canonical
-                # URL doesn't exist on the live site yet.
-                if domain == "everyrow.io" and parsed.path.startswith("/docs"):
-                    local_path = parsed.path.rstrip("/") or "/docs"
-                    if local_path in valid_paths or local_path + "/" in valid_paths:
-                        continue
-                result = check_url(href, url_cache)
-                if isinstance(result, int) and 200 <= result < 400:
-                    continue
-                errors.append(f"  {page_label}: {href} -> {result}")
-                continue
-
-            # Unknown URL — not in either list
-            unknown_urls.append(f"  {page_label}: unrecognized external link {href!r}")
-            continue
 
-        # Resolve relative links against the page URL
-        if not parsed.scheme and not href.startswith("/"):
-            resolved = urlparse(urljoin(page_url, href))
-        elif href.startswith("/"):
-            resolved = urlparse(f"https://site{href}")
-        else:
-            resolved = parsed
+def check_markdown_file(
+    md_file: Path,
+    valid_paths: set[str],
+    url_cache: dict[str, int | str],
+) -> tuple[list[str], list[str]]:
+    """Check every distinct link in a markdown file under REPO_ROOT.
 
-        path = resolved.path.rstrip("/") or BASE_PATH
+    Returns (errors, unknown_urls).
     """
+    file_label = str(md_file.relative_to(REPO_ROOT))
+    content = md_file.read_text(encoding="utf-8")  # not the locale default
+    links = extract_markdown_links(content)
 
-        if not path.startswith(BASE_PATH):
-            continue
+    errors: list[str] = []
+    unknown_urls: list[str] = []
+    seen_hrefs: set[str] = set()
 
-        # Skip static assets
-        if "/_next/" in path or path.endswith(
-            (".css", ".js", ".png", ".jpg", ".svg", ".ico")
-        ):
+    for href in links:
+        if href in seen_hrefs:
             continue
-
-        if path not in valid_paths and path + "/" not in valid_paths:
-            errors.append(f"  {page_label}: broken link {href!r} -> {path}")
+        seen_hrefs.add(href)
+
+        error, unknown = validate_link(
+            href, file_label, None, valid_paths, url_cache, md_file
+        )
+        if error:
+            errors.append(error)
+        if unknown:
+            unknown_urls.append(unknown)
 
     return errors, unknown_urls
 
 
 def main() -> int:
-    if not OUT_DIR.exists():
-        print(f"Build output not found at {OUT_DIR}")
-        print("Run 'pnpm build' first.")
-        return 1
-
-    valid_paths = get_valid_paths(OUT_DIR)
-    html_files = [
-        f
-        for f in OUT_DIR.rglob("*.html")
-        if f.name not in ("404.html", "_not-found.html")
-    ]
+    # Check if build output exists for HTML checking
+    has_build_output = OUT_DIR.exists()
+    valid_paths: set[str] = set()
+    html_files: list[Path] = []
+    
+    if has_build_output:
+        valid_paths = get_valid_paths(OUT_DIR)
+        html_files = [
+            f
+            for f in OUT_DIR.rglob("*.html")
+            if f.name not in ("404.html", "_not-found.html")
+        ]
+    else:
+        print(f"Build output not found at {OUT_DIR}, skipping HTML checks")
+        print("(Run 'pnpm build' to enable HTML link checking)")
+        print()
+
+    # Find README files to check
+    readme_files = []
+    repo_readme = REPO_ROOT / "README.md"
+    if repo_readme.exists():
+        readme_files.append(repo_readme)
 
     url_cache: dict[str, int | str] = {}
     all_errors: list[str] = []
     all_unknown: list[str] = []
 
+    # Check HTML files
     for html_file in sorted(html_files):
         errors, unknown = check_file(html_file, valid_paths, url_cache)
         all_errors.extend(errors)
         all_unknown.extend(unknown)
 
+    # Check README files
+    for readme_file in sorted(readme_files):
+        errors, unknown = check_markdown_file(readme_file, valid_paths, url_cache)
+        all_errors.extend(errors)
+        all_unknown.extend(unknown)
+
     ok = True
 
     if all_unknown:
@@ -276,8 +402,15 @@ def main() -> int:
         checked = sum(
             1 for v in url_cache.values() if isinstance(v, int) and 200 <= v < 400
         )
+        total_files = len(html_files) + len(readme_files)
+        file_types = []
+        if html_files:
+            file_types.append(f"{len(html_files)} HTML")
+        if readme_files:
+            file_types.append(f"{len(readme_files)} markdown")
+        
         print(
-            f"All links OK across {len(html_files)} pages "
+            f"All links OK across {total_files} files ({', '.join(file_types)}) "
             f"({checked} external URLs checked, {len(SKIPPED_URLS)} URLs skipped)"
         )
 

From 2f6b2a639156d19d1d1182647deb47bac18e7252 Mon Sep 17 00:00:00 2001
From: straeter <mail@christoph-straeter.com>
Date: Fri, 13 Feb 2026 15:51:12 +0200
Subject: [PATCH 2/5] fix broken link in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0eeb7eac..1f5893a3 100644
--- a/README.md
+++ b/README.md
@@ -92,7 +92,7 @@ result = await agent_map(
 print(result.data.head())
 ```
 
-See the API [docs](https://everyrow.io/docs/reference/RESEARCH.md), a case study of [labeling data](https://everyrow.io/docs/classify-dataframe-rows-llm) or a notebook for [researching government data](https://everyrow.io/docs/notebooks/research-and-rank-permit-times) at scale.
+See the API [docs](https://everyrow.io/docs/reference/RESEARCH), a case study of [labeling data](https://everyrow.io/docs/classify-dataframe-rows-llm) or a notebook for [researching government data](https://everyrow.io/docs/notebooks/research-and-rank-permit-times) at scale.
 
 
 ## Sessions

From 351c5744234eab1136fab1578557bdf5f62d587b Mon Sep 17 00:00:00 2001
From: straeter <mail@christoph-straeter.com>
Date: Fri, 13 Feb 2026 15:56:43 +0200
Subject: [PATCH 3/5] add mcp readme to link check

---
 docs-site/scripts/check-links.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py
index 59ddadf2..0fa32140 100644
--- a/docs-site/scripts/check-links.py
+++ b/docs-site/scripts/check-links.py
@@ -66,6 +66,7 @@
     "https://img.shields.io/badge/python-3.12+-blue.svg",
     "https://img.shields.io/pypi/v/everyrow.svg",
     "https://jqlang.org/",
+    "https://modelcontextprotocol.info/tools/registry/publishing/",
     "https://opensource.org/licenses/MIT",
     "https://pypi.org/project/everyrow/",
     "https://python.org/downloads/",
@@ -360,10 +361,10 @@ def main() -> int:
         print()
 
     # Find README files to check
-    readme_files = []
-    repo_readme = REPO_ROOT / "README.md"
-    if repo_readme.exists():
-        readme_files.append(repo_readme)
+    readme_files = [
+        REPO_ROOT / "README.md",
+        REPO_ROOT / "everyrow-mcp" / "README.md",
+    ]
 
     url_cache: dict[str, int | str] = {}
     all_errors: list[str] = []

From 923ee7893b4364cb5e45a8e5c4ae8beb5b69cb6b Mon Sep 17 00:00:00 2001
From: straeter <mail@christoph-straeter.com>
Date: Fri, 13 Feb 2026 15:59:38 +0200
Subject: [PATCH 4/5] add docs-site readme

---
 docs-site/scripts/check-links.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py
index 0fa32140..b3ee23de 100644
--- a/docs-site/scripts/check-links.py
+++ b/docs-site/scripts/check-links.py
@@ -364,6 +364,7 @@ def main() -> int:
     readme_files = [
         REPO_ROOT / "README.md",
         REPO_ROOT / "everyrow-mcp" / "README.md",
+        REPO_ROOT / "docs-site" / "README.md",
     ]
 
     url_cache: dict[str, int | str] = {}

From 3d9c4dc704fe568c1a7ab0e1325c085941c208a6 Mon Sep 17 00:00:00 2001
From: straeter <mail@christoph-straeter.com>
Date: Fri, 27 Feb 2026 13:18:20 +0100
Subject: [PATCH 5/5] correct image file

---
 docs-site/scripts/check-links.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py
index 3ae9cdf7..936ad876 100644
--- a/docs-site/scripts/check-links.py
+++ b/docs-site/scripts/check-links.py
@@ -64,7 +64,7 @@
     "https://github.com/anthropics/claude-code/issues/20377",
     "https://github.com/futuresearch/everyrow-sdk",
     "https://github.com/futuresearch/everyrow-sdk/releases",
-    "https://github.com/user-attachments/assets/254fa2ed-c1f3-4ee8-b93d-d169edf32f27",
+    "https://github.com/user-attachments/assets/8b746b6c-2acb-4591-9328-daebdb472f50",
     "https://huggingface.co/datasets/fancyzhx/dbpedia_14",
     "https://huggingface.co/datasets/google-research-datasets/paws",
     "https://hugovk.github.io/top-pypi-packages/",
@@ -82,7 +82,6 @@
     "https://www.kaggle.com/datasets/tunguz/pubmed-title-abstracts-2019-baseline",
     "https://arxiv.org/abs/2506.21558",
     "https://arxiv.org/abs/2506.06287",
-    "https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/refs/heads/main/docs/data/fda_products.csv"
 }