From bdf1c0caf167abfba6420d47b6c217d95663de4a Mon Sep 17 00:00:00 2001 From: straeter Date: Fri, 13 Feb 2026 15:51:04 +0200 Subject: [PATCH 1/5] check repo readme for broken links --- docs-site/scripts/check-links.py | 321 ++++++++++++++++++++++--------- 1 file changed, 227 insertions(+), 94 deletions(-) diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py index a5661d6e..59ddadf2 100644 --- a/docs-site/scripts/check-links.py +++ b/docs-site/scripts/check-links.py @@ -1,13 +1,14 @@ #!/usr/bin/env python3 -"""Check for broken links in the static docs build output. +"""Check for broken links in the static docs build output and README files. -Parses all HTML files in the build output, extracts links, and: +Parses all HTML files in the build output and markdown files, extracts links, and: - Verifies internal /docs links resolve to existing pages in the build output - HTTP-checks links to CHECKED_DOMAINS (matched by domain, checks all pages) - Skips links in SKIPPED_URLS (matched by exact URL, must be explicitly listed) - Errors on any external link not covered by either list """ +import re import sys import urllib.request from html.parser import HTMLParser @@ -41,6 +42,7 @@ # Each URL must be listed explicitly — new links to the same domain will # error until added here, so broken links don't slip through unnoticed. SKIPPED_URLS: set[str] = { + "https://arxiv.org/abs/2506.06287", "https://code.claude.com/docs/en/discover-plugins", "https://code.claude.com/docs/en/mcp", "https://cursor.com/deeplink/mcp-install-dark.svg", @@ -56,9 +58,17 @@ "https://github.com/anthropics/claude-code/issues/20377", "https://github.com/futuresearch/everyrow-sdk", "https://github.com/futuresearch/everyrow-sdk/releases", + "https://github.com/user-attachments/assets/254fa2ed-c1f3-4ee8-b93d-d169edf32f27", "https://huggingface.co/datasets/fancyzhx/dbpedia_14", "https://hugovk.github.io/top-pypi-packages/", + "https://img.shields.io/badge/Claude_Code-plugin-D97757?logo=claude&logoColor=fff", + "https://img.shields.io/badge/License-MIT-yellow.svg", + "https://img.shields.io/badge/python-3.12+-blue.svg", + "https://img.shields.io/pypi/v/everyrow.svg", "https://jqlang.org/", + "https://opensource.org/licenses/MIT", + "https://pypi.org/project/everyrow/", + "https://python.org/downloads/", "https://www.kaggle.com/code/rafaelpoyiadzi/active-learning-with-an-llm-oracle", } @@ -80,6 +90,37 @@ def handle_starttag(self, tag, attrs): self.links.append(d["content"]) +def extract_markdown_links(content: str) -> list[str]: + """Extract URLs from markdown content. + + Finds: + - [text](url) inline links + - ![alt](url) image links + - autolinks + - [text]: url reference-style links + """ + links = [] + + # Inline links: [text](url) and images: ![alt](url) + inline_pattern = r'!?\[([^\]]*)\]\(([^)]+)\)' + for match in re.finditer(inline_pattern, content): + url = match.group(2).split()[0] # Split to handle optional titles + links.append(url) + + # Autolinks: + autolink_pattern = r'<(https?://[^>]+)>' + for match in re.finditer(autolink_pattern, content): + links.append(match.group(1)) + + # Reference-style links: [text]: url + reference_pattern = r'^\[([^\]]+)\]:\s*(.+?)(?:\s|$)' + for match in re.finditer(reference_pattern, content, re.MULTILINE): + url = match.group(2).strip() + links.append(url) + + return links + + def get_valid_paths(out_dir: Path) -> set[str]: """Build a set of valid URL paths from the static build output.""" valid = set() @@ -115,6 +156,119 @@ def check_url(url: str, cache: dict[str, int | str]) -> int | str: return str(e) +def validate_link( + href: str, + file_label: str, + page_url: str | None, + valid_paths: set[str], + url_cache: dict[str, int | str], + file_path: Path, +) -> tuple[str | None, str | None]: + """Validate a single link. + + Returns (error, unknown_url) where at most one is non-None. + """ + if href.startswith(("#", "mailto:", "tel:", "javascript:")): + return None, None + + parsed = urlparse(href) + + # External link + if ( + parsed.scheme in ("http", "https") + and parsed.netloc + and parsed.netloc != "site" + ): + domain = parsed.netloc + # Strip fragment for matching against skip list + url_without_fragment = href.split("#")[0] + + # GitHub blob links to this repo: check the file exists locally + if url_without_fragment.startswith(REPO_BLOB_PREFIX): + rel_path = url_without_fragment[len(REPO_BLOB_PREFIX) :] + if rel_path.endswith(".csv"): + lfs_url = REPO_LFS_PREFIX + rel_path + return ( + f" {file_label}: {href!r} is a blob URL for an" + f" LFS-tracked CSV; use {lfs_url} instead", + None + ) + elif not (REPO_ROOT / rel_path).exists(): + return ( + f" {file_label}: file not found for {href!r}" + f" (expected {rel_path})", + None + ) + return None, None + + # Git LFS media URLs: verify the file exists locally + if url_without_fragment.startswith(REPO_LFS_PREFIX): + rel_path = url_without_fragment[len(REPO_LFS_PREFIX) :] + if not (REPO_ROOT / rel_path).exists(): + return ( + f" {file_label}: file not found for {href!r}" + f" (expected {rel_path})", + None + ) + return None, None + + if url_without_fragment in SKIPPED_URLS: + return None, None + + if domain in CHECKED_DOMAINS: + # For everyrow.io docs links, check against the build + # output first if it exists + if domain == "everyrow.io" and parsed.path.startswith("/docs"): + local_path = parsed.path.rstrip("/") or "/docs" + if local_path in valid_paths or local_path + "/" in valid_paths: + return None, None + # If build output doesn't exist, skip checking these + if not OUT_DIR.exists(): + return None, None + result = check_url(href, url_cache) + if isinstance(result, int) and 200 <= result < 400: + return None, None + return f" {file_label}: {href} -> {result}", None + + # Unknown URL — not in either list + return None, f" {file_label}: unrecognized external link {href!r}" + + # Internal links (relative or absolute paths) + if page_url: + # HTML file: resolve relative links against the page URL + if not parsed.scheme and not href.startswith("/"): + resolved = urlparse(urljoin(page_url, href)) + elif href.startswith("/"): + resolved = urlparse(f"https://site{href}") + else: + resolved = parsed + + path = resolved.path.rstrip("/") or BASE_PATH + + if not path.startswith(BASE_PATH): + return None, None + + # Skip static assets + if "/_next/" in path or path.endswith( + (".css", ".js", ".png", ".jpg", ".svg", ".ico") + ): + return None, None + + if path not in valid_paths and path + "/" not in valid_paths: + return f" {file_label}: broken link {href!r} -> {path}", None + else: + # Markdown file: check if relative file exists + if not parsed.scheme: + # Remove fragment/query + clean_path = href.split("#")[0].split("?")[0] + if clean_path: + target = (file_path.parent / clean_path).resolve() + if not target.exists(): + return f" {file_label}: file not found for relative link {href!r}", None + + return None, None + + def check_file( html_file: Path, valid_paths: set[str], @@ -142,119 +296,91 @@ def check_file( if href in seen_hrefs: continue seen_hrefs.add(href) - if href.startswith(("#", "mailto:", "tel:", "javascript:")): - continue + + error, unknown = validate_link( + href, page_label, page_url, valid_paths, url_cache, html_file + ) + if error: + errors.append(error) + if unknown: + unknown_urls.append(unknown) - parsed = urlparse(href) + return errors, unknown_urls - # External link - if ( - parsed.scheme in ("http", "https") - and parsed.netloc - and parsed.netloc != "site" - ): - domain = parsed.netloc - # Strip fragment for matching against skip list - url_without_fragment = href.split("#")[0] - - # GitHub blob links to this repo: check the file exists locally. - # CSV files (LFS-tracked) must use the media.githubusercontent.com - # URL so readers get the real data, not the LFS pointer. - if url_without_fragment.startswith(REPO_BLOB_PREFIX): - rel_path = url_without_fragment[len(REPO_BLOB_PREFIX) :] - if rel_path.endswith(".csv"): - lfs_url = REPO_LFS_PREFIX + rel_path - errors.append( - f" {page_label}: {href!r} is a blob URL for an" - f" LFS-tracked CSV; use {lfs_url} instead" - ) - elif not (REPO_ROOT / rel_path).exists(): - errors.append( - f" {page_label}: file not found for {href!r}" - f" (expected {rel_path})" - ) - continue - - # Git LFS media URLs: verify the file exists locally instead - # of fetching from GitHub. - if url_without_fragment.startswith(REPO_LFS_PREFIX): - rel_path = url_without_fragment[len(REPO_LFS_PREFIX) :] - if not (REPO_ROOT / rel_path).exists(): - errors.append( - f" {page_label}: file not found for {href!r}" - f" (expected {rel_path})" - ) - continue - - if url_without_fragment in SKIPPED_URLS: - continue - - if domain in CHECKED_DOMAINS: - # For everyrow.io docs links, check against the build - # output first. This avoids a chicken-and-egg problem - # where new pages can't deploy because their canonical - # URL doesn't exist on the live site yet. - if domain == "everyrow.io" and parsed.path.startswith("/docs"): - local_path = parsed.path.rstrip("/") or "/docs" - if local_path in valid_paths or local_path + "/" in valid_paths: - continue - result = check_url(href, url_cache) - if isinstance(result, int) and 200 <= result < 400: - continue - errors.append(f" {page_label}: {href} -> {result}") - continue - - # Unknown URL — not in either list - unknown_urls.append(f" {page_label}: unrecognized external link {href!r}") - continue - # Resolve relative links against the page URL - if not parsed.scheme and not href.startswith("/"): - resolved = urlparse(urljoin(page_url, href)) - elif href.startswith("/"): - resolved = urlparse(f"https://site{href}") - else: - resolved = parsed +def check_markdown_file( + md_file: Path, + valid_paths: set[str], + url_cache: dict[str, int | str], +) -> tuple[list[str], list[str]]: + """Check all links in a markdown file. - path = resolved.path.rstrip("/") or BASE_PATH + Returns (errors, unknown_urls). + """ + file_label = str(md_file.relative_to(REPO_ROOT)) + content = md_file.read_text() + links = extract_markdown_links(content) - if not path.startswith(BASE_PATH): - continue + errors: list[str] = [] + unknown_urls: list[str] = [] + seen_hrefs: set[str] = set() - # Skip static assets - if "/_next/" in path or path.endswith( - (".css", ".js", ".png", ".jpg", ".svg", ".ico") - ): + for href in links: + if href in seen_hrefs: continue - - if path not in valid_paths and path + "/" not in valid_paths: - errors.append(f" {page_label}: broken link {href!r} -> {path}") + seen_hrefs.add(href) + + error, unknown = validate_link( + href, file_label, None, valid_paths, url_cache, md_file + ) + if error: + errors.append(error) + if unknown: + unknown_urls.append(unknown) return errors, unknown_urls def main() -> int: - if not OUT_DIR.exists(): - print(f"Build output not found at {OUT_DIR}") - print("Run 'pnpm build' first.") - return 1 - - valid_paths = get_valid_paths(OUT_DIR) - html_files = [ - f - for f in OUT_DIR.rglob("*.html") - if f.name not in ("404.html", "_not-found.html") - ] + # Check if build output exists for HTML checking + has_build_output = OUT_DIR.exists() + valid_paths: set[str] = set() + html_files: list[Path] = [] + + if has_build_output: + valid_paths = get_valid_paths(OUT_DIR) + html_files = [ + f + for f in OUT_DIR.rglob("*.html") + if f.name not in ("404.html", "_not-found.html") + ] + else: + print(f"Build output not found at {OUT_DIR}, skipping HTML checks") + print("(Run 'pnpm build' to enable HTML link checking)") + print() + + # Find README files to check + readme_files = [] + repo_readme = REPO_ROOT / "README.md" + if repo_readme.exists(): + readme_files.append(repo_readme) url_cache: dict[str, int | str] = {} all_errors: list[str] = [] all_unknown: list[str] = [] + # Check HTML files for html_file in sorted(html_files): errors, unknown = check_file(html_file, valid_paths, url_cache) all_errors.extend(errors) all_unknown.extend(unknown) + # Check README files + for readme_file in sorted(readme_files): + errors, unknown = check_markdown_file(readme_file, valid_paths, url_cache) + all_errors.extend(errors) + all_unknown.extend(unknown) + ok = True if all_unknown: @@ -276,8 +402,15 @@ def main() -> int: checked = sum( 1 for v in url_cache.values() if isinstance(v, int) and 200 <= v < 400 ) + total_files = len(html_files) + len(readme_files) + file_types = [] + if html_files: + file_types.append(f"{len(html_files)} HTML") + if readme_files: + file_types.append(f"{len(readme_files)} markdown") + print( - f"All links OK across {len(html_files)} pages " + f"All links OK across {total_files} files ({', '.join(file_types)}) " f"({checked} external URLs checked, {len(SKIPPED_URLS)} URLs skipped)" ) From 2f6b2a639156d19d1d1182647deb47bac18e7252 Mon Sep 17 00:00:00 2001 From: straeter Date: Fri, 13 Feb 2026 15:51:12 +0200 Subject: [PATCH 2/5] fix broken link in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0eeb7eac..1f5893a3 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ result = await agent_map( print(result.data.head()) ``` -See the API [docs](https://everyrow.io/docs/reference/RESEARCH.md), a case study of [labeling data](https://everyrow.io/docs/classify-dataframe-rows-llm) or a notebook for [researching government data](https://everyrow.io/docs/notebooks/research-and-rank-permit-times) at scale. +See the API [docs](https://everyrow.io/docs/reference/RESEARCH), a case study of [labeling data](https://everyrow.io/docs/classify-dataframe-rows-llm) or a notebook for [researching government data](https://everyrow.io/docs/notebooks/research-and-rank-permit-times) at scale. ## Sessions From 351c5744234eab1136fab1578557bdf5f62d587b Mon Sep 17 00:00:00 2001 From: straeter Date: Fri, 13 Feb 2026 15:56:43 +0200 Subject: [PATCH 3/5] add mcp readme to link check --- docs-site/scripts/check-links.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py index 59ddadf2..0fa32140 100644 --- a/docs-site/scripts/check-links.py +++ b/docs-site/scripts/check-links.py @@ -66,6 +66,7 @@ "https://img.shields.io/badge/python-3.12+-blue.svg", "https://img.shields.io/pypi/v/everyrow.svg", "https://jqlang.org/", + "https://modelcontextprotocol.info/tools/registry/publishing/", "https://opensource.org/licenses/MIT", "https://pypi.org/project/everyrow/", "https://python.org/downloads/", @@ -360,10 +361,10 @@ def main() -> int: print() # Find README files to check - readme_files = [] - repo_readme = REPO_ROOT / "README.md" - if repo_readme.exists(): - readme_files.append(repo_readme) + readme_files = [ + REPO_ROOT / "README.md", + REPO_ROOT / "everyrow-mcp" / "README.md", + ] url_cache: dict[str, int | str] = {} all_errors: list[str] = [] From 923ee7893b4364cb5e45a8e5c4ae8beb5b69cb6b Mon Sep 17 00:00:00 2001 From: straeter Date: Fri, 13 Feb 2026 15:59:38 +0200 Subject: [PATCH 4/5] add docs-site readme --- docs-site/scripts/check-links.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py index 0fa32140..b3ee23de 100644 --- a/docs-site/scripts/check-links.py +++ b/docs-site/scripts/check-links.py @@ -364,6 +364,7 @@ def main() -> int: readme_files = [ REPO_ROOT / "README.md", REPO_ROOT / "everyrow-mcp" / "README.md", + REPO_ROOT / "docs-site" / "README.md", ] url_cache: dict[str, int | str] = {} From 3d9c4dc704fe568c1a7ab0e1325c085941c208a6 Mon Sep 17 00:00:00 2001 From: straeter Date: Fri, 27 Feb 2026 13:18:20 +0100 Subject: [PATCH 5/5] correct image file --- docs-site/scripts/check-links.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs-site/scripts/check-links.py b/docs-site/scripts/check-links.py index 3ae9cdf7..936ad876 100644 --- a/docs-site/scripts/check-links.py +++ b/docs-site/scripts/check-links.py @@ -64,7 +64,7 @@ "https://github.com/anthropics/claude-code/issues/20377", "https://github.com/futuresearch/everyrow-sdk", "https://github.com/futuresearch/everyrow-sdk/releases", - "https://github.com/user-attachments/assets/254fa2ed-c1f3-4ee8-b93d-d169edf32f27", + "https://github.com/user-attachments/assets/8b746b6c-2acb-4591-9328-daebdb472f50", "https://huggingface.co/datasets/fancyzhx/dbpedia_14", "https://huggingface.co/datasets/google-research-datasets/paws", "https://hugovk.github.io/top-pypi-packages/", @@ -82,7 +82,6 @@ "https://www.kaggle.com/datasets/tunguz/pubmed-title-abstracts-2019-baseline", "https://arxiv.org/abs/2506.21558", "https://arxiv.org/abs/2506.06287", - "https://media.githubusercontent.com/media/futuresearch/everyrow-sdk/refs/heads/main/docs/data/fda_products.csv" }