From dd141640e848de5f5cc4e1d51870cfffbcebaddb Mon Sep 17 00:00:00 2001 From: Builder Date: Wed, 13 May 2026 15:46:50 +0000 Subject: [PATCH] Fix user-visible mirror URL leaks --- docs/url-realism-audit.md | 103 ++++++++++++++++ scripts/check_url_realism.py | 115 ++++++++++++++++++ sites/allrecipes/app.py | 22 +++- sites/allrecipes/templates/recipe_detail.html | 2 +- sites/bbc_news/app.py | 7 ++ sites/bbc_news/templates/article_detail.html | 2 +- sites/booking/app.py | 24 +++- sites/booking/templates/deals.html | 2 +- sites/booking/templates/index.html | 2 +- sites/booking/templates/property_detail.html | 2 +- sites/booking/templates/stays.html | 2 +- sites/github/app.py | 2 +- sites/google_map/app.py | 17 +++ sites/google_map/seed_data.py | 13 +- sites/google_map/templates/place_detail.html | 8 +- 15 files changed, 300 insertions(+), 23 deletions(-) create mode 100644 docs/url-realism-audit.md create mode 100644 scripts/check_url_realism.py diff --git a/docs/url-realism-audit.md b/docs/url-realism-audit.md new file mode 100644 index 0000000..6320e90 --- /dev/null +++ b/docs/url-realism-audit.md @@ -0,0 +1,103 @@ +# URL Realism Audit + +## Scope + +This audit covers user-visible URL surfaces in WebHarbor mirror sites: share +links, copy-link buttons, hidden post-action return URLs, and middleware that +rewrites external hosts into a local mirror. Benchmark task entry points are +out of scope because `sites/*/tasks.jsonl` intentionally points agents at +`http://localhost:40000` through `http://localhost:40014`. + +## Findings + +### Google Map Share And Website URLs + +The Google Map place detail share box rendered a local mirror URL: + +```text +http://localhost:40008/place/ +``` + +The same site seeded many place website rows as `https://example.com/`. +Both values could be exposed in the user-facing place detail page. + +Fix: + +- The share box now uses a real Google Maps place URL: + +```text +https://www.google.com/maps/place// +``` + +- Future Google Map seed data writes Google Maps place URLs instead of + `example.com` placeholders. +- Runtime rendering falls back to the Google Maps place URL when an existing + packaged seed database still contains an `example.com` placeholder. + +### Booking And Allrecipes Return URLs + +Several save buttons wrote `{{ request.url }}` into hidden `next` inputs. In a +local mirror this serializes an absolute local URL such as +`http://localhost:40005/...` into the DOM. That URL is not a real upstream URL +and also makes post-action redirects trust host-derived input. + +Fix: + +- Booking and Allrecipes now render relative return paths via + `current_relative_url()`. +- Their post-action redirects validate `next` values with + `safe_redirect_target()` and only allow root-relative paths. + +### BBC News Share URL + +The BBC News article share button copied `window.location.href`, which copies +the local mirror article URL. The Article rows already carry a `source_url` +field from the source dataset. + +Fix: + +- Article detail now passes `article_share_url` to the template. +- The copy button writes the real `source_url`, or a BBC article fallback URL + when no source URL exists. + +### GitHub External-Host Recovery + +The GitHub mirror has middleware for agents that accidentally request +`github.com` with this Flask app as the backend. It previously redirected to a +fixed local mirror URL: + +```text +http://localhost:40006 +``` + +That is brittle outside the default port layout and leaks a local address when +the app is mounted under another host or port. + +Fix: + +- The middleware now redirects to the same relative path on the current host. + +## Intentional Matches + +These URL-looking values are intentional and should not be "fixed": + +- `sites/*/tasks.jsonl`: local mirror entry URLs for benchmark agents. +- `README.md`, `AGENTS.md`, `CONTRIBUTING.md`, `Dockerfile`, and + `site_runner.py`: local runtime and container wiring. +- Form placeholders such as `you@example.com` and editable profile placeholders + such as `https://example.com`. +- GitHub Codespaces terminal copy that says a dev server listens on + `http://localhost:3000`. +- JavaScript that uses `window.location.href` only to update the current page's + query string or navigate to a local route. + +## Regression Rule + +Run this before merging URL-related changes: + +```bash +python3 scripts/check_url_realism.py +``` + +The check guards the known user-visible leaks while leaving benchmark/runtime +localhost contracts intact. diff --git a/scripts/check_url_realism.py b/scripts/check_url_realism.py new file mode 100644 index 0000000..bf77719 --- /dev/null +++ b/scripts/check_url_realism.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Regression checks for user-visible URL realism across WebHarbor sites.""" + +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] + +CHECKS = [ + ( + "Google Map share UI must not expose the local mirror port", + ROOT / "sites/google_map/templates/place_detail.html", + ["http://localhost:40008", "request.url"], + ), + ( + "Google Map seeds must not write example.com place websites", + ROOT / "sites/google_map/seed_data.py", + [ + 'website=f"https://example.com/{slug}"', + 'website=f"https://example.com/{syn_slug}"', + 'kwargs.pop("website", f"https://example.com/{slug}")', + ], + ), + ( + "BBC article share should not copy the mirror location", + ROOT / "sites/bbc_news/templates/article_detail.html", + ["navigator.clipboard.writeText(window.location.href)"], + ), + ( + "GitHub external-host recovery must not redirect to a fixed local port", + ROOT / "sites/github/app.py", + ['redirect(f"http://localhost:40006{target}"'], + ), +] + +REQUIRED = [ + ( + "Google Map runtime URL helpers", + ROOT / "sites/google_map/app.py", + ["def google_maps_place_url(place):", "def display_place_website(place):"], + ), + ( + "BBC real share URL helper", + ROOT / "sites/bbc_news/app.py", + ["def bbc_article_share_url(article):", "article_share_url=bbc_article_share_url(art)"], + ), + ( + "Allrecipes relative next helper", + ROOT / "sites/allrecipes/app.py", + ["def safe_redirect_target(target", "def current_relative_url():"], + ), + ( + "Booking relative next helper", + ROOT / "sites/booking/app.py", + ["def safe_redirect_target(target", "def current_relative_url():"], + ), +] + +RELATIVE_NEXT_TEMPLATES = [ + ROOT / "sites/allrecipes/templates/recipe_detail.html", + ROOT / "sites/booking/templates/index.html", + ROOT / "sites/booking/templates/deals.html", + ROOT / "sites/booking/templates/stays.html", + ROOT / "sites/booking/templates/property_detail.html", +] + + +def check_forbidden(): + failed = False + for label, path, needles in CHECKS: + text = path.read_text() + for needle in needles: + if needle in text: + print(f"{label}: forbidden pattern in {path.relative_to(ROOT)}: {needle}") + failed = True + return failed + + +def check_required(): + failed = False + for label, path, needles in REQUIRED: + text = path.read_text() + for needle in needles: + if needle not in text: + print(f"{label}: missing required pattern in {path.relative_to(ROOT)}: {needle}") + failed = True + return failed + + +def check_relative_next_templates(): + failed = False + for path in RELATIVE_NEXT_TEMPLATES: + text = path.read_text() + if 'name="next" value="{{ request.url }}"' in text: + print(f"absolute request.url next value in {path.relative_to(ROOT)}") + failed = True + if 'name="next" value="{{ current_relative_url() }}"' not in text: + print(f"missing current_relative_url() next value in {path.relative_to(ROOT)}") + failed = True + return failed + + +def main(): + failed = False + failed |= check_forbidden() + failed |= check_required() + failed |= check_relative_next_templates() + + if failed: + raise SystemExit(1) + print("URL realism checks passed") + + +if __name__ == "__main__": + main() diff --git a/sites/allrecipes/app.py b/sites/allrecipes/app.py index ac2198b..e4966b3 100644 --- a/sites/allrecipes/app.py +++ b/sites/allrecipes/app.py @@ -230,7 +230,21 @@ def is_in_recipe_box(recipe_id): user_id=current_user.id, recipe_id=recipe_id).first() is not None return False - return dict(recipe_box_count=recipe_box_count, is_in_recipe_box=is_in_recipe_box) + def current_relative_url(): + path = request.full_path.rstrip('?') + return path or url_for('index') + + return dict( + recipe_box_count=recipe_box_count, + is_in_recipe_box=is_in_recipe_box, + current_relative_url=current_relative_url, + ) + + +def safe_redirect_target(target, default_endpoint='index'): + if target and target.startswith('/') and not target.startswith('//'): + return target + return url_for(default_endpoint) # --------------------------------------------------------------------------- @@ -1113,7 +1127,7 @@ def login(): login_user(user, remember=request.form.get('remember')) flash('Welcome back!', 'success') next_page = request.args.get('next') - return redirect(next_page or url_for('index')) + return redirect(safe_redirect_target(next_page)) flash('Invalid email or password.', 'danger') return render_template('login.html') @@ -1275,8 +1289,8 @@ def save_to_recipe_box(recipe_id): flash(f'"{recipe.title}" saved to your Recipe Box.', 'success') else: flash(f'"{recipe.title}" is already in your Recipe Box.', 'info') - next_page = request.form.get('next') or request.referrer or url_for('recipe_box') - return redirect(next_page) + next_page = request.form.get('next') + return redirect(safe_redirect_target(next_page, 'recipe_box')) @app.route('/recipe-box/note/', methods=['POST']) diff --git a/sites/allrecipes/templates/recipe_detail.html b/sites/allrecipes/templates/recipe_detail.html index b66cb8b..bb20ef0 100644 --- a/sites/allrecipes/templates/recipe_detail.html +++ b/sites/allrecipes/templates/recipe_detail.html @@ -32,7 +32,7 @@

{{ recipe.title }}

{% if not is_in_recipe_box(recipe.id) %}
- +
{% endif %} diff --git a/sites/bbc_news/app.py b/sites/bbc_news/app.py index eb28068..d656657 100644 --- a/sites/bbc_news/app.py +++ b/sites/bbc_news/app.py @@ -48,6 +48,12 @@ csrf = CSRFProtect(app) +def bbc_article_share_url(article): + if article.source_url: + return article.source_url + return f"https://www.bbc.com/news/articles/{article.slug}" + + # ======================================================================= # MODELS # ======================================================================= @@ -1068,6 +1074,7 @@ def article_detail(slug): return render_template( "article_detail.html", article=art, + article_share_url=bbc_article_share_url(art), article_gallery=article_gallery, related=related, more_articles=more_articles, diff --git a/sites/bbc_news/templates/article_detail.html b/sites/bbc_news/templates/article_detail.html index eeb088d..a8dc1d1 100644 --- a/sites/bbc_news/templates/article_detail.html +++ b/sites/bbc_news/templates/article_detail.html @@ -212,7 +212,7 @@

{{ article.category.name }}

{% block extra_js %}