diff --git a/AGENTS.md b/AGENTS.md index 0fd18b8..19573a3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -25,6 +25,11 @@ Self-hosted from Raspberry Pi 4 (4GB) via Coolify behind Cloudflare Tunnel (Full - Full pages can set cookies - but prefer client-side JavaScript to prevent CDN cache pollution - Generate per-user state client-side when possible (e.g., tracking IDs via `crypto.randomUUID()`) instead of server-side templating - keeps HTML cacheable across all users +## SEO Gotchas + +- Generated classifier search pages can look indexable because they have clean canonicals and titles, but if the unique results are only HTMX-loaded, bots may still see mostly boilerplate and defer indexing. +- Time-sensitive SEO claims such as the "current version" of a classifier must come from `CLASSIFIER_CONFIG`, not hardcoded template prose or JSON-LD. + ## Qdrant Index Contract Gotcha - Do not add a Qdrant full-text index to `original_id`. diff --git a/app/static/robots.txt b/app/static/robots.txt index d203c5a..67ecd98 100644 --- a/app/static/robots.txt +++ b/app/static/robots.txt @@ -1,3 +1,6 @@ User-agent: * Allow: / -Sitemap: https://classifast.com/sitemap.xml \ No newline at end of file +Disallow: /*/fragment +Disallow: /*?*top_k= +Disallow: /*?*version= +Sitemap: https://classifast.com/sitemap.xml diff --git a/app/static/sitemap.xml b/app/static/sitemap.xml index 5ca8d3a..7d97ffe 100644 --- a/app/static/sitemap.xml +++ b/app/static/sitemap.xml @@ -71,16 +71,4 @@ https://blog.classifast.com/2025/09/naics-classification/ 0.8 - - https://classifast.com/UNSPSC/laptop-computer/ - 0.64 - - - https://classifast.com/NAICS/short-term-rentals/ - 0.64 - - - https://classifast.com/NACE/pharmacy/ - 0.64 - \ No newline at end of file diff --git a/app/templates/classifier_page.html b/app/templates/classifier_page.html index b6cb60e..6774649 100644 --- a/app/templates/classifier_page.html +++ b/app/templates/classifier_page.html @@ -38,6 +38,7 @@ + @@ -177,7 +178,7 @@ "name": "How often are 
UNSPSC codes updated and what's the latest version?", "acceptedAnswer": { "@type": "Answer", - "text": "UNSPSC codes are updated semi-annually by GS1 US. The current version (v26.0801 from August 2023) contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes." + "text": "UNSPSC codes are updated semi-annually by GS1 US. The current dataset available on this page uses {{ primary_version_label }} and contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes." } } {% elif classifier_type == 'ETIM' %} @@ -742,7 +743,7 @@

{{ heading }}

- {% if trigger_search_on_load and versions %} + {% if should_trigger_initial_results_load and versions %}
on the 8-digit structure.

-

Current Version is UNSPSC UNv260801.1 (18 March - 2025) and it includes over 150,000+ active codes covering products and +

Current Version is {{ primary_version_label }} + and it includes over 150,000 active codes covering products and services across all industries.

diff --git a/app/web.py b/app/web.py index af7bab5..5a6ec4e 100644 --- a/app/web.py +++ b/app/web.py @@ -48,6 +48,123 @@ def slugify(text: str) -> str: return text.strip("_") +def _decode_search_query(search_query: str) -> str: + if not search_query or not search_query.strip(): + return "" + + decoded_search_query = ( + unquote_plus(search_query).rstrip("/").replace("/", " ").replace("_", " ") + ) + decoded_search_query = re.sub(r"\s+", " ", decoded_search_query).strip() + if len(decoded_search_query) > 4000: + decoded_search_query = decoded_search_query[:4000].strip() + return decoded_search_query + + +def _build_classifier_canonical_url( + classifier_type: str, decoded_search_query: str +) -> str: + canonical_url = f"https://classifast.com/{classifier_type}" + if decoded_search_query: + canonical_url += f"/{quote(slugify(decoded_search_query), safe='')}" + if not canonical_url.endswith("/"): + canonical_url += "/" + return canonical_url + + +def _get_default_version(config: dict) -> str: + versions_list = list(config["versions"].keys()) + return versions_list[0] if versions_list else "" + + +def _get_example_query(config: dict) -> str: + return config["example"].replace("Example:", "").strip() + + +def _build_classifier_page_state( + request: Request, + classifier_type: str, + search_query: str, + version: str | None, + top_k: int, +) -> dict: + config = CLASSIFIER_CONFIG[classifier_type] + decoded_search_query = _decode_search_query(search_query) + default_version = _get_default_version(config) + has_version_param = "version" in request.query_params + has_top_k_param = "top_k" in request.query_params + has_any_query_params = bool(request.query_params) + is_generated_search_page = bool(decoded_search_query) + is_variant_url = has_version_param or has_top_k_param + should_ssr_initial_results = ( + not is_generated_search_page and not has_any_query_params + ) + page_robots_directive = ( + "index, follow" + if not is_generated_search_page and not is_variant_url + 
else "noindex, follow" + ) + initial_results_query = decoded_search_query or _get_example_query(config) + + return { + "canonical_url": _build_classifier_canonical_url( + classifier_type, decoded_search_query + ), + "config": config, + "decoded_search_query": decoded_search_query, + "default_version": default_version, + "has_any_query_params": has_any_query_params, + "initial_results_query": initial_results_query, + "is_generated_search_page": is_generated_search_page, + "is_variant_url": is_variant_url, + "page_robots_directive": page_robots_directive, + "selected_version": version or default_version, + "should_ssr_initial_results": should_ssr_initial_results, + "top_k": top_k if 1 <= top_k <= 100 else 10, + } + + +def _build_ssr_results_context( + request: Request, + classifier_type: str, + query: str, + version: str | None, + top_k: int, +) -> dict: + normalized_query = re.sub(r"\s+", " ", query).strip() + if not normalized_query: + return { + "query": "", + "results_for_query": [], + "base_url": "", + "tooltip": "", + "total_request_time": 0, + } + + start_total_time = time.perf_counter() + quantization_cache = getattr(request.app.state, "collection_quantization_cache", {}) + zclient = getattr(request.app.state, "zclient", None) + result = perform_classification( + embed_client=getattr(request.app.state, "embed_client", None), + qdrant_client=getattr(request.app.state, "qdrant_client", None), + query=normalized_query, + classifier_type=classifier_type, + version=version, + top_k=top_k, + quantization_cache=quantization_cache, + zclient=zclient, + ) + total_request_time = time.perf_counter() - start_total_time + + return { + "query": normalized_query, + "results_for_query": result["results"], + "base_url": result["version_config"].get("base_url", ""), + "tooltip": result["version_config"].get("tooltip", ""), + "total_request_time": total_request_time, + } + + # Serve the main homepage @router.get("/", response_class=HTMLResponse) @router.head("/") # Add HEAD 
support @@ -316,38 +433,10 @@ async def show_classifier_page_with_query( # Use the uppercase classifier_type from here effective_classifier_type = upper_type - - # Handle checkout return with token verification - checkout_success = request.query_params.get("checkout") - checkout_token = request.query_params.get("checkout_token") - if checkout_success == "success" and checkout_token: - redis_client = getattr(request.app.state, "redis_client", None) - await verify_checkout_token(checkout_token, request, redis_client) - - # Handle empty search query for base URLs - decoded_search_query = "" - if search_query and search_query.strip(): - decoded_search_query = ( - unquote_plus(search_query).rstrip("/").replace("/", " ").replace("_", " ") - ) - # Normalize internal whitespace (collapse multiple spaces/newlines into single space) - decoded_search_query = re.sub(r"\s+", " ", decoded_search_query).strip() - # Sanitize the decoded query - # Relaxed sanitization: allow characters like apostrophes, but keep length limit - if len(decoded_search_query) > 4000: - decoded_search_query = decoded_search_query[:4000] - decoded_search_query = decoded_search_query.strip() - - # Build canonical URL - # URL-encode slug to handle non-Latin characters in HTTP headers - canonical_url = f"https://classifast.com/{effective_classifier_type}" - if decoded_search_query: - slug = slugify(decoded_search_query) - canonical_url += f"/{quote(slug, safe='')}" - - # Ensure trailing slash for consistency with redirects and sitemap - if not canonical_url.endswith("/"): - canonical_url += "/" + page_state = _build_classifier_page_state( + request, effective_classifier_type, search_query, version, top_k + ) + canonical_url = page_state["canonical_url"] # For HEAD requests, return just headers if request.method == "HEAD": @@ -355,41 +444,51 @@ async def show_classifier_page_with_query( headers["Vary"] = "Accept-Encoding" headers["Content-Type"] = "text/html; charset=utf-8" headers["Link"] = 
f'<{canonical_url}>; rel="canonical"' + headers["X-Robots-Tag"] = page_state["page_robots_directive"] return Response(headers=headers) - # Validate top_k parameter - if top_k < 1 or top_k > 100: - top_k = 10 - - # Get first version for default handling - versions_list = list(config["versions"].keys()) - first_version = versions_list[0] if versions_list else "" + # Handle checkout return with token verification + checkout_success = request.query_params.get("checkout") + checkout_token = request.query_params.get("checkout_token") + if checkout_success == "success" and checkout_token: + redis_client = getattr(request.app.state, "redis_client", None) + await verify_checkout_token(checkout_token, request, redis_client) - # Initialize results data structure results_data = { "results_for_query": [], - "query": decoded_search_query, + "query": page_state["initial_results_query"], "base_url": "", "tooltip": "", "total_request_time": 0, } - - # Determine if we should trigger a search on load - # This is true if we have a URL search query OR if we're falling back to the example - trigger_search_on_load = False - - if decoded_search_query: - trigger_search_on_load = True - else: - # If no search query (base URL), use example query - example_query = config["example"].replace("Example:", "").strip() - if example_query: - results_data["query"] = example_query - trigger_search_on_load = True + used_ssr_initial_results = False + if page_state["should_ssr_initial_results"]: + try: + results_data = _build_ssr_results_context( + request, + effective_classifier_type, + page_state["initial_results_query"], + page_state["selected_version"], + page_state["top_k"], + ) + used_ssr_initial_results = True + except HTTPException as exc: + logger.warning( + "Falling back to HTMX initial load for '%s' landing page after SSR failure: %s", + effective_classifier_type, + exc.detail, + ) + except Exception as e: + logger.warning( + "Falling back to HTMX initial load for '%s' landing page after SSR 
failure: %s", + effective_classifier_type, + e, + ) today = datetime.now() current_year = today.year current_month_name = today.strftime("%B") + primary_version_label = page_state["default_version"] response = templates.TemplateResponse( request, @@ -402,11 +501,21 @@ async def show_classifier_page_with_query( "versions": list(config["versions"].keys()), "example": config["example"], "url_params": { - "search": decoded_search_query, - "version": version if version and version != first_version else "", - "top_k": top_k, + "search": page_state["decoded_search_query"], + "version": ( + version + if version and version != page_state["default_version"] + else "" + ), + "top_k": page_state["top_k"], }, - "trigger_search_on_load": trigger_search_on_load, + "meta_robots_content": page_state["page_robots_directive"], + "primary_version_label": primary_version_label, + "should_ssr_initial_results": used_ssr_initial_results, + "should_trigger_initial_results_load": ( + not used_ssr_initial_results + and bool(page_state["initial_results_query"]) + ), "canonical_url": canonical_url, "current_year": current_year, "current_month_name": current_month_name, @@ -418,6 +527,6 @@ async def show_classifier_page_with_query( response.headers.update(build_cache_headers(HTML_PAGE)) response.headers["Vary"] = "Accept-Encoding" response.headers["Link"] = f'<{canonical_url}>; rel="canonical"' - response.headers["X-Robots-Tag"] = "index, follow" + response.headers["X-Robots-Tag"] = page_state["page_robots_directive"] return response diff --git a/tests/test_classifier_page_seo.py b/tests/test_classifier_page_seo.py new file mode 100644 index 0000000..1929ced --- /dev/null +++ b/tests/test_classifier_page_seo.py @@ -0,0 +1,297 @@ +import unittest +from pathlib import Path +from unittest.mock import AsyncMock, Mock, patch + +import httpx +from fastapi import FastAPI, HTTPException +from fastapi.staticfiles import StaticFiles + +from app.classifier_config import CLASSIFIER_CONFIG +from app.web 
import router + +BASE_DIR = Path(__file__).resolve().parents[1] + + +def _build_test_app() -> FastAPI: + app = FastAPI() + app.mount( + "/static", StaticFiles(directory=BASE_DIR / "app" / "static"), name="static" + ) + app.include_router(router) + app.state.embed_client = object() + app.state.qdrant_client = object() + app.state.collection_quantization_cache = {} + app.state.zclient = None + app.state.redis_client = object() + return app + + +class ClassifierPageSeoTests(unittest.IsolatedAsyncioTestCase): + @classmethod + def setUpClass(cls) -> None: + cls.app = _build_test_app() + cls.classifier_type = "UNSPSC" + cls.primary_version_label = next(iter(CLASSIFIER_CONFIG["UNSPSC"]["versions"])) + + def _classification_result(self) -> dict: + return { + "results": [ + { + "score": 0.97, + "payload": { + "original_id": "43211503", + "class_name": "Laptop computers", + "definition": "Portable laptop computers for business use.", + }, + } + ], + "version_config": { + "base_url": "https://example.com/code/", + "tooltip": "Mock tooltip", + }, + } + + async def _request(self, method: str, path: str, **kwargs) -> httpx.Response: + transport = httpx.ASGITransport(app=self.app) + async with httpx.AsyncClient( + transport=transport, + base_url="http://testserver", + ) as client: + return await client.request(method, path, **kwargs) + + def _assert_landing_page_fallback_shell(self, response: httpx.Response) -> None: + example_query = ( + CLASSIFIER_CONFIG[self.classifier_type]["example"] + .replace("Example:", "") + .strip() + ) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers["X-Robots-Tag"], "index, follow") + self.assertIn('', response.text) + self.assertIn('data-initial-results-loader="true"', response.text) + self.assertIn(example_query, response.text) + self.assertNotIn("Laptop computers", response.text) + self.assertNotIn("Backend services not available", response.text) + self.assertNotIn("boom", response.text) + + def 
_assert_query_param_shell(self, response: httpx.Response, robots: str) -> None: + example_query = ( + CLASSIFIER_CONFIG[self.classifier_type]["example"] + .replace("Example:", "") + .strip() + ) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers["X-Robots-Tag"], robots) + self.assertIn( + f'', + response.text, + ) + self.assertIn( + '', + response.text, + ) + self.assertIn('data-initial-results-loader="true"', response.text) + self.assertIn(example_query, response.text) + self.assertNotIn("Laptop computers", response.text) + + @patch("app.web.perform_classification") + async def test_base_landing_page_is_indexable_and_server_renders_results( + self, + perform_classification_mock: Mock, + ) -> None: + perform_classification_mock.return_value = self._classification_result() + + response = await self._request("GET", f"/{self.classifier_type}/") + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers["X-Robots-Tag"], "index, follow") + self.assertIn('', response.text) + self.assertIn("Laptop computers", response.text) + self.assertNotIn("Loading...", response.text) + self.assertNotIn('data-initial-results-loader="true"', response.text) + + async def test_generated_search_page_is_noindexed_but_keeps_search_flow( + self, + ) -> None: + response = await self._request( + "GET", f"/{self.classifier_type}/laptop-computer/" + ) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers["X-Robots-Tag"], "noindex, follow") + self.assertIn('', response.text) + self.assertIn( + '', + response.text, + ) + self.assertIn('data-initial-results-loader="true"', response.text) + self.assertIn("laptop-computer", response.text) + + async def test_variant_query_param_page_is_noindexed_and_keeps_base_canonical( + self, + ) -> None: + response = await self._request( + "GET", + f"/{self.classifier_type}/", + params={"version": self.primary_version_label}, + ) + + self.assertEqual(response.status_code, 200) + 
self.assertEqual(response.headers["X-Robots-Tag"], "noindex, follow") + self.assertIn('', response.text) + self.assertIn( + '', + response.text, + ) + + @patch("app.web.perform_classification") + async def test_tracking_param_landing_page_skips_ssr_but_stays_indexable( + self, + perform_classification_mock: Mock, + ) -> None: + response = await self._request( + "GET", + f"/{self.classifier_type}/", + params={"utm_source": "google"}, + ) + + self._assert_query_param_shell(response, "index, follow") + perform_classification_mock.assert_not_called() + + @patch("app.web.verify_checkout_token", new_callable=AsyncMock) + @patch("app.web.perform_classification") + async def test_checkout_return_params_skip_ssr_but_still_verify_checkout_token( + self, + perform_classification_mock: Mock, + verify_checkout_token_mock: AsyncMock, + ) -> None: + response = await self._request( + "GET", + f"/{self.classifier_type}/", + params={"checkout": "success", "checkout_token": "test-token"}, + ) + + self._assert_query_param_shell(response, "index, follow") + perform_classification_mock.assert_not_called() + verify_checkout_token_mock.assert_awaited_once() + + @patch("app.web.perform_classification") + async def test_top_k_variant_page_is_noindexed_and_skips_ssr( + self, + perform_classification_mock: Mock, + ) -> None: + response = await self._request( + "GET", + f"/{self.classifier_type}/", + params={"top_k": 30}, + ) + + self._assert_query_param_shell(response, "noindex, follow") + perform_classification_mock.assert_not_called() + + @patch("app.web.perform_classification") + async def test_version_variant_page_is_noindexed_and_skips_ssr( + self, + perform_classification_mock: Mock, + ) -> None: + response = await self._request( + "GET", + f"/{self.classifier_type}/", + params={"version": self.primary_version_label}, + ) + + self._assert_query_param_shell(response, "noindex, follow") + perform_classification_mock.assert_not_called() + + @patch("app.web.increment_usage", 
new_callable=AsyncMock) + @patch("app.web.check_usage", new_callable=AsyncMock) + @patch("app.web.perform_classification") + async def test_landing_page_ssr_does_not_track_usage( + self, + perform_classification_mock: Mock, + check_usage_mock: AsyncMock, + increment_usage_mock: AsyncMock, + ) -> None: + perform_classification_mock.return_value = self._classification_result() + + response = await self._request("GET", f"/{self.classifier_type}/") + + self.assertEqual(response.status_code, 200) + check_usage_mock.assert_not_awaited() + increment_usage_mock.assert_not_awaited() + + @patch("app.web.perform_classification") + async def test_landing_page_falls_back_to_shell_when_ssr_raises_http_exception( + self, + perform_classification_mock: Mock, + ) -> None: + perform_classification_mock.side_effect = HTTPException( + status_code=503, detail="Backend services not available" + ) + + response = await self._request("GET", f"/{self.classifier_type}/") + + self._assert_landing_page_fallback_shell(response) + + @patch("app.web.perform_classification") + async def test_landing_page_falls_back_to_shell_when_ssr_raises_runtime_error( + self, + perform_classification_mock: Mock, + ) -> None: + perform_classification_mock.side_effect = RuntimeError("boom") + + response = await self._request("GET", f"/{self.classifier_type}/") + + self._assert_landing_page_fallback_shell(response) + + @patch("app.web.increment_usage", new_callable=AsyncMock) + @patch("app.web.check_usage", new_callable=AsyncMock) + @patch("app.web.perform_classification") + async def test_landing_page_fallback_still_does_not_track_usage( + self, + perform_classification_mock: Mock, + check_usage_mock: AsyncMock, + increment_usage_mock: AsyncMock, + ) -> None: + perform_classification_mock.side_effect = HTTPException( + status_code=503, detail="Backend services not available" + ) + + response = await self._request("GET", f"/{self.classifier_type}/") + + self._assert_landing_page_fallback_shell(response) + 
check_usage_mock.assert_not_awaited() + increment_usage_mock.assert_not_awaited() + + @patch("app.web.perform_classification") + async def test_unspsc_version_text_comes_from_config( + self, + perform_classification_mock: Mock, + ) -> None: + perform_classification_mock.return_value = self._classification_result() + + response = await self._request("GET", f"/{self.classifier_type}/") + + self.assertEqual(response.status_code, 200) + self.assertIn(self.primary_version_label, response.text) + self.assertNotIn("August 2023", response.text) + + async def test_head_tracking_param_keeps_existing_indexable_policy(self) -> None: + response = await self._request( + "HEAD", + f"/{self.classifier_type}/", + params={"utm_source": "google"}, + ) + + self.assertEqual(response.status_code, 200) + self.assertEqual(response.headers["X-Robots-Tag"], "index, follow") + self.assertEqual( + response.headers["Link"], + '; rel="canonical"', + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_homepage_headers.py b/tests/test_homepage_headers.py index c52d8d1..f03a0d6 100644 --- a/tests/test_homepage_headers.py +++ b/tests/test_homepage_headers.py @@ -75,6 +75,17 @@ async def test_classifier_head_uses_query_specific_canonical_link(self) -> None: response.headers["Link"], '; rel="canonical"', ) + self.assertEqual(response.headers["X-Robots-Tag"], "noindex, follow") + + async def test_classifier_base_head_is_indexable(self) -> None: + response = await self._request("HEAD", "/NAICS/") + + self.assertEqual(response.status_code, 200) + self.assertEqual( + response.headers["Link"], + '; rel="canonical"', + ) + self.assertEqual(response.headers["X-Robots-Tag"], "index, follow") if __name__ == "__main__": diff --git a/tests/test_request_validation_and_metadata.py b/tests/test_request_validation_and_metadata.py index 6953f7f..a71a379 100644 --- a/tests/test_request_validation_and_metadata.py +++ b/tests/test_request_validation_and_metadata.py @@ -1,5 +1,6 @@ import 
unittest from pathlib import Path +from unittest.mock import patch import httpx from fastapi import FastAPI, Request @@ -30,6 +31,10 @@ def _build_web_test_app() -> FastAPI: "/static", StaticFiles(directory=BASE_DIR / "app" / "static"), name="static" ) app.include_router(router) + app.state.embed_client = object() + app.state.qdrant_client = object() + app.state.collection_quantization_cache = {} + app.state.zclient = None return app @@ -93,6 +98,24 @@ def setUpClass(cls) -> None: ) ) + def _classification_result(self) -> dict: + return { + "results": [ + { + "score": 0.92, + "payload": { + "original_id": "123456", + "class_name": "Industrial pumps", + "definition": "Industrial pump manufacturing.", + }, + } + ], + "version_config": { + "base_url": "", + "tooltip": "", + }, + } + async def test_search_page_renders_query_specific_json_ld(self): transport = httpx.ASGITransport(app=self.app) @@ -113,7 +136,11 @@ async def test_search_page_renders_query_specific_json_ld(self): response.text, ) - async def test_base_page_keeps_generic_json_ld(self): + @patch("app.web.perform_classification") + async def test_base_page_keeps_generic_json_ld( + self, perform_classification_mock + ): + perform_classification_mock.return_value = self._classification_result() transport = httpx.ASGITransport(app=self.app) async with httpx.AsyncClient( @@ -146,7 +173,11 @@ async def test_homepage_renders_stable_desktop_auth_slot(self): self.assertIn('data-auth-slot="desktop"', response.text) self.assertIn('id="desktop-auth-container"', response.text) - async def test_classifier_page_renders_stable_desktop_auth_slot(self): + @patch("app.web.perform_classification") + async def test_classifier_page_renders_stable_desktop_auth_slot( + self, perform_classification_mock + ): + perform_classification_mock.return_value = self._classification_result() transport = httpx.ASGITransport(app=self.app) async with httpx.AsyncClient( diff --git a/tests/test_static_headers.py b/tests/test_static_headers.py 
index 526cd35..c11e941 100644 --- a/tests/test_static_headers.py +++ b/tests/test_static_headers.py @@ -1,4 +1,5 @@ import unittest +from pathlib import Path from app.cache_profiles import ( STATIC_CODE, @@ -8,6 +9,8 @@ ) from app.main import get_static_cache_profile, static_file_response +BASE_DIR = Path(__file__).resolve().parents[1] + class StaticHeaderTests(unittest.TestCase): def test_js_assets_use_static_code_profile(self) -> None: @@ -70,6 +73,31 @@ def test_root_static_text_files_use_static_text_profile(self) -> None: ) self.assertEqual(response.headers["Cache-Tag"], "static-files") + def test_robots_txt_disallows_fragment_and_noisy_query_variants(self) -> None: + robots_text = (BASE_DIR / "app" / "static" / "robots.txt").read_text() + + self.assertIn("Disallow: /*/fragment", robots_text) + self.assertIn("Disallow: /*?*top_k=", robots_text) + self.assertIn("Disallow: /*?*version=", robots_text) + + def test_sitemap_excludes_generated_search_urls(self) -> None: + sitemap_text = (BASE_DIR / "app" / "static" / "sitemap.xml").read_text() + + self.assertIn("https://classifast.com/UNSPSC/", sitemap_text) + self.assertIn("https://blog.classifast.com/", sitemap_text) + self.assertNotIn( + "https://classifast.com/UNSPSC/laptop-computer/", + sitemap_text, + ) + self.assertNotIn( + "https://classifast.com/NAICS/short-term-rentals/", + sitemap_text, + ) + self.assertNotIn( + "https://classifast.com/NACE/pharmacy/", + sitemap_text, + ) + if __name__ == "__main__": unittest.main()