DmitryMatv · DmitryMatv · Mar 17, 2026 · Mar 17, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -25,6 +25,11 @@ Self-hosted from Raspberry Pi 4 (4GB) via Coolify behind Cloudflare Tunnel (Full
 - Full pages can set cookies - but prefer client-side JavaScript to prevent CDN cache pollution
 - Generate per-user state client-side when possible (e.g., tracking IDs via `crypto.randomUUID()`) instead of server-side templating - keeps HTML cacheable across all users
 
+## SEO Gotchas
+
+- Generated classifier search pages can look indexable because they have clean canonicals and titles, but if the unique results are only HTMX-loaded, bots may still see mostly boilerplate and defer indexing.
+- Time-sensitive SEO claims such as the "current version" of a classifier must come from `CLASSIFIER_CONFIG`, not hardcoded template prose or JSON-LD.
+
 ## Qdrant Index Contract Gotcha
 
 - Do not add a Qdrant full-text index to `original_id`.

diff --git a/app/static/robots.txt b/app/static/robots.txt
@@ -1,3 +1,6 @@
 User-agent: *
 Allow: /
-Sitemap: https://classifast.com/sitemap.xml
+Disallow: /*/fragment
+Disallow: /*?*top_k=
+Disallow: /*?*version=
+Sitemap: https://classifast.com/sitemap.xml
diff --git a/app/static/sitemap.xml b/app/static/sitemap.xml
@@ -71,16 +71,4 @@
         <loc>https://blog.classifast.com/2025/09/naics-classification/</loc>
         <priority>0.8</priority>
     </url>
-    <url>
-        <loc>https://classifast.com/UNSPSC/laptop-computer/</loc>
-        <priority>0.64</priority>
-    </url>
-    <url>
-        <loc>https://classifast.com/NAICS/short-term-rentals/</loc>
-        <priority>0.64</priority>
-    </url>
-    <url>
-        <loc>https://classifast.com/NACE/pharmacy/</loc>
-        <priority>0.64</priority>
-    </url>
 </urlset>
diff --git a/app/templates/classifier_page.html b/app/templates/classifier_page.html
@@ -38,6 +38,7 @@
 
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="robots" content="{{ meta_robots_content }}">
 
     <link rel="canonical" href="{{ canonical_url }}">
     <link rel="home" href="{{ url_for('read_root') }}">
@@ -177,7 +178,7 @@
                 "name": "How often are UNSPSC codes updated and what's the latest version?",
                 "acceptedAnswer": {
                     "@type": "Answer",
-                    "text": "UNSPSC codes are updated semi-annually by GS1 US. The current version (v26.0801 from August 2023) contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes."
+                    "text": "UNSPSC codes are updated semi-annually by GS1 US. The current dataset available on this page uses {{ primary_version_label }} and contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes."
                 }
             }
             {% elif classifier_type == 'ETIM' %}
@@ -742,7 +743,7 @@ <h2>{{ heading }}</h2>
             </div>
         </section>
 
-        {% if trigger_search_on_load and versions %}
+        {% if should_trigger_initial_results_load and versions %}
         <div hx-get="{{ url_for('get_classification_fragment', classifier_type=classifier_type) }}" hx-trigger="load"
             hx-target="#results-container" hx-swap="innerHTML" hx-indicator="#loading-indicator"
             data-initial-results-loader="true"

diff --git a/app/templates/unspsc_guide.html b/app/templates/unspsc_guide.html
@@ -122,8 +122,8 @@ <h3 class="text-lg font-semibold text-gray-800 mb-3 mt-8 flex items-center">
         on the 8-digit structure.</p>
 
     <div class="bg-sky-50 border-l-4 border-sky-500 p-4 mb-6 rounded-r-lg">
-        <p class="text-gray-700"><strong class="text-sky-700">Current Version</strong> is UNSPSC UNv260801.1 (18 March
-            2025) and it includes over <span class="font-semibold">150,000+</span> active codes covering products and
+        <p class="text-gray-700"><strong class="text-sky-700">Current Version</strong> is {{ primary_version_label }}
+            and it includes over <span class="font-semibold">150,000</span> active codes covering products and
             services across all industries.</p>
     </div>
 

diff --git a/app/web.py b/app/web.py
@@ -48,6 +48,123 @@ def slugify(text: str) -> str:
     return text.strip("_")
 
 
+def _decode_search_query(search_query: str) -> str:
+    """Turn a raw URL path segment into a normalized, human-readable query.
+
+    Empty or whitespace-only input yields "". Otherwise the segment is
+    percent/plus-decoded, trailing slashes are dropped, remaining slashes and
+    underscores become spaces, whitespace runs collapse to one space, and the
+    result is capped at 4000 characters.
+    """
+    if not (search_query and search_query.strip()):
+        return ""
+
+    text = unquote_plus(search_query).rstrip("/")
+    for separator in ("/", "_"):
+        text = text.replace(separator, " ")
+    # Collapse any whitespace run (spaces, tabs, newlines) into a single space.
+    text = re.sub(r"\s+", " ", text).strip()
+    # Strip again after truncating so the cut never leaves a trailing space.
+    return text if len(text) <= 4000 else text[:4000].strip()
+
+
+def _build_classifier_canonical_url(
+    classifier_type: str, decoded_search_query: str
+) -> str:
+    """Return the canonical classifast.com URL for a classifier page.
+
+    When a decoded query is present, its slug is appended as a percent-encoded
+    path segment. The returned URL always ends with a trailing slash.
+    """
+    url = f"https://classifast.com/{classifier_type}"
+    if decoded_search_query:
+        # safe='' percent-encodes every reserved character in the slug.
+        slug_segment = quote(slugify(decoded_search_query), safe="")
+        url = f"{url}/{slug_segment}"
+    return url if url.endswith("/") else f"{url}/"
+
+
+def _get_default_version(config: dict) -> str:
+    """Return the first key of ``config["versions"]``, or "" when it is empty."""
+    # Take the first key lazily instead of materializing the whole key list.
+    return next(iter(config["versions"].keys()), "")
+
+
+def _get_example_query(config: dict) -> str:
+    """Return ``config["example"]`` with every "Example:" label removed and edges trimmed."""
+    raw_example = config["example"]
+    unlabeled = raw_example.replace("Example:", "")
+    return unlabeled.strip()
+
+
+def _build_classifier_page_state(
+    request: Request,
+    classifier_type: str,
+    search_query: str,
+    version: str | None,
+    top_k: int,
+) -> dict:
+    """Derive the URL-dependent state needed to render a classifier page.
+
+    Returns a dict with the canonical URL, the classifier config, the decoded
+    query, the version/top_k selections, and the flags that drive the robots
+    directive and whether initial results are rendered server-side.
+
+    Raises KeyError if ``classifier_type`` is not in ``CLASSIFIER_CONFIG``.
+    """
+    config = CLASSIFIER_CONFIG[classifier_type]
+    decoded = _decode_search_query(search_query)
+    fallback_version = _get_default_version(config)
+
+    params = request.query_params
+    # Only version/top_k mark the URL as a variant of the page.
+    variant_url = any(name in params for name in ("version", "top_k"))
+    any_params = bool(params)
+    generated_page = bool(decoded)
+
+    # Only the bare landing page (no slug, no variant params) stays indexable.
+    if generated_page or variant_url:
+        robots_directive = "noindex, follow"
+    else:
+        robots_directive = "index, follow"
+
+    # Fall back to the configured example only when the URL carries no query.
+    initial_query = decoded or _get_example_query(config)
+
+    return {
+        "canonical_url": _build_classifier_canonical_url(classifier_type, decoded),
+        "config": config,
+        "decoded_search_query": decoded,
+        "default_version": fallback_version,
+        "has_any_query_params": any_params,
+        "initial_results_query": initial_query,
+        "is_generated_search_page": generated_page,
+        "is_variant_url": variant_url,
+        "page_robots_directive": robots_directive,
+        "selected_version": version or fallback_version,
+        # SSR is limited to the bare landing page: no slug and no query string.
+        "should_ssr_initial_results": not (generated_page or any_params),
+        # Out-of-range top_k silently resets to 10.
+        "top_k": top_k if 1 <= top_k <= 100 else 10,
+    }
+
+
+def _build_ssr_results_context(
+    request: Request,
+    classifier_type: str,
+    query: str,
+    version: str | None,
+    top_k: int,
+) -> dict:
+    """Run a classification server-side and shape it into results data.
+
+    A query that is empty after whitespace normalization short-circuits to an
+    empty result set without calling the classifier. Exceptions raised by
+    ``perform_classification`` are not caught here and propagate to the caller.
+    """
+    normalized_query = re.sub(r"\s+", " ", query).strip()
+    if not normalized_query:
+        # Nothing to classify: hand back an empty result set.
+        return {
+            "query": "",
+            "results_for_query": [],
+            "base_url": "",
+            "tooltip": "",
+            "total_request_time": 0,
+        }
+
+    started_at = time.perf_counter()
+    app_state = request.app.state
+    # NOTE(review): this call is synchronous; if perform_classification does
+    # blocking I/O, calling it from an async handler stalls the loop - confirm.
+    classification = perform_classification(
+        embed_client=getattr(app_state, "embed_client", None),
+        qdrant_client=getattr(app_state, "qdrant_client", None),
+        query=normalized_query,
+        classifier_type=classifier_type,
+        version=version,
+        top_k=top_k,
+        quantization_cache=getattr(app_state, "collection_quantization_cache", {}),
+        zclient=getattr(app_state, "zclient", None),
+    )
+    elapsed = time.perf_counter() - started_at
+
+    hits = classification["results"]
+    version_config = classification["version_config"]
+    return {
+        "query": normalized_query,
+        "results_for_query": hits,
+        "base_url": version_config.get("base_url", ""),
+        "tooltip": version_config.get("tooltip", ""),
+        "total_request_time": elapsed,
+    }
+
+
 # Serve the main homepage
 @router.get("/", response_class=HTMLResponse)
 @router.head("/")  # Add HEAD support
@@ -316,80 +433,62 @@ async def show_classifier_page_with_query(
 
     # Use the uppercase classifier_type from here
     effective_classifier_type = upper_type
-
-    # Handle checkout return with token verification
-    checkout_success = request.query_params.get("checkout")
-    checkout_token = request.query_params.get("checkout_token")
-    if checkout_success == "success" and checkout_token:
-        redis_client = getattr(request.app.state, "redis_client", None)
-        await verify_checkout_token(checkout_token, request, redis_client)
-
-    # Handle empty search query for base URLs
-    decoded_search_query = ""
-    if search_query and search_query.strip():
-        decoded_search_query = (
-            unquote_plus(search_query).rstrip("/").replace("/", " ").replace("_", " ")
-        )
-        # Normalize internal whitespace (collapse multiple spaces/newlines into single space)
-        decoded_search_query = re.sub(r"\s+", " ", decoded_search_query).strip()
-        # Sanitize the decoded query
-        # Relaxed sanitization: allow characters like apostrophes, but keep length limit
-        if len(decoded_search_query) > 4000:
-            decoded_search_query = decoded_search_query[:4000]
-            decoded_search_query = decoded_search_query.strip()
-
-    # Build canonical URL
-    # URL-encode slug to handle non-Latin characters in HTTP headers
-    canonical_url = f"https://classifast.com/{effective_classifier_type}"
-    if decoded_search_query:
-        slug = slugify(decoded_search_query)
-        canonical_url += f"/{quote(slug, safe='')}"
-
-    # Ensure trailing slash for consistency with redirects and sitemap
-    if not canonical_url.endswith("/"):
-        canonical_url += "/"
+    page_state = _build_classifier_page_state(
+        request, effective_classifier_type, search_query, version, top_k
+    )
+    canonical_url = page_state["canonical_url"]
 
     # For HEAD requests, return just headers
     if request.method == "HEAD":
         headers = build_cache_headers(HTML_PAGE)
         headers["Vary"] = "Accept-Encoding"
         headers["Content-Type"] = "text/html; charset=utf-8"
         headers["Link"] = f'<{canonical_url}>; rel="canonical"'
+        headers["X-Robots-Tag"] = page_state["page_robots_directive"]
         return Response(headers=headers)
 
-    # Validate top_k parameter
-    if top_k < 1 or top_k > 100:
-        top_k = 10
-
-    # Get first version for default handling
-    versions_list = list(config["versions"].keys())
-    first_version = versions_list[0] if versions_list else ""
+    # Handle checkout return with token verification
+    checkout_success = request.query_params.get("checkout")
+    checkout_token = request.query_params.get("checkout_token")
+    if checkout_success == "success" and checkout_token:
+        redis_client = getattr(request.app.state, "redis_client", None)
+        await verify_checkout_token(checkout_token, request, redis_client)
 
-    # Initialize results data structure
     results_data = {
         "results_for_query": [],
-        "query": decoded_search_query,
+        "query": page_state["initial_results_query"],
         "base_url": "",
         "tooltip": "",
         "total_request_time": 0,
     }
-
-    # Determine if we should trigger a search on load
-    # This is true if we have a URL search query OR if we're falling back to the example
-    trigger_search_on_load = False
-
-    if decoded_search_query:
-        trigger_search_on_load = True
-    else:
-        # If no search query (base URL), use example query
-        example_query = config["example"].replace("Example:", "").strip()
-        if example_query:
-            results_data["query"] = example_query
-            trigger_search_on_load = True
+    used_ssr_initial_results = False
+    if page_state["should_ssr_initial_results"]:
+        try:
+            results_data = _build_ssr_results_context(
+                request,
+                effective_classifier_type,
+                page_state["initial_results_query"],
+                page_state["selected_version"],
+                page_state["top_k"],
+            )
+            used_ssr_initial_results = True
+        except HTTPException as exc:
+            logger.warning(
+                "Falling back to HTMX initial load for '%s' landing page after SSR failure: %s",
+                effective_classifier_type,
+                exc.detail,
+            )
+        except Exception as e:
+            logger.warning(
+                "Falling back to HTMX initial load for '%s' landing page after SSR failure: %s",
+                effective_classifier_type,
+                e,
+            )
 
     today = datetime.now()
     current_year = today.year
     current_month_name = today.strftime("%B")
+    primary_version_label = page_state["default_version"]
 
     response = templates.TemplateResponse(
         request,
@@ -402,11 +501,21 @@ async def show_classifier_page_with_query(
             "versions": list(config["versions"].keys()),
             "example": config["example"],
             "url_params": {
-                "search": decoded_search_query,
-                "version": version if version and version != first_version else "",
-                "top_k": top_k,
+                "search": page_state["decoded_search_query"],
+                "version": (
+                    version
+                    if version and version != page_state["default_version"]
+                    else ""
+                ),
+                "top_k": page_state["top_k"],
             },
-            "trigger_search_on_load": trigger_search_on_load,
+            "meta_robots_content": page_state["page_robots_directive"],
+            "primary_version_label": primary_version_label,
+            "should_ssr_initial_results": used_ssr_initial_results,
+            "should_trigger_initial_results_load": (
+                not used_ssr_initial_results
+                and bool(page_state["initial_results_query"])
+            ),
             "canonical_url": canonical_url,
             "current_year": current_year,
             "current_month_name": current_month_name,
@@ -418,6 +527,6 @@ async def show_classifier_page_with_query(
     response.headers.update(build_cache_headers(HTML_PAGE))
     response.headers["Vary"] = "Accept-Encoding"
     response.headers["Link"] = f'<{canonical_url}>; rel="canonical"'
-    response.headers["X-Robots-Tag"] = "index, follow"
+    response.headers["X-Robots-Tag"] = page_state["page_robots_directive"]
 
     return response