Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ Self-hosted from Raspberry Pi 4 (4GB) via Coolify behind Cloudflare Tunnel (Full
- Full pages can set cookies - but prefer client-side JavaScript to prevent CDN cache pollution
- Generate per-user state client-side when possible (e.g., tracking IDs via `crypto.randomUUID()`) instead of server-side templating - keeps HTML cacheable across all users

## SEO Gotchas

- Generated classifier search pages can look indexable because they have clean canonicals and titles, but if the unique results are only HTMX-loaded, bots may still see mostly boilerplate and defer indexing.
- Time-sensitive SEO claims such as the "current version" of a classifier must come from `CLASSIFIER_CONFIG`, not hardcoded template prose or JSON-LD.

## Qdrant Index Contract Gotcha

- Do not add a Qdrant full-text index to `original_id`.
Expand Down
5 changes: 4 additions & 1 deletion app/static/robots.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
User-agent: *
Allow: /
Sitemap: https://classifast.com/sitemap.xml
Disallow: /*/fragment
Disallow: /*?*top_k=
Disallow: /*?*version=
Sitemap: https://classifast.com/sitemap.xml
12 changes: 0 additions & 12 deletions app/static/sitemap.xml
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,4 @@
<loc>https://blog.classifast.com/2025/09/naics-classification/</loc>
<priority>0.8</priority>
</url>
<url>
<loc>https://classifast.com/UNSPSC/laptop-computer/</loc>
<priority>0.64</priority>
</url>
<url>
<loc>https://classifast.com/NAICS/short-term-rentals/</loc>
<priority>0.64</priority>
</url>
<url>
<loc>https://classifast.com/NACE/pharmacy/</loc>
<priority>0.64</priority>
</url>
</urlset>
5 changes: 3 additions & 2 deletions app/templates/classifier_page.html
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="robots" content="{{ meta_robots_content }}">

<link rel="canonical" href="{{ canonical_url }}">
<link rel="home" href="{{ url_for('read_root') }}">
Expand Down Expand Up @@ -177,7 +178,7 @@
"name": "How often are UNSPSC codes updated and what's the latest version?",
"acceptedAnswer": {
"@type": "Answer",
"text": "UNSPSC codes are updated semi-annually by GS1 US. The current version (v26.0801 from August 2023) contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes."
"text": "UNSPSC codes are updated semi-annually by GS1 US. The current dataset available on this page uses {{ primary_version_label }} and contains over 150,000 active codes covering both products and services across all sectors. Updates include new codes for emerging technologies and market changes."
}
}
{% elif classifier_type == 'ETIM' %}
Expand Down Expand Up @@ -742,7 +743,7 @@ <h2>{{ heading }}</h2>
</div>
</section>

{% if trigger_search_on_load and versions %}
{% if should_trigger_initial_results_load and versions %}
<div hx-get="{{ url_for('get_classification_fragment', classifier_type=classifier_type) }}" hx-trigger="load"
hx-target="#results-container" hx-swap="innerHTML" hx-indicator="#loading-indicator"
data-initial-results-loader="true"
Expand Down
4 changes: 2 additions & 2 deletions app/templates/unspsc_guide.html
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ <h3 class="text-lg font-semibold text-gray-800 mb-3 mt-8 flex items-center">
on the 8-digit structure.</p>

<div class="bg-sky-50 border-l-4 border-sky-500 p-4 mb-6 rounded-r-lg">
<p class="text-gray-700"><strong class="text-sky-700">Current Version</strong> is UNSPSC UNv260801.1 (18 March
2025) and it includes over <span class="font-semibold">150,000+</span> active codes covering products and
<p class="text-gray-700"><strong class="text-sky-700">Current Version</strong> is {{ primary_version_label }}
and it includes over <span class="font-semibold">150,000</span> active codes covering products and
services across all industries.</p>
</div>

Expand Down
227 changes: 168 additions & 59 deletions app/web.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,123 @@ def slugify(text: str) -> str:
return text.strip("_")


def _decode_search_query(search_query: str) -> str:
if not search_query or not search_query.strip():
return ""

decoded_search_query = (
unquote_plus(search_query).rstrip("/").replace("/", " ").replace("_", " ")
)
decoded_search_query = re.sub(r"\s+", " ", decoded_search_query).strip()
if len(decoded_search_query) > 4000:
decoded_search_query = decoded_search_query[:4000].strip()
return decoded_search_query


def _build_classifier_canonical_url(
classifier_type: str, decoded_search_query: str
) -> str:
canonical_url = f"https://classifast.com/{classifier_type}"
if decoded_search_query:
canonical_url += f"/{quote(slugify(decoded_search_query), safe='')}"
if not canonical_url.endswith("/"):
canonical_url += "/"
return canonical_url


def _get_default_version(config: dict) -> str:
versions_list = list(config["versions"].keys())
return versions_list[0] if versions_list else ""


def _get_example_query(config: dict) -> str:
return config["example"].replace("Example:", "").strip()


def _build_classifier_page_state(
    request: Request,
    classifier_type: str,
    search_query: str,
    version: str | None,
    top_k: int,
) -> dict:
    """Derive the per-request state used to render a classifier page.

    Args:
        request: Incoming request; only its ``query_params`` are read.
        classifier_type: Key into ``CLASSIFIER_CONFIG``.
        search_query: Raw search query taken from the URL path.
        version: Requested classifier version, if any.
        top_k: Requested number of results.

    Returns:
        A dict with the canonical URL, the classifier config, the decoded
        query, the default and selected versions, indexing/SSR flags, the
        robots directive, the query to use for initial results, and a
        ``top_k`` that falls back to 10 when outside 1..100.

    Raises:
        KeyError: If ``classifier_type`` is not in ``CLASSIFIER_CONFIG``.
    """
    config = CLASSIFIER_CONFIG[classifier_type]
    query_text = _decode_search_query(search_query)
    fallback_version = _get_default_version(config)

    params = request.query_params
    has_params = bool(params)
    is_generated = bool(query_text)
    # "version" / "top_k" only change how results are presented, so URLs
    # carrying them are treated as variants of the page.
    is_variant = "version" in params or "top_k" in params

    # Only the bare landing page (no search slug, no query string at all)
    # gets its initial results rendered server-side.
    ssr_initial_results = not (is_generated or has_params)

    if is_generated or is_variant:
        robots_directive = "noindex, follow"
    else:
        robots_directive = "index, follow"

    # Without a search slug, fall back to the classifier's example query.
    initial_query = query_text or _get_example_query(config)
    bounded_top_k = top_k if 1 <= top_k <= 100 else 10

    return {
        "canonical_url": _build_classifier_canonical_url(
            classifier_type, query_text
        ),
        "config": config,
        "decoded_search_query": query_text,
        "default_version": fallback_version,
        "has_any_query_params": has_params,
        "initial_results_query": initial_query,
        "is_generated_search_page": is_generated,
        "is_variant_url": is_variant,
        "page_robots_directive": robots_directive,
        "selected_version": version or fallback_version,
        "should_ssr_initial_results": ssr_initial_results,
        "top_k": bounded_top_k,
    }


def _build_ssr_results_context(
request: Request,
classifier_type: str,
query: str,
version: str | None,
top_k: int,
) -> dict:
normalized_query = re.sub(r"\s+", " ", query).strip()
if not normalized_query:
return {
"query": "",
"results_for_query": [],
"base_url": "",
"tooltip": "",
"total_request_time": 0,
}

start_total_time = time.perf_counter()
quantization_cache = getattr(request.app.state, "collection_quantization_cache", {})
zclient = getattr(request.app.state, "zclient", None)
result = perform_classification(
embed_client=getattr(request.app.state, "embed_client", None),
qdrant_client=getattr(request.app.state, "qdrant_client", None),
query=normalized_query,
classifier_type=classifier_type,
version=version,
top_k=top_k,
quantization_cache=quantization_cache,
zclient=zclient,
)
total_request_time = time.perf_counter() - start_total_time

return {
"query": normalized_query,
"results_for_query": result["results"],
"base_url": result["version_config"].get("base_url", ""),
"tooltip": result["version_config"].get("tooltip", ""),
"total_request_time": total_request_time,
}


# Serve the main homepage
@router.get("/", response_class=HTMLResponse)
@router.head("/") # Add HEAD support
Expand Down Expand Up @@ -316,80 +433,62 @@ async def show_classifier_page_with_query(

# Use the uppercase classifier_type from here
effective_classifier_type = upper_type

# Handle checkout return with token verification
checkout_success = request.query_params.get("checkout")
checkout_token = request.query_params.get("checkout_token")
if checkout_success == "success" and checkout_token:
redis_client = getattr(request.app.state, "redis_client", None)
await verify_checkout_token(checkout_token, request, redis_client)

# Handle empty search query for base URLs
decoded_search_query = ""
if search_query and search_query.strip():
decoded_search_query = (
unquote_plus(search_query).rstrip("/").replace("/", " ").replace("_", " ")
)
# Normalize internal whitespace (collapse multiple spaces/newlines into single space)
decoded_search_query = re.sub(r"\s+", " ", decoded_search_query).strip()
# Sanitize the decoded query
# Relaxed sanitization: allow characters like apostrophes, but keep length limit
if len(decoded_search_query) > 4000:
decoded_search_query = decoded_search_query[:4000]
decoded_search_query = decoded_search_query.strip()

# Build canonical URL
# URL-encode slug to handle non-Latin characters in HTTP headers
canonical_url = f"https://classifast.com/{effective_classifier_type}"
if decoded_search_query:
slug = slugify(decoded_search_query)
canonical_url += f"/{quote(slug, safe='')}"

# Ensure trailing slash for consistency with redirects and sitemap
if not canonical_url.endswith("/"):
canonical_url += "/"
page_state = _build_classifier_page_state(
request, effective_classifier_type, search_query, version, top_k
)
canonical_url = page_state["canonical_url"]

# For HEAD requests, return just headers
if request.method == "HEAD":
headers = build_cache_headers(HTML_PAGE)
headers["Vary"] = "Accept-Encoding"
headers["Content-Type"] = "text/html; charset=utf-8"
headers["Link"] = f'<{canonical_url}>; rel="canonical"'
headers["X-Robots-Tag"] = page_state["page_robots_directive"]
return Response(headers=headers)

# Validate top_k parameter
if top_k < 1 or top_k > 100:
top_k = 10

# Get first version for default handling
versions_list = list(config["versions"].keys())
first_version = versions_list[0] if versions_list else ""
# Handle checkout return with token verification
checkout_success = request.query_params.get("checkout")
checkout_token = request.query_params.get("checkout_token")
if checkout_success == "success" and checkout_token:
redis_client = getattr(request.app.state, "redis_client", None)
await verify_checkout_token(checkout_token, request, redis_client)

# Initialize results data structure
results_data = {
"results_for_query": [],
"query": decoded_search_query,
"query": page_state["initial_results_query"],
"base_url": "",
"tooltip": "",
"total_request_time": 0,
}

# Determine if we should trigger a search on load
# This is true if we have a URL search query OR if we're falling back to the example
trigger_search_on_load = False

if decoded_search_query:
trigger_search_on_load = True
else:
# If no search query (base URL), use example query
example_query = config["example"].replace("Example:", "").strip()
if example_query:
results_data["query"] = example_query
trigger_search_on_load = True
used_ssr_initial_results = False
if page_state["should_ssr_initial_results"]:
try:
results_data = _build_ssr_results_context(
request,
effective_classifier_type,
page_state["initial_results_query"],
page_state["selected_version"],
page_state["top_k"],
)
used_ssr_initial_results = True
except HTTPException as exc:
logger.warning(
"Falling back to HTMX initial load for '%s' landing page after SSR failure: %s",
effective_classifier_type,
exc.detail,
)
except Exception as e:
logger.warning(
"Falling back to HTMX initial load for '%s' landing page after SSR failure: %s",
effective_classifier_type,
e,
)

today = datetime.now()
current_year = today.year
current_month_name = today.strftime("%B")
primary_version_label = page_state["default_version"]

response = templates.TemplateResponse(
request,
Expand All @@ -402,11 +501,21 @@ async def show_classifier_page_with_query(
"versions": list(config["versions"].keys()),
"example": config["example"],
"url_params": {
"search": decoded_search_query,
"version": version if version and version != first_version else "",
"top_k": top_k,
"search": page_state["decoded_search_query"],
"version": (
version
if version and version != page_state["default_version"]
else ""
),
"top_k": page_state["top_k"],
},
"trigger_search_on_load": trigger_search_on_load,
"meta_robots_content": page_state["page_robots_directive"],
"primary_version_label": primary_version_label,
"should_ssr_initial_results": used_ssr_initial_results,
"should_trigger_initial_results_load": (
not used_ssr_initial_results
and bool(page_state["initial_results_query"])
),
"canonical_url": canonical_url,
"current_year": current_year,
"current_month_name": current_month_name,
Expand All @@ -418,6 +527,6 @@ async def show_classifier_page_with_query(
response.headers.update(build_cache_headers(HTML_PAGE))
response.headers["Vary"] = "Accept-Encoding"
response.headers["Link"] = f'<{canonical_url}>; rel="canonical"'
response.headers["X-Robots-Tag"] = "index, follow"
response.headers["X-Robots-Tag"] = page_state["page_robots_directive"]

return response
Loading