diff --git a/pipeline/quantity_cleaning.ipynb b/pipeline/quantity_cleaning.ipynb
new file mode 100644
index 0000000..98ac767
--- /dev/null
+++ b/pipeline/quantity_cleaning.ipynb
@@ -0,0 +1,3470 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "77c013ea",
+ "metadata": {},
+ "source": [
+ "# Quantity Cleaning\n",
+ "\n",
+ "This notebook builds one final cleaned output view named `quantity_cleaned`.\n",
+ "\n",
+ "The source parquet is never modified. The notebook reads the raw quantity fields, applies the cleaning rules, and produces:\n",
+ "\n",
+ "- `quantity_cleaned` as the only final output view\n",
+ "- `quantity_cleaning_debug` as an optional internal review view\n",
+ "- `quantity_change_audit` as an optional internal change-audit view\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f842ea2",
+ "metadata": {},
+ "source": [
+ "## Notebook Flow\n",
+ "\n",
+ "The notebook follows a simple layout:\n",
+ "\n",
+ "1. Load the raw quantity fields without changing source data.\n",
+ "2. Build an internal reference snapshot only for audit checks.\n",
+ "3. Set up the current helper rules, aliases, and safe OCR cleanup.\n",
+ "4. Parse the free-text `quantity` field and compare it with the structured fields.\n",
+ "5. Produce one final cleaned view: `quantity_cleaned`.\n",
+ "6. Run optional debug and audit tables so we can understand unresolved, conflict, and rule-impact cases.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "28f62510",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell imports the small Python helpers we need around DuckDB.\n",
+ "# DuckDB does the parsing and cleaning work. Python is only used to:\n",
+ "# - locate the parquet file\n",
+ "# - build SQL snippets from the approved alias lists\n",
+ "# - display query results in notebook-friendly tables\n",
+ "\n",
+ "import json\n",
+ "import re\n",
+ "from pathlib import Path\n",
+ "\n",
+ "import duckdb\n",
+ "from IPython.display import display\n",
+ "\n",
+ "\n",
+ "PARQUET_CANDIDATES = [  # tried in order; the first path that exists is used\n",
+ " Path(\"../data/raw/off-canada.parquet\"),\n",
+ " Path(\"data/raw/off-canada.parquet\"),\n",
+ " Path(\"../data/off-canada.parquet\"),\n",
+ " Path(\"data/off-canada.parquet\"),\n",
+ "]\n",
+ "\n",
+ "SOURCE_COLUMNS = [\"code\", \"product_quantity_unit\", \"product_quantity\", \"quantity\"]  # raw columns selected into quantity_raw\n",
+ "VALUE_MATCH_TOLERANCE = 0.02  # relative tolerance for value comparisons; the SQL also applies a 0.01 absolute floor\n",
+ "ARTICLE_TOKENS = {\"a\", \"an\", \"un\", \"une\"}  # NOTE(review): the descriptor regexes spell these articles out inline; confirm this set is still used\n",
+ "\n",
+ "# Reviewed unit aliases only. These are deterministic and map to canonical base units.\n",
+ "MEASURE_ALIASES = [\n",
+ " (\"g\", \"g\", \"mass\", 1.0),\n",
+ " (\"gram\", \"g\", \"mass\", 1.0),\n",
+ " (\"grams\", \"g\", \"mass\", 1.0),\n",
+ " (\"gramme\", \"g\", \"mass\", 1.0),\n",
+ " (\"grammes\", \"g\", \"mass\", 1.0),\n",
+ " (\"kg\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilogram\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilograms\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilogramme\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilogrammes\", \"g\", \"mass\", 1000.0),\n",
+ " (\"oz\", \"g\", \"mass\", 28.349523125),\n",
+ " (\"lb\", \"g\", \"mass\", 453.59237),\n",
+ " (\"lbs\", \"g\", \"mass\", 453.59237),\n",
+ " (\"ml\", \"ml\", \"volume\", 1.0),\n",
+ " (\"millilitre\", \"ml\", \"volume\", 1.0),\n",
+ " (\"millilitres\", \"ml\", \"volume\", 1.0),\n",
+ " (\"milliliter\", \"ml\", \"volume\", 1.0),\n",
+ " (\"milliliters\", \"ml\", \"volume\", 1.0),\n",
+ " (\"l\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"litre\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"litres\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"liter\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"liters\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"lit\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"cl\", \"ml\", \"volume\", 10.0),\n",
+ " (\"dl\", \"ml\", \"volume\", 100.0),\n",
+ " (\"fl oz\", \"ml\", \"volume\", 29.5735295625),\n",
+ " (\"kj\", \"kj\", \"energy\", 1.0),\n",
+ " (\"kcal\", \"kcal\", \"energy\", 1.0),\n",
+ " (\"cal\", \"cal\", \"energy\", 1.0),\n",
+ "]\n",
+ "\n",
+ "# Count descriptors describe discrete items whose quantity is meaningful by count.\n",
+ "COUNT_DESCRIPTOR_ALIASES = [\n",
+ " (\"pc\", \"piece\", \"count\"),\n",
+ " (\"pcs\", \"piece\", \"count\"),\n",
+ " (\"piece\", \"piece\", \"count\"),\n",
+ " (\"pieces\", \"piece\", \"count\"),\n",
+ " (\"ea\", \"piece\", \"count\"),\n",
+ " (\"each\", \"piece\", \"count\"),\n",
+ " (\"tablet\", \"tablet\", \"count\"),\n",
+ " (\"tablets\", \"tablet\", \"count\"),\n",
+ " (\"tablilla\", \"tablet\", \"count\"),\n",
+ " (\"tablillas\", \"tablet\", \"count\"),\n",
+ " (\"capsule\", \"capsule\", \"count\"),\n",
+ " (\"capsules\", \"capsule\", \"count\"),\n",
+ " (\"cap\", \"capsule\", \"count\"),\n",
+ " (\"caps\", \"capsule\", \"count\"),\n",
+ " (\"sachet\", \"sachet\", \"count\"),\n",
+ " (\"sachets\", \"sachet\", \"count\"),\n",
+ " (\"morceau\", \"piece\", \"count\"),\n",
+ " (\"morceaux\", \"piece\", \"count\"),\n",
+ " (\"bottle\", \"bottle\", \"count\"),\n",
+ " (\"bottles\", \"bottle\", \"count\"),\n",
+ " (\"bouteille\", \"bottle\", \"count\"),\n",
+ " (\"bouteilles\", \"bottle\", \"count\"),\n",
+ " (\"can\", \"can\", \"count\"),\n",
+ " (\"cans\", \"can\", \"count\"),\n",
+ " (\"canette\", \"can\", \"count\"),\n",
+ " (\"canettes\", \"can\", \"count\"),\n",
+ "]\n",
+ "\n",
+ "# Packaging descriptors are meaningful, but by themselves they do not give a comparable size.\n",
+ "PACKAGING_DESCRIPTOR_ALIASES = [\n",
+ " (\"box\", \"box\", \"packaging_only\"),\n",
+ " (\"boxes\", \"box\", \"packaging_only\"),\n",
+ " (\"boite\", \"box\", \"packaging_only\"),\n",
+ " (\"boites\", \"box\", \"packaging_only\"),\n",
+ " (\"pack\", \"pack\", \"packaging_only\"),\n",
+ " (\"packs\", \"pack\", \"packaging_only\"),\n",
+ " (\"paquet\", \"pack\", \"packaging_only\"),\n",
+ " (\"paquets\", \"pack\", \"packaging_only\"),\n",
+ " (\"carton\", \"carton\", \"packaging_only\"),\n",
+ " (\"cartons\", \"carton\", \"packaging_only\"),\n",
+ " (\"case\", \"case\", \"packaging_only\"),\n",
+ " (\"cases\", \"case\", \"packaging_only\"),\n",
+ " (\"caisse\", \"case\", \"packaging_only\"),\n",
+ " (\"caisses\", \"case\", \"packaging_only\"),\n",
+ " (\"tray\", \"tray\", \"packaging_only\"),\n",
+ " (\"trays\", \"tray\", \"packaging_only\"),\n",
+ "]\n",
+ "\n",
+ "# Household and serving units are intentionally kept out of package-size normalization.\n",
+ "HOUSEHOLD_ALIASES = [\n",
+ " (\"tsp\", \"tsp\"),\n",
+ " (\"tbsp\", \"tbsp\"),\n",
+ " (\"cup\", \"cup\"),\n",
+ " (\"cups\", \"cup\"),\n",
+ " (\"tasse\", \"cup\"),\n",
+ " (\"tasses\", \"cup\"),\n",
+ " (\"portion\", \"portion\"),\n",
+ " (\"portions\", \"portion\"),\n",
+ " (\"serving\", \"serving\"),\n",
+ " (\"servings\", \"serving\"),\n",
+ "]\n",
+ "\n",
+ "PLACEHOLDER_TERMS = [\n",
+ " \"unknown\",\n",
+ " \"unknown quantity\",\n",
+ " \"n/a\",\n",
+ " \"none\",\n",
+ " \"not labeled!\",\n",
+ " \"good\",\n",
+ " \"bonne\",\n",
+ " \"bn batouta\",\n",
+ "]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b784d10",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell defines the small helper functions used by the notebook.\n",
+ "# They keep the SQL readable while making the logic reproducible.\n",
+ "\n",
+ "def resolve_parquet_path(candidates: list[Path]) -> Path:\n",
+ "    \"\"\"Return the resolved path of the first candidate that exists; raise FileNotFoundError if none does.\"\"\"\n",
+ "    first_existing = next((path for path in candidates if path.exists()), None)\n",
+ "    if first_existing is not None:\n",
+ "        return first_existing.resolve()\n",
+ "    raise FileNotFoundError(\"Could not find the OFF Canada parquet. Checked:\\n\" + \"\\n\".join(map(str, candidates)))\n",
+ "\n",
+ "\n",
+ "def sql_quote(value: str) -> str:\n",
+ "    return \"'{}'\".format(value.replace(\"'\", \"''\"))  # SQL string literal: embedded quotes are doubled\n",
+ "\n",
+ "\n",
+ "def build_token_pattern(tokens: list[str]) -> str:\n",
+ "    \"\"\"Return a non-capturing regex alternation of the unique `tokens`, longest first, ties alphabetical.\"\"\"\n",
+ "    # The alphabetical tie-break keeps the pattern text identical across runs (set iteration order is hash-dependent).\n",
+ "    ordered_tokens = sorted(set(tokens), key=lambda token: (-len(token), token))\n",
+ "    # re.escape writes a space as \"\\ \"; swapping it for \\s* lets \"fl oz\" also match \"floz\" or \"fl  oz\".\n",
+ "    return \"(?:\" + \"|\".join(re.escape(token).replace(r\"\\ \", r\"\\s*\") for token in ordered_tokens) + \")\"\n",
+ "\n",
+ "\n",
+ "def normalize_sql(expr: str) -> str:\n",
+ "    \"\"\"Build SQL text that normalizes the text expression `expr` for pattern matching.\n",
+ "\n",
+ "    Args:\n",
+ "        expr: SQL expression (for example a column name), spliced in verbatim.\n",
+ "\n",
+ "    Returns:\n",
+ "        SQL that coalesces NULL to '', trims, lowercases, folds apostrophe variants,\n",
+ "        the multiplication sign and the listed accents to ASCII, then collapses each\n",
+ "        whitespace run to a single space.\n",
+ "    \"\"\"\n",
+ "    character_folds = {\n",
+ "        \"’\": \"'\", \"`\": \"'\", \"´\": \"'\",\n",
+ "        \"×\": \"x\",\n",
+ "        \"é\": \"e\", \"è\": \"e\", \"ê\": \"e\", \"ë\": \"e\",\n",
+ "        \"à\": \"a\", \"â\": \"a\", \"ä\": \"a\",\n",
+ "        \"î\": \"i\", \"ï\": \"i\",\n",
+ "        \"ô\": \"o\", \"ö\": \"o\",\n",
+ "        \"ù\": \"u\", \"û\": \"u\", \"ü\": \"u\",\n",
+ "        \"ç\": \"c\",\n",
+ "    }\n",
+ "    sql_text = f\"lower(trim(coalesce({expr}, '')))\"\n",
+ "    # Each fold wraps the expression built so far, so the first entry ends up innermost.\n",
+ "    for source_char, ascii_char in character_folds.items():\n",
+ "        sql_text = f\"replace({sql_text}, {sql_quote(source_char)}, {sql_quote(ascii_char)})\"\n",
+ "    # NOTE(review): no-break spaces are not folded here; confirm whether the regex engine's \\s covers them.\n",
+ "    return f\"regexp_replace({sql_text}, '\\\\s+', ' ', 'g')\"\n",
+ "\n",
+ "\n",
+ "def values_sql(rows: list[tuple]) -> str:\n",
+ "    return \",\\n\".join(\"(%s)\" % \", \".join([sql_quote(cell) if isinstance(cell, str) else str(cell) for cell in row]) for row in rows)  # strings are quoted, other values use str()\n",
+ "\n",
+ "\n",
+ "def show_query(title: str, query: str, limit: int | None = None) -> None:\n",
+ "    \"\"\"Print `title`, then display the result of `query` (with `LIMIT {limit}` appended when a limit is given).\"\"\"\n",
+ "    print(title)\n",
+ "    display(con.sql(query if limit is None else f\"{query}\\nLIMIT {limit}\").df())\n",
+ "\n",
+ "\n",
+ "# Locate the data file and open the DuckDB connection used by every later cell.\n",
+ "PARQUET_PATH = resolve_parquet_path(PARQUET_CANDIDATES)\n",
+ "con = duckdb.connect()\n",
+ "\n",
+ "# Count and packaging descriptors share one lookup view.\n",
+ "descriptor_rows = COUNT_DESCRIPTOR_ALIASES + PACKAGING_DESCRIPTOR_ALIASES\n",
+ "\n",
+ "# The alias lists are exposed to SQL as temp views so the parsing queries can join on `token`.\n",
+ "# The statement text is shared; only the view name, the rows, and the column list vary.\n",
+ "_ALIAS_VIEW_TEMPLATE = (\n",
+ "    \"\\n\"\n",
+ "    \" CREATE OR REPLACE TEMP VIEW {view_name} AS\\n\"\n",
+ "    \" SELECT *\\n\"\n",
+ "    \" FROM (VALUES\\n\"\n",
+ "    \" {value_rows}\\n\"\n",
+ "    \" ) AS alias_rows({column_list});\\n\"\n",
+ "    \" \"\n",
+ ")\n",
+ "\n",
+ "# (view name, alias rows, column list), listed in creation order.\n",
+ "_ALIAS_VIEW_SPECS = [\n",
+ "    (\"measure_aliases\", MEASURE_ALIASES, \"token, base_unit, quantity_category, factor\"),\n",
+ "    (\"descriptor_aliases\", descriptor_rows, \"token, item_descriptor, quantity_category\"),\n",
+ "    (\"household_aliases\", HOUSEHOLD_ALIASES, \"token, item_descriptor\"),\n",
+ "]\n",
+ "\n",
+ "for _view_name, _alias_rows, _column_list in _ALIAS_VIEW_SPECS:\n",
+ "    con.execute(\n",
+ "        _ALIAS_VIEW_TEMPLATE.format(\n",
+ "            view_name=_view_name,\n",
+ "            value_rows=values_sql(_alias_rows),\n",
+ "            column_list=_column_list,\n",
+ "        )\n",
+ "    )\n",
+ "\n",
+ "NUMBER_PATTERN = r\"[0-9]+(?:[.,][0-9]+)?\"  # digits with an optional decimal part after . or ,\n",
+ "FRACTION_OR_NUMBER_PATTERN = rf\"(?:[0-9]+/[0-9]+|{NUMBER_PATTERN})\"  # a/b fraction or a plain number\n",
+ "MEASURE_PATTERN = build_token_pattern([row[0] for row in MEASURE_ALIASES])\n",
+ "COUNT_DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in COUNT_DESCRIPTOR_ALIASES])\n",
+ "PACKAGING_DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in PACKAGING_DESCRIPTOR_ALIASES])\n",
+ "DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in descriptor_rows])\n",
+ "HOUSEHOLD_PATTERN = build_token_pattern([row[0] for row in HOUSEHOLD_ALIASES])\n",
+ "MEASURE_ANY_REGEX = rf\"\\b{MEASURE_PATTERN}\\b\"  # measure token anywhere, delimited by word boundaries\n",
+ "HOUSEHOLD_ANY_REGEX = rf\"\\b{HOUSEHOLD_PATTERN}\\b\"  # household token anywhere, delimited by word boundaries\n",
+ "PLACEHOLDER_PATTERN = build_token_pattern(PLACEHOLDER_TERMS)\n",
+ "PLACEHOLDER_REGEX = rf\"^(?:{PLACEHOLDER_PATTERN}|\\?+)$\"  # whole text is a placeholder term or only question marks\n",
+ "\n",
+ "SIMPLE_MEASURE_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\s*[.]?\\s*$\"  # whole text: number + measure, optional trailing period\n",
+ "MULTIPACK_MEASURE_REGEX = rf\"({NUMBER_PATTERN})\\s*[*x]\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\"  # unanchored: count, * or x, value, measure\n",
+ "DESCRIPTOR_MULTIPACK_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)\\s*({DESCRIPTOR_PATTERN})\\b\\s*(?:de|of|[*x])\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b(?:\\s*(?:chacun|chacune|each))?\\s*[.]?\\s*$\"  # whole text: count/article, descriptor, de/of/*/x, value, measure, optional chacun/chacune/each\n",
+ "DESCRIPTOR_WITH_MEASURE_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)?\\s*({DESCRIPTOR_PATTERN})\\b.*?({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\"  # starts with optional count/article + descriptor, later followed by value + measure\n",
+ "DESCRIPTOR_ONLY_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)?\\s*({DESCRIPTOR_PATTERN})\\b.*$\"  # starts with optional count/article + descriptor; the rest is ignored\n",
+ "PER_PACKAGING_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s+par\\s+({PACKAGING_DESCRIPTOR_PATTERN})\\b\"  # starts with: number, the word par, packaging descriptor\n",
+ "HOUSEHOLD_REGEX = rf\"^\\s*({FRACTION_OR_NUMBER_PATTERN})\\s*({HOUSEHOLD_PATTERN})\\b\"  # starts with: fraction or number + household unit\n",
+ "NUMBER_ONLY_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*$\"  # whole text is a single number\n",
+ "\n",
+ "print(f\"Using parquet: {PARQUET_PATH}\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7477f968",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell loads only the raw fields needed for the quantity cleaner.\n",
+ "# row_id is the 1-based row position in the parquet file (file_row_number is 0-based). quantity_raw is a view that every query re-evaluates, and row_number() OVER () without ORDER BY is not guaranteed to number rows identically each time.\n",
+ "\n",
+ "quantity_raw_sql = f'''\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_raw AS\n",
+ "SELECT\n",
+ "    file_row_number + 1 AS row_id,\n",
+ "    {\", \".join(SOURCE_COLUMNS)}\n",
+ "FROM read_parquet({sql_quote(PARQUET_PATH.as_posix())}, file_row_number = true);\n",
+ "'''\n",
+ "\n",
+ "con.execute(quantity_raw_sql)\n",
+ "\n",
+ "show_query(\"Raw quantity rows\", \"SELECT COUNT(*) AS row_count FROM quantity_raw\")\n",
+ "show_query(\"Sample raw rows\", \"SELECT * FROM quantity_raw ORDER BY row_id\", limit=10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d0d2b531",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell rebuilds the earlier reference text-parser for internal audit use only.\n",
+ "# Its view, quantity_cleaning_features_reference, is not the final output of the notebook.\n",
+ "\n",
+ "# The reference feature view parses the raw fields into candidate signals.\n",
+ "# It does not make the cross-field decision; that happens in the next cell. It only extracts everything needed for:\n",
+ "# - structured product field parsing\n",
+ "# - free-text quantity parsing\n",
+ "# - later cross-field comparison\n",
+ "\n",
+ "quantity_features_sql = f'''\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_cleaning_features_reference AS\n",
+ "WITH base AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " {normalize_sql(\"product_quantity_unit\")} AS product_quantity_unit_normalized,\n",
+ " product_quantity,\n",
+ " TRY_CAST(replace(NULLIF(trim(product_quantity), ''), ',', '.') AS DOUBLE) AS product_quantity_numeric,\n",
+ " quantity,\n",
+ " {normalize_sql(\"quantity\")} AS quantity_normalized\n",
+ " FROM quantity_raw\n",
+ "),\n",
+ "extracted AS (\n",
+ " SELECT\n",
+ " b.*,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_MEASURE_REGEX}', 1), '') AS multipack_count_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_MEASURE_REGEX}', 2), '') AS multipack_inner_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_MEASURE_REGEX}', 3), '') AS multipack_unit_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_MULTIPACK_REGEX}', 1), '') AS descriptor_multipack_count_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_MULTIPACK_REGEX}', 2), '') AS descriptor_multipack_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_MULTIPACK_REGEX}', 3), '') AS descriptor_multipack_inner_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_MULTIPACK_REGEX}', 4), '') AS descriptor_multipack_unit_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_WITH_MEASURE_REGEX}', 1), '') AS descriptor_measure_leading_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_WITH_MEASURE_REGEX}', 2), '') AS descriptor_measure_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_WITH_MEASURE_REGEX}', 3), '') AS descriptor_measure_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_WITH_MEASURE_REGEX}', 4), '') AS descriptor_measure_unit_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{SIMPLE_MEASURE_REGEX}', 1), '') AS simple_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{SIMPLE_MEASURE_REGEX}', 2), '') AS simple_unit_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_ONLY_REGEX}', 1), '') AS descriptor_only_leading_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{DESCRIPTOR_ONLY_REGEX}', 2), '') AS descriptor_only_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{PER_PACKAGING_REGEX}', 1), '') AS per_pack_count_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{PER_PACKAGING_REGEX}', 2), '') AS per_pack_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{HOUSEHOLD_REGEX}', 1), '') AS household_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{HOUSEHOLD_REGEX}', 2), '') AS household_unit_token,\n",
+ " regexp_matches(quantity_normalized, '{PLACEHOLDER_REGEX}') AS quantity_is_placeholder,\n",
+ " regexp_matches(quantity_normalized, '{NUMBER_ONLY_REGEX}') AS quantity_is_number_only,\n",
+ " regexp_matches(quantity_normalized, '{HOUSEHOLD_ANY_REGEX}') AS quantity_has_household_token,\n",
+ " regexp_matches(quantity_normalized, '{MEASURE_ANY_REGEX}') AS quantity_has_measure_token,\n",
+ " regexp_matches(quantity_normalized, '\\\\d') AS quantity_has_digits\n",
+ " FROM base b\n",
+ "),\n",
+ "joined AS (\n",
+ " SELECT\n",
+ " e.*,\n",
+ " pm.base_unit AS product_base_unit,\n",
+ " pm.quantity_category AS product_category,\n",
+ " pm.factor AS product_factor,\n",
+ " mm.base_unit AS multipack_base_unit,\n",
+ " mm.quantity_category AS multipack_category,\n",
+ " mm.factor AS multipack_factor,\n",
+ " dpm.base_unit AS descriptor_multipack_base_unit,\n",
+ " dpm.quantity_category AS descriptor_multipack_category,\n",
+ " dpm.factor AS descriptor_multipack_factor,\n",
+ " dm.base_unit AS descriptor_measure_base_unit,\n",
+ " dm.quantity_category AS descriptor_measure_category,\n",
+ " dm.factor AS descriptor_measure_factor,\n",
+ " sm.base_unit AS simple_base_unit,\n",
+ " sm.quantity_category AS simple_category,\n",
+ " sm.factor AS simple_factor,\n",
+ " dpd.item_descriptor AS descriptor_multipack_item_descriptor,\n",
+ " de.item_descriptor AS descriptor_measure_item_descriptor,\n",
+ " de.quantity_category AS descriptor_measure_descriptor_category,\n",
+ " d0.item_descriptor AS descriptor_only_item_descriptor,\n",
+ " d0.quantity_category AS descriptor_only_category,\n",
+ " pp.item_descriptor AS per_pack_item_descriptor,\n",
+ " hh.item_descriptor AS household_descriptor\n",
+ " FROM extracted e\n",
+ " LEFT JOIN measure_aliases pm ON e.product_quantity_unit_normalized = pm.token\n",
+ " LEFT JOIN measure_aliases mm ON e.multipack_unit_token = mm.token\n",
+ " LEFT JOIN measure_aliases dpm ON e.descriptor_multipack_unit_token = dpm.token\n",
+ " LEFT JOIN measure_aliases dm ON e.descriptor_measure_unit_token = dm.token\n",
+ " LEFT JOIN measure_aliases sm ON e.simple_unit_token = sm.token\n",
+ " LEFT JOIN descriptor_aliases dpd ON e.descriptor_multipack_token = dpd.token\n",
+ " LEFT JOIN descriptor_aliases de ON e.descriptor_measure_token = de.token\n",
+ " LEFT JOIN descriptor_aliases d0 ON e.descriptor_only_token = d0.token\n",
+ " LEFT JOIN descriptor_aliases pp ON e.per_pack_token = pp.token\n",
+ " LEFT JOIN household_aliases hh ON e.household_unit_token = hh.token\n",
+ "),\n",
+ "typed AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN lower(coalesce(descriptor_multipack_count_text, '')) IN ('un', 'une', 'a', 'an') THEN 1.0\n",
+ " ELSE TRY_CAST(replace(descriptor_multipack_count_text, ',', '.') AS DOUBLE)\n",
+ " END AS descriptor_multipack_count_numeric,\n",
+ " CASE\n",
+ " WHEN lower(coalesce(descriptor_measure_leading_token, '')) IN ('un', 'une', 'a', 'an') THEN 1.0\n",
+ " ELSE TRY_CAST(replace(descriptor_measure_leading_token, ',', '.') AS DOUBLE)\n",
+ " END AS descriptor_measure_count_numeric,\n",
+ " CASE\n",
+ " WHEN lower(coalesce(descriptor_only_leading_token, '')) IN ('un', 'une', 'a', 'an') THEN 1.0\n",
+ " ELSE TRY_CAST(replace(descriptor_only_leading_token, ',', '.') AS DOUBLE)\n",
+ " END AS descriptor_only_count_numeric,\n",
+ " TRY_CAST(replace(descriptor_multipack_inner_value_text, ',', '.') AS DOUBLE) AS descriptor_multipack_inner_value_numeric,\n",
+ " TRY_CAST(replace(multipack_count_text, ',', '.') AS DOUBLE) AS multipack_count_numeric,\n",
+ " TRY_CAST(replace(multipack_inner_value_text, ',', '.') AS DOUBLE) AS multipack_inner_value_numeric,\n",
+ " TRY_CAST(replace(descriptor_measure_value_text, ',', '.') AS DOUBLE) AS descriptor_measure_value_numeric,\n",
+ " TRY_CAST(replace(simple_value_text, ',', '.') AS DOUBLE) AS simple_value_numeric,\n",
+ " TRY_CAST(replace(per_pack_count_text, ',', '.') AS DOUBLE) AS per_pack_count_numeric,\n",
+ " CASE\n",
+ " WHEN household_value_text LIKE '%/%' THEN\n",
+ " TRY_CAST(split_part(household_value_text, '/', 1) AS DOUBLE)\n",
+ " / NULLIF(TRY_CAST(split_part(household_value_text, '/', 2) AS DOUBLE), 0)\n",
+ " ELSE TRY_CAST(replace(household_value_text, ',', '.') AS DOUBLE)\n",
+ " END AS household_value_numeric,\n",
+ " CASE\n",
+ " WHEN product_quantity_numeric IS NOT NULL AND product_factor IS NOT NULL THEN product_quantity_numeric * product_factor\n",
+ " END AS product_normalized_value,\n",
+ " CASE\n",
+ " WHEN multipack_inner_value_numeric IS NOT NULL\n",
+ " AND multipack_factor IS NOT NULL\n",
+ " THEN multipack_inner_value_numeric * multipack_factor\n",
+ " END AS multipack_inner_normalized_value,\n",
+ " CASE\n",
+ " WHEN multipack_count_numeric IS NOT NULL\n",
+ " AND multipack_inner_value_numeric IS NOT NULL\n",
+ " AND multipack_factor IS NOT NULL\n",
+ " THEN multipack_count_numeric * multipack_inner_value_numeric * multipack_factor\n",
+ " END AS multipack_normalized_value,\n",
+ " CASE\n",
+ " WHEN descriptor_multipack_count_numeric IS NOT NULL\n",
+ " AND descriptor_multipack_count_numeric > 1\n",
+ " AND descriptor_multipack_inner_value_numeric IS NOT NULL\n",
+ " AND descriptor_multipack_factor IS NOT NULL\n",
+ " THEN descriptor_multipack_inner_value_numeric * descriptor_multipack_factor\n",
+ " END AS descriptor_multipack_inner_normalized_value,\n",
+ " CASE\n",
+ " WHEN descriptor_multipack_count_numeric IS NOT NULL\n",
+ " AND descriptor_multipack_count_numeric > 1\n",
+ " AND descriptor_multipack_inner_value_numeric IS NOT NULL\n",
+ " AND descriptor_multipack_factor IS NOT NULL\n",
+ " THEN descriptor_multipack_count_numeric * descriptor_multipack_inner_value_numeric * descriptor_multipack_factor\n",
+ " END AS descriptor_multipack_total_normalized_value,\n",
+ " CASE\n",
+ " WHEN descriptor_measure_value_numeric IS NOT NULL\n",
+ " AND descriptor_measure_factor IS NOT NULL\n",
+ " THEN descriptor_measure_value_numeric * descriptor_measure_factor\n",
+ " END AS descriptor_measure_normalized_value,\n",
+ " CASE\n",
+ " WHEN simple_value_numeric IS NOT NULL AND simple_factor IS NOT NULL\n",
+ " THEN simple_value_numeric * simple_factor\n",
+ " END AS simple_normalized_value\n",
+ " FROM joined\n",
+ ")\n",
+ "SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN NULL\n",
+ " WHEN quantity_normalized = '' THEN 'unresolved'\n",
+ " WHEN quantity_is_placeholder THEN 'unresolved'\n",
+ " WHEN quantity_has_household_token THEN 'unresolved'\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category = 'energy' THEN 'partial'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category = 'energy' THEN 'partial'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'count' THEN 'resolved'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'packaging_only' THEN 'partial'\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN 'partial'\n",
+ " WHEN quantity_is_number_only THEN 'unresolved'\n",
+ " WHEN NOT quantity_has_digits THEN 'unresolved'\n",
+ " ELSE 'unresolved'\n",
+ " END AS text_status,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN NULL\n",
+ " WHEN quantity_normalized = '' THEN NULL\n",
+ " WHEN quantity_is_placeholder THEN 'placeholder_or_unknown'\n",
+ " WHEN quantity_has_household_token THEN 'household_unit'\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN 'multipack_measure'\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN 'multipack_measure'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL THEN descriptor_measure_category\n",
+ " WHEN simple_normalized_value IS NOT NULL THEN simple_category\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL THEN descriptor_only_category\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN 'packaging_only'\n",
+ " WHEN quantity_has_measure_token THEN 'mixed_measure'\n",
+ " WHEN NOT quantity_has_digits THEN 'noise_or_non_quantity'\n",
+ " ELSE NULL\n",
+ " END AS text_category,\n",
+ " CASE\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN multipack_normalized_value\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN descriptor_multipack_total_normalized_value\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL THEN descriptor_measure_normalized_value\n",
+ " WHEN simple_normalized_value IS NOT NULL THEN simple_normalized_value\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'count' THEN coalesce(descriptor_only_count_numeric, 1.0)\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'packaging_only' THEN coalesce(descriptor_only_count_numeric, 1.0)\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN per_pack_count_numeric\n",
+ " ELSE NULL\n",
+ " END AS text_normalized_value,\n",
+ " CASE\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN multipack_base_unit\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN descriptor_multipack_base_unit\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL THEN descriptor_measure_base_unit\n",
+ " WHEN simple_normalized_value IS NOT NULL THEN simple_base_unit\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL THEN 'count'\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN 'count'\n",
+ " ELSE NULL\n",
+ " END AS text_normalized_unit,\n",
+ " CASE\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN multipack_inner_normalized_value\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN descriptor_multipack_inner_normalized_value\n",
+ " ELSE NULL\n",
+ " END AS text_inner_normalized_value,\n",
+ " CASE\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN multipack_count_numeric\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN descriptor_multipack_count_numeric\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category IN ('mass', 'volume', 'energy') THEN coalesce(descriptor_measure_count_numeric, 1.0)\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'packaging_only' THEN coalesce(descriptor_only_count_numeric, 1.0)\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN per_pack_count_numeric\n",
+ " WHEN simple_normalized_value IS NOT NULL THEN 1.0\n",
+ " ELSE NULL\n",
+ " END AS text_pack_count,\n",
+ " CASE\n",
+ " WHEN descriptor_multipack_item_descriptor IS NOT NULL THEN descriptor_multipack_item_descriptor\n",
+ " WHEN descriptor_measure_item_descriptor IS NOT NULL THEN descriptor_measure_item_descriptor\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL THEN descriptor_only_item_descriptor\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN per_pack_item_descriptor\n",
+ " ELSE NULL\n",
+ " END AS text_item_descriptor,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN 'quantity missing'\n",
+ " WHEN quantity_normalized = '' THEN 'blank quantity string'\n",
+ " WHEN quantity_is_placeholder THEN 'placeholder or unknown text'\n",
+ " WHEN quantity_has_household_token THEN 'household unit not used for consolidation'\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN 'derived total from quantity multipack'\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN 'derived total from quantity descriptor multipack'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category = 'energy' THEN 'energy text captured but not used for consolidation'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category IN ('mass', 'volume') THEN 'descriptor plus measure from quantity'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category = 'energy' THEN 'energy text captured but not used for consolidation'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category IN ('mass', 'volume') THEN 'simple measure from quantity'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'count' THEN 'count descriptor from quantity'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'packaging_only' THEN 'packaging only, no comparable size'\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN 'per packaging phrase, no comparable size'\n",
+ " WHEN quantity_is_number_only THEN 'number only, unit missing'\n",
+ " WHEN NOT quantity_has_digits THEN 'non-quantity text in quantity field'\n",
+ " WHEN quantity_has_measure_token THEN 'mixed or unsupported measure expression'\n",
+ " ELSE 'unparsed quantity text'\n",
+ " END AS text_note\n",
+ "FROM typed;\n",
+ "'''\n",
+ "\n",
+ "con.execute(quantity_features_sql)\n",
+ "\n",
+ "print(\"Reference text-parser view ready for internal change audit.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36dd7772",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell rebuilds the earlier reference cleaned view for internal audit use only.\n",
+ "# Its view, quantity_cleaned_reference, is not the final output of the notebook.\n",
+ "\n",
+ "# It applies the reference cross-field decision rules on top of quantity_cleaning_features_reference.\n",
+ "# The logic is intentionally conservative:\n",
+ "# - use structured product fields when they are clear\n",
+ "# - use quantity text when it cleanly fills missing structure\n",
+ "# - keep packaging-only and household cases visible\n",
+ "# - mark real disagreements as conflicts instead of guessing\n",
+ "\n",
+ "quantity_cleaned_sql = f'''\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_cleaned_reference AS\n",
+ "WITH staged AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN product_normalized_value IS NOT NULL AND product_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN product_normalized_value IS NOT NULL AND product_category = 'energy' THEN 'partial'\n",
+ " ELSE NULL\n",
+ " END AS product_status,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume', 'energy')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND text_normalized_unit IS NOT NULL\n",
+ " THEN product_base_unit = text_normalized_unit\n",
+ " ELSE FALSE\n",
+ " END AS comparable_unit_match,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume', 'energy')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND text_normalized_unit IS NOT NULL\n",
+ " AND product_base_unit = text_normalized_unit\n",
+ " THEN abs(product_normalized_value - text_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_normalized_value), abs(text_normalized_value)))\n",
+ " ELSE FALSE\n",
+ " END AS comparable_value_match,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume')\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_inner_normalized_value IS NOT NULL\n",
+ " AND product_base_unit = text_normalized_unit\n",
+ " THEN abs(product_normalized_value - text_inner_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_normalized_value), abs(text_inner_normalized_value)))\n",
+ " ELSE FALSE\n",
+ " END AS multipack_inner_value_match,\n",
+ " CASE\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN (\n",
+ " abs(product_quantity_numeric - text_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_normalized_value)))\n",
+ " ) OR (\n",
+ " text_inner_normalized_value IS NOT NULL\n",
+ " AND abs(product_quantity_numeric - text_inner_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_inner_normalized_value)))\n",
+ " )\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND text_normalized_unit = 'count'\n",
+ " THEN product_quantity_numeric = text_normalized_value\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN abs(product_quantity_numeric - text_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_normalized_value)))\n",
+ " ELSE FALSE\n",
+ " END AS raw_numeric_matches_text\n",
+ " FROM quantity_cleaning_features_reference\n",
+ ")\n",
+ "SELECT\n",
+ " row_id,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN 'conflict'\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'partial'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN 'resolved'\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND coalesce(text_normalized_value, 0) > 0\n",
+ " THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN 'conflict'\n",
+ " WHEN text_status IS NOT NULL\n",
+ " THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL AND product_base_unit IS NULL\n",
+ " THEN 'unresolved'\n",
+ " ELSE 'unresolved'\n",
+ " END AS quantity_status,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN product_category\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'energy'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN 'multipack_measure'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN 'multipack_measure'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN product_category\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN product_category\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN product_category\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND coalesce(text_normalized_value, 0) > 0\n",
+ " THEN text_category\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN text_category\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN text_category\n",
+ " WHEN text_status IS NOT NULL\n",
+ " THEN text_category\n",
+ " ELSE NULL\n",
+ " END AS quantity_category,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN product_normalized_value\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN product_normalized_value\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND coalesce(text_normalized_value, 0) > 0\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN NULL\n",
+ " WHEN text_status IN ('resolved', 'partial')\n",
+ " THEN text_normalized_value\n",
+ " ELSE NULL\n",
+ " END AS normalized_value,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN product_base_unit\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN product_base_unit\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND coalesce(text_normalized_value, 0) > 0\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN NULL\n",
+ " WHEN text_status IN ('resolved', 'partial')\n",
+ " THEN text_normalized_unit\n",
+ " ELSE NULL\n",
+ " END AS normalized_unit,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN text_pack_count\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN text_pack_count\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN 1.0\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " THEN coalesce(text_pack_count, CASE WHEN text_category IN ('count', 'packaging_only') THEN text_normalized_value ELSE 1.0 END)\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN coalesce(text_pack_count, CASE WHEN text_category IN ('count', 'packaging_only') THEN text_normalized_value ELSE 1.0 END)\n",
+ " WHEN text_status IN ('resolved', 'partial')\n",
+ " THEN coalesce(text_pack_count, CASE WHEN text_category IN ('count', 'packaging_only') THEN text_normalized_value ELSE 1.0 END)\n",
+ " ELSE NULL\n",
+ " END AS pack_count,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only', 'multipack_measure', 'mass', 'volume', 'mixed_measure')\n",
+ " THEN text_item_descriptor\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN NULL\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN text_item_descriptor\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " THEN text_item_descriptor\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN text_item_descriptor\n",
+ " WHEN text_status IN ('resolved', 'partial')\n",
+ " THEN text_item_descriptor\n",
+ " ELSE NULL\n",
+ " END AS item_descriptor,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN TRUE\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN TRUE\n",
+ " ELSE FALSE\n",
+ " END AS quantity_conflict_flag,\n",
+ " CASE\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN 'product fields match inner quantity; total derived from quantity multipack'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN 'structured total matches quantity multipack'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN 'product fields and quantity agree'\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'energy captured consistently but not used for consolidation'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN 'product fields used with descriptor from quantity'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN 'conflict between product fields and quantity text'\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN 'structured product fields used'\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND coalesce(text_normalized_value, 0) > 0\n",
+ " THEN 'structured zero ignored; quantity text used'\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND raw_numeric_matches_text\n",
+ " THEN 'filled missing structure from quantity'\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT raw_numeric_matches_text\n",
+ " THEN 'structured numeric value disagrees with quantity text'\n",
+ " WHEN text_status IS NOT NULL\n",
+ " THEN text_note\n",
+ " WHEN product_quantity_numeric IS NOT NULL AND product_base_unit IS NULL\n",
+ " THEN 'structured numeric value present, but no usable unit found'\n",
+ " ELSE 'no usable quantity signal found'\n",
+ " END AS quantity_note\n",
+ "FROM staged;\n",
+ "'''\n",
+ "\n",
+ "con.execute(quantity_cleaned_sql)\n",
+ "\n",
+ "print(\"Reference cleaned view ready for internal change audit.\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2a907697",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# This cell refreshes the helper setup for the current cleaner.\n",
+ "# DuckDB stays the main engine while this cell adds:\n",
+ "# - safe OCR cleanup before parsing\n",
+ "# - dataset-reviewed multilingual aliases\n",
+ "# - mixed-measure helpers for dual-unit and compound imperial expressions\n",
+ "\n",
+ "from duckdb import sqltypes\n",
+ "\n",
+ "# Multipliers that turn a fluid-ounce value into ml (see to_base_value).\n",
+ "# The imperial factor is used only when system == 'imperial'; every other\n",
+ "# system value, including 'default' and 'us', uses the US factor.\n",
+ "FL_OZ_US_FACTOR = 29.5735295625\n",
+ "FL_OZ_IMPERIAL_FACTOR = 28.4130625\n",
+ "\n",
+ "# Extend the measure aliases that were defined before this cell.\n",
+ "# Each entry is (token, base_unit, category, factor), which is how MEASURE_LOOKUP\n",
+ "# unpacks them; to_base_value multiplies a value by factor to reach base_unit.\n",
+ "# dict.fromkeys drops exact duplicate tuples while keeping first-seen order, so\n",
+ "# re-running this cell does not grow the list.\n",
+ "MEASURE_ALIASES = list(\n",
+ " dict.fromkeys(\n",
+ " MEASURE_ALIASES\n",
+ " + [\n",
+ " (\"gr\", \"g\", \"mass\", 1.0),\n",
+ " (\"mg\", \"g\", \"mass\", 0.001),\n",
+ " (\"kilo\", \"g\", \"mass\", 1000.0),\n",
+ " (\"pound\", \"g\", \"mass\", 453.59237),\n",
+ " (\"pounds\", \"g\", \"mass\", 453.59237),\n",
+ " ]\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Extend the count-descriptor aliases that were defined before this cell.\n",
+ "# Entries are 3-tuples; presumably (token, item_descriptor, quantity_category),\n",
+ "# matching the keys read from DESCRIPTOR_LOOKUP - TODO confirm where that lookup\n",
+ "# is populated.\n",
+ "# dict.fromkeys drops exact duplicate tuples while keeping first-seen order.\n",
+ "# NOTE(review): normalize_sql maps accented e to plain e, so the accented\n",
+ "# spellings below may never match normalized text - confirm they are still needed.\n",
+ "COUNT_DESCRIPTOR_ALIASES = list(\n",
+ " dict.fromkeys(\n",
+ " COUNT_DESCRIPTOR_ALIASES\n",
+ " + [\n",
+ " (\"unit\", \"piece\", \"count\"),\n",
+ " (\"units\", \"piece\", \"count\"),\n",
+ " (\"unite\", \"piece\", \"count\"),\n",
+ " (\"unites\", \"piece\", \"count\"),\n",
+ " (\"unité\", \"piece\", \"count\"),\n",
+ " (\"unités\", \"piece\", \"count\"),\n",
+ " (\"oeuf\", \"piece\", \"count\"),\n",
+ " (\"oeufs\", \"piece\", \"count\"),\n",
+ " (\"egg\", \"piece\", \"count\"),\n",
+ " (\"eggs\", \"piece\", \"count\"),\n",
+ " (\"bar\", \"piece\", \"count\"),\n",
+ " (\"bars\", \"piece\", \"count\"),\n",
+ " (\"barre\", \"piece\", \"count\"),\n",
+ " (\"barres\", \"piece\", \"count\"),\n",
+ " (\"comprime\", \"tablet\", \"count\"),\n",
+ " (\"comprimes\", \"tablet\", \"count\"),\n",
+ " (\"caplet\", \"tablet\", \"count\"),\n",
+ " (\"caplets\", \"tablet\", \"count\"),\n",
+ " (\"softgel\", \"capsule\", \"count\"),\n",
+ " (\"softgels\", \"capsule\", \"count\"),\n",
+ " ]\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "# Tokens approved as multilingual aliases after dataset review.\n",
+ "# NOTE(review): the code that consumes this list is not in this part of the\n",
+ "# notebook; presumably it whitelists which alias tokens the cleaner may use -\n",
+ "# confirm against the consumer before editing.\n",
+ "APPROVED_MULTILINGUAL_ALIAS_TOKENS = [\n",
+ " \"bar\",\n",
+ " \"barre\",\n",
+ " \"barres\",\n",
+ " \"boite\",\n",
+ " \"boites\",\n",
+ " \"caplet\",\n",
+ " \"caplets\",\n",
+ " \"chaque\",\n",
+ " \"chacun\",\n",
+ " \"chacune\",\n",
+ " \"comprime\",\n",
+ " \"comprimes\",\n",
+ " \"egg\",\n",
+ " \"eggs\",\n",
+ " \"gr\",\n",
+ " \"kilo\",\n",
+ " \"lit\",\n",
+ " \"morceau\",\n",
+ " \"morceaux\",\n",
+ " \"oeuf\",\n",
+ " \"oeufs\",\n",
+ " \"paquet\",\n",
+ " \"paquets\",\n",
+ " \"softgel\",\n",
+ " \"softgels\",\n",
+ " \"unite\",\n",
+ " \"unites\",\n",
+ " \"unit\",\n",
+ " \"units\",\n",
+ " \"unité\",\n",
+ " \"unités\",\n",
+ "]\n",
+ "\n",
+ "# Words meaning 'each' in English and French.\n",
+ "# NOTE(review): presumably matched at the end of a quantity expression (the\n",
+ "# consumer is not visible here) - confirm.\n",
+ "STRUCTURAL_TAIL_TOKENS = [\"each\", \"chacun\", \"chacune\", \"chaque\"]\n",
+ "# Short function words and markers in English and French.\n",
+ "# NOTE(review): presumably skipped when mining candidate alias tokens from\n",
+ "# quantity text (the consumer is not visible here) - confirm.\n",
+ "ALIAS_MINING_STOPWORDS = [\n",
+ " \"a\",\n",
+ " \"an\",\n",
+ " \"and\",\n",
+ " \"au\",\n",
+ " \"aux\",\n",
+ " \"avec\",\n",
+ " \"chaque\",\n",
+ " \"chacun\",\n",
+ " \"chacune\",\n",
+ " \"d\",\n",
+ " \"de\",\n",
+ " \"des\",\n",
+ " \"du\",\n",
+ " \"each\",\n",
+ " \"en\",\n",
+ " \"et\",\n",
+ " \"for\",\n",
+ " \"l\",\n",
+ " \"la\",\n",
+ " \"le\",\n",
+ " \"les\",\n",
+ " \"net\",\n",
+ " \"of\",\n",
+ " \"ou\",\n",
+ " \"par\",\n",
+ " \"pour\",\n",
+ " \"the\",\n",
+ " \"wt\",\n",
+ " \"x\",\n",
+ " \"xl\",\n",
+ "]\n",
+ "\n",
+ "\n",
+ "def normalize_sql(expr: str) -> str:\n",
+ "    \"\"\"Build a SQL expression that normalizes the text produced by expr.\n",
+ "\n",
+ "    The generated SQL lower-cases and trims the value (NULL becomes ''), applies\n",
+ "    every (raw, canonical) pair below through nested replace() calls in list\n",
+ "    order, and finally collapses each whitespace run into a single space.\n",
+ "\n",
+ "    Args:\n",
+ "        expr: SQL expression text; it is interpolated into the result as-is.\n",
+ "\n",
+ "    Returns:\n",
+ "        The SQL expression string that wraps expr.\n",
+ "    \"\"\"\n",
+ "    normalized = f\"lower(trim(coalesce({expr}, '')))\"\n",
+ "    # NOTE(review): several pairs below appear twice. Presumably the second copy was\n",
+ "    # originally a mis-encoded (mojibake) spelling of the same character - confirm\n",
+ "    # against the raw notebook bytes before removing anything.\n",
+ "    replacements = [\n",
+ "        (\"’\", \"'\"),\n",
+ "        (\"’\", \"'\"),\n",
+ "        (\"`\", \"'\"),\n",
+ "        (\"´\", \"'\"),\n",
+ "        (\"×\", \"x\"),\n",
+ "        (\"×\", \"x\"),\n",
+ "        (\"é\", \"e\"),\n",
+ "        (\"è\", \"e\"),\n",
+ "        (\"ê\", \"e\"),\n",
+ "        (\"ë\", \"e\"),\n",
+ "        (\"é\", \"e\"),\n",
+ "        (\"è\", \"e\"),\n",
+ "        (\"ê\", \"e\"),\n",
+ "        (\"ë\", \"e\"),\n",
+ "        (\"à\", \"a\"),\n",
+ "        (\"â\", \"a\"),\n",
+ "        (\"ä\", \"a\"),\n",
+ "        (\"Ã \", \"a\"),\n",
+ "        (\"â\", \"a\"),\n",
+ "        (\"ä\", \"a\"),\n",
+ "        (\"î\", \"i\"),\n",
+ "        (\"ï\", \"i\"),\n",
+ "        (\"î\", \"i\"),\n",
+ "        (\"ï\", \"i\"),\n",
+ "        (\"ô\", \"o\"),\n",
+ "        (\"ö\", \"o\"),\n",
+ "        (\"ô\", \"o\"),\n",
+ "        (\"ö\", \"o\"),\n",
+ "        (\"ù\", \"u\"),\n",
+ "        (\"û\", \"u\"),\n",
+ "        (\"ü\", \"u\"),\n",
+ "        (\"ù\", \"u\"),\n",
+ "        (\"û\", \"u\"),\n",
+ "        (\"ü\", \"u\"),\n",
+ "        (\"ç\", \"c\"),\n",
+ "        (\"ç\", \"c\"),\n",
+ "        (\"œ\", \"oe\"),\n",
+ "    ]\n",
+ "    # Each pass wraps the previous expression, so the first pair ends up innermost.\n",
+ "    # sql_quote is defined elsewhere in the notebook; presumably it renders a safely\n",
+ "    # quoted SQL string literal - TODO confirm.\n",
+ "    for raw_text, canonical_text in replacements:\n",
+ "        normalized = f\"replace({normalized}, {sql_quote(raw_text)}, {sql_quote(canonical_text)})\"\n",
+ "    # Collapse any run of whitespace into one space.\n",
+ "    normalized = f\"regexp_replace({normalized}, '\\\\s+', ' ', 'g')\"\n",
+ "    return normalized\n",
+ "\n",
+ "\n",
+ "OCR_O_TOKEN_REGEX = re.compile(r\"(? str:\n",
+ " corrected = token\n",
+ " if corrected.startswith((\".\", \",\")):\n",
+ " corrected = \"0\" + corrected\n",
+ " if re.fullmatch(r\"[0-9o]+(?:[.,][0-9o]+)?\", corrected):\n",
+ " return corrected.replace(\"o\", \"0\")\n",
+ " return corrected\n",
+ "\n",
+ "\n",
+ "def _correct_leading_il_numeric_token(token: str) -> str:\n",
+ "    \"\"\"Repair a numeric token whose leading 1 was misread as the letter i or l.\n",
+ "\n",
+ "    Only tokens shaped like i/l followed by two or more digits (with an optional\n",
+ "    decimal part) are rewritten; every other token is returned unchanged.\n",
+ "    \"\"\"\n",
+ "    misread_one = re.fullmatch(r\"[il][0-9]{2,}(?:[.,][0-9]+)?\", token)\n",
+ "    return token if misread_one is None else f\"1{token[1:]}\"\n",
+ "\n",
+ "\n",
+ "def safe_quantity_cleanup(text: str | None) -> str | None:\n",
+ " if text is None:\n",
+ " return None\n",
+ "\n",
+ " cleaned = re.sub(r\"\\s+\", \" \", text.strip())\n",
+ " cleaned = cleaned.replace(\"fl. oz\", \"fl oz\").replace(\"fl.oz\", \"fl oz\")\n",
+ " cleaned = re.sub(r\"(?<=\\d)\\.\\s+(?=\\d)\", \".\", cleaned)\n",
+ " cleaned = re.sub(r\"(? bool:\n",
+ " if text is None:\n",
+ " return False\n",
+ " for match in OCR_O_TOKEN_REGEX.finditer(text):\n",
+ " token = match.group(1)\n",
+ " if _correct_o_numeric_token(token) != token:\n",
+ " return True\n",
+ " for match in OCR_LEADING_IL_TOKEN_REGEX.finditer(text):\n",
+ " token = match.group(1)\n",
+ " if _correct_leading_il_numeric_token(token) != token:\n",
+ " return True\n",
+ " return False\n",
+ "\n",
+ "\n",
+ "def has_non_ascii_alpha(text: str | None) -> bool:\n",
+ "    \"\"\"Report whether text contains an alphabetic character outside ASCII.\n",
+ "\n",
+ "    None is treated as containing no such character.\n",
+ "    \"\"\"\n",
+ "    if text is None:\n",
+ "        return False\n",
+ "    for character in text:\n",
+ "        if ord(character) > 127 and character.isalpha():\n",
+ "            return True\n",
+ "    return False\n",
+ "\n",
+ "\n",
+ "# token -> conversion info, built from the extended alias list.\n",
+ "# If a token appears in more than one alias tuple, the last tuple wins.\n",
+ "MEASURE_LOOKUP = {\n",
+ " token: {\"base_unit\": base_unit, \"category\": category, \"factor\": factor}\n",
+ " for token, base_unit, category, factor in MEASURE_ALIASES\n",
+ "}\n",
+ "# Unit tokens treated as metric mass. choose_metric_canonical prefers these, and\n",
+ "# the mixed-measure helpers use them to spot metric-vs-imperial mass pairs.\n",
+ "METRIC_MASS_TOKENS = {\n",
+ " \"g\",\n",
+ " \"gr\",\n",
+ " \"gram\",\n",
+ " \"grams\",\n",
+ " \"gramme\",\n",
+ " \"grammes\",\n",
+ " \"mg\",\n",
+ " \"kg\",\n",
+ " \"kilo\",\n",
+ " \"kilogram\",\n",
+ " \"kilograms\",\n",
+ " \"kilogramme\",\n",
+ " \"kilogrammes\",\n",
+ "}\n",
+ "# Unit tokens treated as metric volume. A fluid-ounce value paired with one of\n",
+ "# these makes equivalent_pair_result try both the US and imperial conversions.\n",
+ "METRIC_VOLUME_TOKENS = {\n",
+ " \"ml\",\n",
+ " \"millilitre\",\n",
+ " \"millilitres\",\n",
+ " \"milliliter\",\n",
+ " \"milliliters\",\n",
+ " \"l\",\n",
+ " \"lit\",\n",
+ " \"litre\",\n",
+ " \"litres\",\n",
+ " \"liter\",\n",
+ " \"liters\",\n",
+ " \"cl\",\n",
+ " \"dl\",\n",
+ "}\n",
+ "# Unit tokens treated as imperial mass by the mixed-measure helpers.\n",
+ "IMPERIAL_MASS_TOKENS = {\"oz\", \"lb\", \"lbs\", \"pound\", \"pounds\"}\n",
+ "\n",
+ "# Module-level placeholders, empty or None at this point.\n",
+ "# descriptor_from_phrase reads item_descriptor and quantity_category from\n",
+ "# DESCRIPTOR_LOOKUP entries, and extract_measure_occurrences returns no measures\n",
+ "# while PY_MEASURE_OCCURRENCE_REGEX is None.\n",
+ "# NOTE(review): presumably these are filled in further down, once the alias lists\n",
+ "# are final - confirm.\n",
+ "DESCRIPTOR_LOOKUP: dict[str, dict[str, str]] = {}\n",
+ "PY_MEASURE_OCCURRENCE_REGEX = None\n",
+ "PY_PLAIN_MULTIPACK_ANY_REGEX = None\n",
+ "PY_DESCRIPTOR_MULTIPACK_ANY_REGEX = None\n",
+ "PY_CONTEXTUAL_MIXED_KEYWORD_REGEX = None\n",
+ "\n",
+ "\n",
+ "def parse_numeric_text(text: str | None) -> float | None:\n",
+ "    \"\"\"Parse a numeric token that may use a decimal comma or a bare leading dot.\n",
+ "\n",
+ "    Args:\n",
+ "        text: Raw numeric text such as '1,5', '.75' or ' 250 '.\n",
+ "\n",
+ "    Returns:\n",
+ "        The finite float value, or None when text is None, is not a number, or\n",
+ "        spells a non-finite value such as 'nan', 'inf' or 'infinity'.\n",
+ "    \"\"\"\n",
+ "    import math  # imported locally so this helper stays self-contained\n",
+ "\n",
+ "    if text is None:\n",
+ "        return None\n",
+ "    cleaned = text.strip().replace(\",\", \".\")\n",
+ "    if cleaned.startswith(\".\"):\n",
+ "        cleaned = \"0\" + cleaned\n",
+ "    try:\n",
+ "        value = float(cleaned)\n",
+ "    except ValueError:\n",
+ "        return None\n",
+ "    # float() also accepts nan/inf spellings and overflows to inf. Those are never\n",
+ "    # real quantities: they break tolerance comparisons and json.dumps would emit\n",
+ "    # them as NaN/Infinity, which is not valid JSON.\n",
+ "    return value if math.isfinite(value) else None\n",
+ "\n",
+ "\n",
+ "def decimal_places(text: str | None) -> int:\n",
+ "    \"\"\"Count the digits written after the last decimal separator in text.\n",
+ "\n",
+ "    Commas are treated as decimal points. None, or text without a separator,\n",
+ "    counts as zero decimal places.\n",
+ "    \"\"\"\n",
+ "    if text is None:\n",
+ "        return 0\n",
+ "    _, separator, fraction = text.strip().replace(\",\", \".\").rpartition(\".\")\n",
+ "    # rpartition leaves separator empty when no decimal point was found.\n",
+ "    return len(fraction) if separator else 0\n",
+ "\n",
+ "\n",
+ "def to_base_value(value: float, unit: str, system: str = \"default\") -> tuple[float | None, str | None, str | None]:\n",
+ "    \"\"\"Convert a value in unit to its base unit.\n",
+ "\n",
+ "    Args:\n",
+ "        value: Numeric amount expressed in unit.\n",
+ "        unit: Unit token; 'fl oz' is handled specially, others via MEASURE_LOOKUP.\n",
+ "        system: Only 'imperial' changes the fluid-ounce factor; any other value\n",
+ "            uses the US factor.\n",
+ "\n",
+ "    Returns:\n",
+ "        (converted_value, base_unit, category), or (None, None, None) when the\n",
+ "        unit is unknown.\n",
+ "    \"\"\"\n",
+ "    if unit != \"fl oz\":\n",
+ "        alias_info = MEASURE_LOOKUP.get(unit)\n",
+ "        if alias_info is None:\n",
+ "            return None, None, None\n",
+ "        return value * alias_info[\"factor\"], alias_info[\"base_unit\"], alias_info[\"category\"]\n",
+ "\n",
+ "    ml_per_fl_oz = FL_OZ_IMPERIAL_FACTOR if system == \"imperial\" else FL_OZ_US_FACTOR\n",
+ "    return value * ml_per_fl_oz, \"ml\", \"volume\"\n",
+ "\n",
+ "\n",
+ "def rounding_half_step(value_text: str, unit: str, system: str = \"default\") -> float:\n",
+ "    \"\"\"Return half of the last written decimal step, scaled by the unit factor.\n",
+ "\n",
+ "    The step is 10 to the power of minus the number of decimals in value_text.\n",
+ "    The factor is the fluid-ounce factor for 'fl oz', the MEASURE_LOOKUP factor\n",
+ "    for known units, and 1.0 for unknown units.\n",
+ "    \"\"\"\n",
+ "    if unit == \"fl oz\":\n",
+ "        unit_factor = FL_OZ_IMPERIAL_FACTOR if system == \"imperial\" else FL_OZ_US_FACTOR\n",
+ "    else:\n",
+ "        alias_info = MEASURE_LOOKUP.get(unit)\n",
+ "        unit_factor = alias_info[\"factor\"] if alias_info is not None else 1.0\n",
+ "    written_step = 10 ** (-decimal_places(value_text))\n",
+ "    return 0.5 * written_step * unit_factor\n",
+ "\n",
+ "\n",
+ "def choose_metric_canonical(measures: list[dict], category: str, system: str = \"default\") -> tuple[float | None, str | None]:\n",
+ "    \"\"\"Pick the measure that supplies the canonical normalized value.\n",
+ "\n",
+ "    The first measure written in a metric unit wins: metric mass tokens when\n",
+ "    category is 'mass', metric volume tokens otherwise. When no measure is\n",
+ "    metric, the first measure is used.\n",
+ "\n",
+ "    Args:\n",
+ "        measures: Dicts that carry at least the value and unit keys.\n",
+ "        category: 'mass' selects the mass tokens; anything else selects volume.\n",
+ "        system: Passed through to to_base_value.\n",
+ "\n",
+ "    Returns:\n",
+ "        (normalized_value, base_unit). Both are None when measures is empty or\n",
+ "        the chosen unit cannot be converted.\n",
+ "    \"\"\"\n",
+ "    if not measures:\n",
+ "        # Nothing to choose from: report it like an unconvertible unit instead of\n",
+ "        # raising IndexError on measures[0].\n",
+ "        return None, None\n",
+ "    preferred_tokens = METRIC_MASS_TOKENS if category == \"mass\" else METRIC_VOLUME_TOKENS\n",
+ "    chosen = next(\n",
+ "        (measure for measure in measures if measure[\"unit\"] in preferred_tokens),\n",
+ "        measures[0],\n",
+ "    )\n",
+ "    normalized_value, base_unit, _ = to_base_value(chosen[\"value\"], chosen[\"unit\"], system)\n",
+ "    return normalized_value, base_unit\n",
+ "\n",
+ "\n",
+ "def equivalent_pair_result(measures: list[dict]) -> dict | None:\n",
+ "    \"\"\"Check whether two measures express the same quantity in different units.\n",
+ "\n",
+ "    Args:\n",
+ "        measures: Exactly two dicts with value, value_text, unit and category keys.\n",
+ "\n",
+ "    Returns:\n",
+ "        A dual_unit_equivalent result dict when both measures convert to the same\n",
+ "        base unit and agree within the allowance, otherwise None. None is also\n",
+ "        returned when the categories differ or are not mass/volume.\n",
+ "    \"\"\"\n",
+ "    left, right = measures\n",
+ "    shared_category = left[\"category\"]\n",
+ "    if right[\"category\"] != shared_category or shared_category not in {\"mass\", \"volume\"}:\n",
+ "        return None\n",
+ "\n",
+ "    units_in_pair = {left[\"unit\"], right[\"unit\"]}\n",
+ "\n",
+ "    # A fluid ounce next to a metric volume could be US or imperial, so try both.\n",
+ "    systems_to_try = [\"default\"]\n",
+ "    if (\n",
+ "        shared_category == \"volume\"\n",
+ "        and \"fl oz\" in units_in_pair\n",
+ "        and any(unit in METRIC_VOLUME_TOKENS for unit in units_in_pair)\n",
+ "    ):\n",
+ "        systems_to_try = [\"us\", \"imperial\"]\n",
+ "\n",
+ "    is_metric_imperial_mass = (\n",
+ "        shared_category == \"mass\"\n",
+ "        and bool(units_in_pair & METRIC_MASS_TOKENS)\n",
+ "        and bool(units_in_pair & IMPERIAL_MASS_TOKENS)\n",
+ "    )\n",
+ "    # Metric-vs-imperial mass pairs get a wider relative tolerance than other pairs.\n",
+ "    relative_rate = 0.045 if is_metric_imperial_mass else VALUE_MATCH_TOLERANCE\n",
+ "\n",
+ "    best_result: dict | None = None\n",
+ "    smallest_gap: float | None = None\n",
+ "    for system in systems_to_try:\n",
+ "        left_value, left_base, _ = to_base_value(left[\"value\"], left[\"unit\"], system)\n",
+ "        right_value, right_base, _ = to_base_value(right[\"value\"], right[\"unit\"], system)\n",
+ "        if left_value is None or right_value is None or left_base != right_base:\n",
+ "            continue\n",
+ "\n",
+ "        relative_allowance = relative_rate * max(abs(left_value), abs(right_value))\n",
+ "        rounding_allowance = (\n",
+ "            rounding_half_step(left[\"value_text\"], left[\"unit\"], system)\n",
+ "            + rounding_half_step(right[\"value_text\"], right[\"unit\"], system)\n",
+ "        )\n",
+ "        if is_metric_imperial_mass:\n",
+ "            # Rounding slack is added on top here, but capped at 1.0 base unit.\n",
+ "            allowance = max(0.01, relative_allowance) + min(rounding_allowance, 1.0)\n",
+ "        else:\n",
+ "            allowance = max(0.01, relative_allowance, rounding_allowance)\n",
+ "\n",
+ "        gap = abs(left_value - right_value)\n",
+ "        if gap > allowance:\n",
+ "            continue\n",
+ "        if smallest_gap is not None and gap >= smallest_gap:\n",
+ "            # An earlier system matched at least as closely; it stays the winner.\n",
+ "            continue\n",
+ "\n",
+ "        canonical_value, canonical_unit = choose_metric_canonical(measures, shared_category, system)\n",
+ "        smallest_gap = gap\n",
+ "        best_result = {\n",
+ "            \"status\": \"dual_unit_equivalent\",\n",
+ "            \"category\": shared_category,\n",
+ "            \"normalized_value\": canonical_value,\n",
+ "            \"normalized_unit\": canonical_unit,\n",
+ "            \"note\": \"equivalent dual-unit text normalized\",\n",
+ "            \"system_used\": system,\n",
+ "        }\n",
+ "\n",
+ "    return best_result\n",
+ "\n",
+ "\n",
+ "def compound_imperial_result(measures: list[dict]) -> dict | None:\n",
+ "    \"\"\"Reconcile one metric mass with imperial parts such as 1 lb 2 oz.\n",
+ "\n",
+ "    Imperial parts that already equal the metric total on their own are set\n",
+ "    aside; the other imperial parts are summed and compared with the metric total.\n",
+ "\n",
+ "    Args:\n",
+ "        measures: Dicts with value, unit and category keys.\n",
+ "\n",
+ "    Returns:\n",
+ "        None when the measures are not all mass, when there is not exactly one\n",
+ "        metric measure plus at least one imperial measure, or when a unit cannot\n",
+ "        be converted. A mixed_conflict dict when the summed imperial parts disagree\n",
+ "        with the metric total. Otherwise a compound_imperial_equivalent dict that\n",
+ "        carries the metric total.\n",
+ "    \"\"\"\n",
+ "    if {measure[\"category\"] for measure in measures} != {\"mass\"}:\n",
+ "        return None\n",
+ "\n",
+ "    metric_measures = [measure for measure in measures if measure[\"unit\"] in METRIC_MASS_TOKENS]\n",
+ "    imperial_measures = [measure for measure in measures if measure[\"unit\"] in IMPERIAL_MASS_TOKENS]\n",
+ "    if len(metric_measures) != 1 or not imperial_measures:\n",
+ "        return None\n",
+ "\n",
+ "    metric_measure = metric_measures[0]\n",
+ "    metric_total, metric_unit, _ = to_base_value(metric_measure[\"value\"], metric_measure[\"unit\"])\n",
+ "    if metric_total is None:\n",
+ "        # A token can be listed in METRIC_MASS_TOKENS without a MEASURE_LOOKUP entry.\n",
+ "        # There is nothing to compare against then, so stop before abs() fails.\n",
+ "        return None\n",
+ "\n",
+ "    def allowance_for(candidate: float) -> float:\n",
+ "        # Relative tolerance against the larger magnitude, never below 0.01.\n",
+ "        return max(0.01, VALUE_MATCH_TOLERANCE * max(abs(metric_total), abs(candidate)))\n",
+ "\n",
+ "    # Each imperial part is converted once. Parts that match the metric total on\n",
+ "    # their own are skipped; the rest are summed in their original order.\n",
+ "    remaining_total = 0.0\n",
+ "    has_remaining = False\n",
+ "    for measure in imperial_measures:\n",
+ "        converted_value, _, _ = to_base_value(measure[\"value\"], measure[\"unit\"])\n",
+ "        if converted_value is None:\n",
+ "            return None\n",
+ "        if abs(metric_total - converted_value) <= allowance_for(converted_value):\n",
+ "            continue\n",
+ "        remaining_total += converted_value\n",
+ "        has_remaining = True\n",
+ "\n",
+ "    # NOTE(review): equivalent_pair_result also allows for rounding of the written\n",
+ "    # values; no rounding allowance is applied here. Confirm that is intended.\n",
+ "    if has_remaining and abs(metric_total - remaining_total) > allowance_for(remaining_total):\n",
+ "        return {\n",
+ "            \"status\": \"mixed_conflict\",\n",
+ "            \"category\": \"mixed_measure\",\n",
+ "            \"note\": \"mixed expression contradictory\",\n",
+ "            \"system_used\": \"default\",\n",
+ "        }\n",
+ "\n",
+ "    # With nothing left to sum, every imperial part matched on its own, and there\n",
+ "    # is at least one imperial part, so the expression is consistent.\n",
+ "    return {\n",
+ "        \"status\": \"compound_imperial_equivalent\",\n",
+ "        \"category\": \"mass\",\n",
+ "        \"normalized_value\": metric_total,\n",
+ "        \"normalized_unit\": metric_unit,\n",
+ "        \"note\": \"compound imperial text normalized against metric total\",\n",
+ "        \"system_used\": \"default\",\n",
+ "    }\n",
+ "\n",
+ "\n",
+ "def resolve_mixed_measure_json(\n",
+ "    value_1_text: str | None,\n",
+ "    unit_1: str | None,\n",
+ "    value_2_text: str | None,\n",
+ "    unit_2: str | None,\n",
+ "    value_3_text: str | None,\n",
+ "    unit_3: str | None,\n",
+ "    value_4_text: str | None,\n",
+ "    unit_4: str | None,\n",
+ ") -> str:\n",
+ "    \"\"\"Resolve up to four (value, unit) slots into one JSON-encoded verdict.\n",
+ "\n",
+ "    Slots with a missing value or unit are ignored. The JSON is an empty object\n",
+ "    when fewer than two usable measures remain. Otherwise it holds either the\n",
+ "    result of the pair/compound helpers or a mixed_unresolved / mixed_conflict\n",
+ "    payload.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def mixed_payload(status: str, note: str) -> str:\n",
+ "        # Every non-equivalent outcome shares this shape; only status and note vary.\n",
+ "        return json.dumps(\n",
+ "            {\n",
+ "                \"status\": status,\n",
+ "                \"category\": \"mixed_measure\",\n",
+ "                \"note\": note,\n",
+ "                \"system_used\": \"default\",\n",
+ "            }\n",
+ "        )\n",
+ "\n",
+ "    def unresolved() -> str:\n",
+ "        return mixed_payload(\"mixed_unresolved\", \"mixed expression not safely reducible\")\n",
+ "\n",
+ "    def contradictory() -> str:\n",
+ "        return mixed_payload(\"mixed_conflict\", \"mixed expression contradictory\")\n",
+ "\n",
+ "    slots = (\n",
+ "        (value_1_text, unit_1),\n",
+ "        (value_2_text, unit_2),\n",
+ "        (value_3_text, unit_3),\n",
+ "        (value_4_text, unit_4),\n",
+ "    )\n",
+ "    measures: list[dict] = []\n",
+ "    for value_text, unit in slots:\n",
+ "        if value_text is None or unit is None:\n",
+ "            continue\n",
+ "        value = parse_numeric_text(value_text)\n",
+ "        # An unparsable number or an unknown unit makes the whole expression unsafe.\n",
+ "        category = None if value is None else to_base_value(value, unit)[2]\n",
+ "        if category is None:\n",
+ "            return unresolved()\n",
+ "        measures.append(\n",
+ "            {\n",
+ "                \"value\": value,\n",
+ "                \"value_text\": value_text,\n",
+ "                \"unit\": unit,\n",
+ "                \"category\": category,\n",
+ "            }\n",
+ "        )\n",
+ "\n",
+ "    if len(measures) < 2:\n",
+ "        return json.dumps({})\n",
+ "\n",
+ "    distinct_categories = {measure[\"category\"] for measure in measures}\n",
+ "\n",
+ "    if len(measures) == 2:\n",
+ "        pair_result = equivalent_pair_result(measures)\n",
+ "        if pair_result is not None:\n",
+ "            return json.dumps(pair_result)\n",
+ "        same_measurable_kind = (\n",
+ "            len(distinct_categories) == 1 and measures[0][\"category\"] in {\"mass\", \"volume\"}\n",
+ "        )\n",
+ "        return contradictory() if same_measurable_kind else unresolved()\n",
+ "\n",
+ "    compound_result = compound_imperial_result(measures)\n",
+ "    if compound_result is not None:\n",
+ "        return json.dumps(compound_result)\n",
+ "\n",
+ "    return unresolved() if len(distinct_categories) > 1 else contradictory()\n",
+ "\n",
+ "\n",
+ "def normalized_values_close(value_1: float | None, value_2: float | None, base_unit: str | None) -> bool:\n",
+ "    \"\"\"Report whether two normalized values agree within the matching tolerance.\n",
+ "\n",
+ "    The permitted gap is the larger of an absolute floor (1.0 when the base unit is\n",
+ "    g or ml, otherwise 0.01) and VALUE_MATCH_TOLERANCE times the larger magnitude.\n",
+ "    A missing value on either side never counts as a match.\n",
+ "    \"\"\"\n",
+ "    if value_1 is None or value_2 is None:\n",
+ "        return False\n",
+ "    if base_unit in {\"g\", \"ml\"}:\n",
+ "        floor = 1.0\n",
+ "    else:\n",
+ "        floor = 0.01\n",
+ "    relative_allowance = VALUE_MATCH_TOLERANCE * max(abs(value_1), abs(value_2))\n",
+ "    return abs(value_1 - value_2) <= max(floor, relative_allowance)\n",
+ "\n",
+ "\n",
+ "def parse_count_token(text: str | None) -> float | None:\n",
+ "    \"\"\"Turn a count token into a number.\n",
+ "\n",
+ "    The article words un, une, a and an count as 1.0; any other token is handed to\n",
+ "    parse_numeric_text after trimming and lower-casing. None stays None.\n",
+ "    \"\"\"\n",
+ "    if text is None:\n",
+ "        return None\n",
+ "    token = text.strip().lower()\n",
+ "    return 1.0 if token in {\"un\", \"une\", \"a\", \"an\"} else parse_numeric_text(token)\n",
+ "\n",
+ "\n",
+ "def descriptor_from_phrase(text: str | None) -> tuple[str | None, str | None]:\n",
+ "    \"\"\"Look up the last known descriptor word in a phrase.\n",
+ "\n",
+ "    Returns the (item_descriptor, quantity_category) pair stored in DESCRIPTOR_LOOKUP\n",
+ "    for the right-most token that has an entry, or (None, None) when the phrase is\n",
+ "    None or contains no known token.\n",
+ "    \"\"\"\n",
+ "    if text is None:\n",
+ "        return None, None\n",
+ "    words = re.findall(r\"[a-z]+(?:[-/][a-z]+)*\", text.lower())\n",
+ "    # Consume the words from the end of the phrase so the right-most known word wins.\n",
+ "    while words:\n",
+ "        entry = DESCRIPTOR_LOOKUP.get(words.pop())\n",
+ "        if entry is not None:\n",
+ "            return entry[\"item_descriptor\"], entry[\"quantity_category\"]\n",
+ "    return None, None\n",
+ "\n",
+ "\n",
+ "def extract_measure_occurrences(text: str) -> list[dict]:\n",
+ "    \"\"\"Collect every value-plus-unit occurrence in the text that converts to a base unit.\n",
+ "\n",
+ "    Matches whose number cannot be parsed, or for which to_base_value leaves the value,\n",
+ "    base unit, or category unset, are left out. Returns an empty list when\n",
+ "    PY_MEASURE_OCCURRENCE_REGEX is None.\n",
+ "    \"\"\"\n",
+ "    found: list[dict] = []\n",
+ "    if PY_MEASURE_OCCURRENCE_REGEX is None:\n",
+ "        return found\n",
+ "    for hit in PY_MEASURE_OCCURRENCE_REGEX.finditer(text):\n",
+ "        value_text, unit = hit.group(1), hit.group(2)\n",
+ "        value = parse_numeric_text(value_text)\n",
+ "        if value is None:\n",
+ "            continue\n",
+ "        normalized_value, base_unit, category = to_base_value(value, unit)\n",
+ "        if any(part is None for part in (normalized_value, base_unit, category)):\n",
+ "            continue\n",
+ "        occurrence = {\n",
+ "            \"value_text\": value_text,\n",
+ "            \"value\": value,\n",
+ "            \"unit\": unit,\n",
+ "            \"normalized_value\": normalized_value,\n",
+ "            \"base_unit\": base_unit,\n",
+ "            \"category\": category,\n",
+ "        }\n",
+ "        found.append(occurrence)\n",
+ "    return found\n",
+ "\n",
+ "\n",
+ "def build_multipack_occurrence(\n",
+ "    count_text: str | None,\n",
+ "    descriptor_text: str | None,\n",
+ "    inner_value_text: str | None,\n",
+ "    unit_token: str | None,\n",
+ ") -> dict | None:\n",
+ "    \"\"\"Describe one count-times-size match as a multipack record.\n",
+ "\n",
+ "    Returns None unless the count parses to more than one, the inner value parses,\n",
+ "    a unit token is present, and to_base_value places the unit in the mass or volume\n",
+ "    category. Otherwise returns a dict with the count, the inner and total normalized\n",
+ "    values, the base unit and category, and the descriptor looked up from\n",
+ "    descriptor_text (both descriptor fields may be None).\n",
+ "    \"\"\"\n",
+ "    count_numeric = parse_count_token(count_text)\n",
+ "    inner_value = parse_numeric_text(inner_value_text)\n",
+ "    if count_numeric is None or count_numeric <= 1:\n",
+ "        return None\n",
+ "    if inner_value is None or unit_token is None:\n",
+ "        return None\n",
+ "\n",
+ "    inner_normalized_value, base_unit, category = to_base_value(inner_value, unit_token)\n",
+ "    if inner_normalized_value is None or base_unit is None:\n",
+ "        return None\n",
+ "    if category not in {\"mass\", \"volume\"}:\n",
+ "        return None\n",
+ "\n",
+ "    item_descriptor, descriptor_category = descriptor_from_phrase(descriptor_text)\n",
+ "    occurrence = {\n",
+ "        \"count_numeric\": count_numeric,\n",
+ "        \"inner_normalized_value\": inner_normalized_value,\n",
+ "        \"total_normalized_value\": count_numeric * inner_normalized_value,\n",
+ "        \"base_unit\": base_unit,\n",
+ "        \"category\": category,\n",
+ "        \"item_descriptor\": item_descriptor,\n",
+ "        \"descriptor_category\": descriptor_category,\n",
+ "    }\n",
+ "    return occurrence\n",
+ "\n",
+ "\n",
+ "def resolve_complex_quantity_json(text: str | None) -> str:\n",
+ " if text is None or text == \"\":\n",
+ " return json.dumps({})\n",
+ "\n",
+ " measures = extract_measure_occurrences(text)\n",
+ " contextual_mixed = bool(PY_CONTEXTUAL_MIXED_KEYWORD_REGEX and PY_CONTEXTUAL_MIXED_KEYWORD_REGEX.search(text))\n",
+ "\n",
+ " descriptor_occurrences: list[dict] = []\n",
+ " occupied_spans: list[tuple[int, int]] = []\n",
+ " if PY_DESCRIPTOR_MULTIPACK_ANY_REGEX is not None:\n",
+ " for match in PY_DESCRIPTOR_MULTIPACK_ANY_REGEX.finditer(text):\n",
+ " occurrence = build_multipack_occurrence(match.group(1), match.group(2), match.group(3), match.group(4))\n",
+ " if occurrence is None:\n",
+ " continue\n",
+ " descriptor_occurrences.append(occurrence)\n",
+ " occupied_spans.append(match.span())\n",
+ "\n",
+ " occurrences = descriptor_occurrences[:]\n",
+ " if PY_PLAIN_MULTIPACK_ANY_REGEX is not None:\n",
+ " for match in PY_PLAIN_MULTIPACK_ANY_REGEX.finditer(text):\n",
+ " start, end = match.span()\n",
+ " if any(start >= span_start and end <= span_end for span_start, span_end in occupied_spans):\n",
+ " continue\n",
+ " occurrence = build_multipack_occurrence(match.group(1), None, match.group(2), match.group(3))\n",
+ " if occurrence is None:\n",
+ " continue\n",
+ " occurrences.append(occurrence)\n",
+ "\n",
+ " if occurrences:\n",
+ " primary = occurrences[0]\n",
+ " consistent_totals = all(\n",
+ " occurrence[\"base_unit\"] == primary[\"base_unit\"]\n",
+ " and occurrence[\"category\"] == primary[\"category\"]\n",
+ " and normalized_values_close(\n",
+ " occurrence[\"total_normalized_value\"],\n",
+ " primary[\"total_normalized_value\"],\n",
+ " primary[\"base_unit\"],\n",
+ " )\n",
+ " for occurrence in occurrences\n",
+ " )\n",
+ " if not consistent_totals:\n",
+ " return json.dumps(\n",
+ " {\n",
+ " \"status\": \"mixed_conflict\",\n",
+ " \"category\": \"mixed_measure\",\n",
+ " \"note\": \"mixed expression contradictory\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " same_pack_structure = all(\n",
+ " normalized_values_close(\n",
+ " occurrence[\"count_numeric\"],\n",
+ " primary[\"count_numeric\"],\n",
+ " None,\n",
+ " )\n",
+ " and normalized_values_close(\n",
+ " occurrence[\"inner_normalized_value\"],\n",
+ " primary[\"inner_normalized_value\"],\n",
+ " primary[\"base_unit\"],\n",
+ " )\n",
+ " for occurrence in occurrences\n",
+ " )\n",
+ " if len(occurrences) > 1 and not same_pack_structure:\n",
+ " return json.dumps(\n",
+ " {\n",
+ " \"status\": \"contextual_mixed_unresolved\" if contextual_mixed else \"mixed_unresolved\",\n",
+ " \"category\": \"mixed_measure\",\n",
+ " \"note\": \"mixed expression not safely reducible\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " inner_values = [occurrence[\"inner_normalized_value\"] for occurrence in occurrences]\n",
+ " has_matching_total_measure = False\n",
+ " has_conflicting_same_category_measure = False\n",
+ "\n",
+ " for measure in measures:\n",
+ " if measure[\"base_unit\"] != primary[\"base_unit\"] or measure[\"category\"] != primary[\"category\"]:\n",
+ " continue\n",
+ " if any(\n",
+ " normalized_values_close(measure[\"normalized_value\"], inner_value, primary[\"base_unit\"])\n",
+ " for inner_value in inner_values\n",
+ " ):\n",
+ " continue\n",
+ " if normalized_values_close(\n",
+ " measure[\"normalized_value\"],\n",
+ " primary[\"total_normalized_value\"],\n",
+ " primary[\"base_unit\"],\n",
+ " ):\n",
+ " has_matching_total_measure = True\n",
+ " else:\n",
+ " has_conflicting_same_category_measure = True\n",
+ "\n",
+ " if has_conflicting_same_category_measure:\n",
+ " return json.dumps(\n",
+ " {\n",
+ " \"status\": \"contextual_mixed_unresolved\" if contextual_mixed else \"mixed_conflict\",\n",
+ " \"category\": \"mixed_measure\",\n",
+ " \"note\": \"mixed expression not safely reducible\" if contextual_mixed else \"mixed expression contradictory\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " note = \"embedded multipack resolved from quantity text\"\n",
+ " if has_matching_total_measure:\n",
+ " note = \"embedded multipack total confirmed by surrounding text\"\n",
+ " if primary[\"item_descriptor\"] is not None:\n",
+ " note = \"embedded descriptor multipack resolved from quantity text\"\n",
+ " if has_matching_total_measure:\n",
+ " note = \"embedded descriptor multipack total confirmed by surrounding text\"\n",
+ "\n",
+ " return json.dumps(\n",
+ " {\n",
+ " \"status\": \"embedded_descriptor_multipack\" if primary[\"item_descriptor\"] is not None else \"embedded_plain_multipack\",\n",
+ " \"category\": \"multipack_measure\",\n",
+ " \"normalized_value\": primary[\"total_normalized_value\"],\n",
+ " \"normalized_unit\": primary[\"base_unit\"],\n",
+ " \"inner_normalized_value\": primary[\"inner_normalized_value\"],\n",
+ " \"pack_count\": primary[\"count_numeric\"],\n",
+ " \"item_descriptor\": primary[\"item_descriptor\"],\n",
+ " \"note\": note,\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " if len(measures) >= 2 and contextual_mixed:\n",
+ " return json.dumps(\n",
+ " {\n",
+ " \"status\": \"contextual_mixed_unresolved\",\n",
+ " \"category\": \"mixed_measure\",\n",
+ " \"note\": \"mixed expression not safely reducible\",\n",
+ " }\n",
+ " )\n",
+ "\n",
+ " return json.dumps({})\n",
+ "\n",
+ "\n",
+ "# Best-effort removal of helper functions registered on this connection by an earlier\n",
+ "# run of this cell, so the registrations that follow start from a clean slate.\n",
+ "for function_name in [\n",
+ "    \"safe_quantity_cleanup\",\n",
+ "    \"has_ocr_like_numeric_token\",\n",
+ "    \"has_non_ascii_alpha\",\n",
+ "    \"resolve_complex_quantity_json\",\n",
+ "    \"resolve_mixed_measure_json\",\n",
+ "]:\n",
+ "    try:\n",
+ "        con.remove_function(function_name)\n",
+ "    except duckdb.Error:\n",
+ "        # DuckDB raises one of its own error types when the function is not registered,\n",
+ "        # which is the expected case on a fresh connection. Only those are ignored;\n",
+ "        # anything else (for example a NameError) is a real bug and must stay visible.\n",
+ "        pass\n",
+ "\n",
+ "# Register the Python helpers as DuckDB scalar functions. Each row holds the SQL name,\n",
+ "# the callable, its parameter types, its return type, and any extra keyword options.\n",
+ "for udf_name, udf_callable, udf_parameter_types, udf_return_type, udf_options in [\n",
+ "    (\"safe_quantity_cleanup\", safe_quantity_cleanup, [sqltypes.VARCHAR], sqltypes.VARCHAR, {}),\n",
+ "    (\"has_ocr_like_numeric_token\", has_ocr_like_numeric_token, [sqltypes.VARCHAR], sqltypes.BOOLEAN, {}),\n",
+ "    (\"has_non_ascii_alpha\", has_non_ascii_alpha, [sqltypes.VARCHAR], sqltypes.BOOLEAN, {}),\n",
+ "    (\n",
+ "        \"resolve_complex_quantity_json\",\n",
+ "        resolve_complex_quantity_json,\n",
+ "        [sqltypes.VARCHAR],\n",
+ "        sqltypes.VARCHAR,\n",
+ "        {\"null_handling\": \"special\"},\n",
+ "    ),\n",
+ "    (\n",
+ "        # Four value/unit pairs, passed as eight VARCHAR arguments.\n",
+ "        \"resolve_mixed_measure_json\",\n",
+ "        resolve_mixed_measure_json,\n",
+ "        [sqltypes.VARCHAR] * 8,\n",
+ "        sqltypes.VARCHAR,\n",
+ "        {\"null_handling\": \"special\"},\n",
+ "    ),\n",
+ "]:\n",
+ "    con.create_function(udf_name, udf_callable, udf_parameter_types, udf_return_type, **udf_options)\n",
+ "\n",
+ "# Count and packaging descriptors share one lookup keyed by alias token; when a token\n",
+ "# appears more than once, the later row replaces the earlier one.\n",
+ "descriptor_rows = COUNT_DESCRIPTOR_ALIASES + PACKAGING_DESCRIPTOR_ALIASES\n",
+ "DESCRIPTOR_LOOKUP = dict(\n",
+ "    (token, {\"item_descriptor\": item_descriptor, \"quantity_category\": quantity_category})\n",
+ "    for token, item_descriptor, quantity_category in descriptor_rows\n",
+ ")\n",
+ "\n",
+ "con.execute(\n",
+ " f'''\n",
+ " CREATE OR REPLACE TEMP VIEW measure_aliases AS\n",
+ " SELECT *\n",
+ " FROM (VALUES\n",
+ " {values_sql(MEASURE_ALIASES)}\n",
+ " ) AS alias_rows(token, base_unit, quantity_category, factor);\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "con.execute(\n",
+ " f'''\n",
+ " CREATE OR REPLACE TEMP VIEW descriptor_aliases AS\n",
+ " SELECT *\n",
+ " FROM (VALUES\n",
+ " {values_sql(descriptor_rows)}\n",
+ " ) AS alias_rows(token, item_descriptor, quantity_category);\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "con.execute(\n",
+ " f'''\n",
+ " CREATE OR REPLACE TEMP VIEW household_aliases AS\n",
+ " SELECT *\n",
+ " FROM (VALUES\n",
+ " {values_sql(HOUSEHOLD_ALIASES)}\n",
+ " ) AS alias_rows(token, item_descriptor);\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "NUMBER_PATTERN = r\"(?:[0-9]+(?:[.,][0-9]+)?|[.,][0-9]+)\"\n",
+ "FRACTION_OR_NUMBER_PATTERN = rf\"(?:[0-9]+/[0-9]+|{NUMBER_PATTERN})\"\n",
+ "FREE_DESCRIPTOR_TOKEN_PATTERN = r\"[a-z]+(?:[-/][a-z]+)*\"\n",
+ "FREE_DESCRIPTOR_PHRASE_PATTERN = rf\"{FREE_DESCRIPTOR_TOKEN_PATTERN}(?:\\s+{FREE_DESCRIPTOR_TOKEN_PATTERN}){{0,2}}\"\n",
+ "MEASURE_PATTERN = build_token_pattern([row[0] for row in MEASURE_ALIASES])\n",
+ "COUNT_DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in COUNT_DESCRIPTOR_ALIASES])\n",
+ "PACKAGING_DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in PACKAGING_DESCRIPTOR_ALIASES])\n",
+ "DESCRIPTOR_PATTERN = build_token_pattern([row[0] for row in descriptor_rows])\n",
+ "HOUSEHOLD_PATTERN = build_token_pattern([row[0] for row in HOUSEHOLD_ALIASES])\n",
+ "PLACEHOLDER_PATTERN = build_token_pattern(PLACEHOLDER_TERMS)\n",
+ "STRUCTURAL_TAIL_PATTERN = build_token_pattern(STRUCTURAL_TAIL_TOKENS)\n",
+ "APPROVED_MULTILINGUAL_PATTERN = build_token_pattern(APPROVED_MULTILINGUAL_ALIAS_TOKENS)\n",
+ "\n",
+ "MEASURE_ANY_REGEX = rf\"\\b{MEASURE_PATTERN}\\b\"\n",
+ "HOUSEHOLD_ANY_REGEX = rf\"\\b{HOUSEHOLD_PATTERN}\\b\"\n",
+ "PLACEHOLDER_REGEX = rf\"^(?:{PLACEHOLDER_PATTERN}|\\?+)$\"\n",
+ "SIMPLE_MEASURE_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\s*[.]?\\s*$\"\n",
+ "MULTIPACK_MEASURE_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*[*x]\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b(?:\\s*(?:{STRUCTURAL_TAIL_PATTERN}))?\\s*[.]?\\s*$\"\n",
+ "DESCRIPTOR_MULTIPACK_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)\\s+({FREE_DESCRIPTOR_PHRASE_PATTERN})\\b\\s*(?:de|of|[*x])\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b(?:\\s*(?:{STRUCTURAL_TAIL_PATTERN}))?\\s*[.]?\\s*$\"\n",
+ "TOTAL_PLUS_COUNT_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\\s*(?:/|-|,)\\s*({NUMBER_PATTERN}|un|une|a|an)\\s*({DESCRIPTOR_PATTERN})\\b(?:\\s*(?:{STRUCTURAL_TAIL_PATTERN}))?\\s*[.]?\\s*$\"\n",
+ "TOTAL_PLUS_INNER_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\\s*(?:/|-|,)\\s*({NUMBER_PATTERN}|un|une|a|an)\\s*({FREE_DESCRIPTOR_PHRASE_PATTERN})\\b\\s*(?:de|of|[*x])\\s*({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b(?:\\s*(?:{STRUCTURAL_TAIL_PATTERN}))?\\s*[.]?\\s*$\"\n",
+ "DESCRIPTOR_WITH_MEASURE_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)?\\s*({DESCRIPTOR_PATTERN})\\b.*?({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\"\n",
+ "DESCRIPTOR_ONLY_REGEX = rf\"^\\s*({NUMBER_PATTERN}|un|une|a|an)?\\s*({DESCRIPTOR_PATTERN})\\b.*$\"\n",
+ "PER_PACKAGING_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s+par\\s+({PACKAGING_DESCRIPTOR_PATTERN})\\b\"\n",
+ "HOUSEHOLD_REGEX = rf\"^\\s*({FRACTION_OR_NUMBER_PATTERN})\\s*({HOUSEHOLD_PATTERN})\\b\"\n",
+ "NUMBER_ONLY_REGEX = rf\"^\\s*({NUMBER_PATTERN})\\s*$\"\n",
+ "MEASURE_VALUE_UNIT_REGEX = rf\"({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\"\n",
+ "\n",
+ "APPROVED_MULTILINGUAL_REGEX = rf\"\\b{APPROVED_MULTILINGUAL_PATTERN}\\b\"\n",
+ "CONTEXTUAL_MIXED_ANY_REGEX = r\"(?:/ch\\b|\\b(?:capacity|capacite|ch|dr|drain|drained|dw|egoutte|egouttee|each|ea|net|par|per|poids|portion|qt|serving|servings|scoop|scoops|total|weight|wt)\\b|\\bu\\b\\s*[*x])\"\n",
+ "PY_MEASURE_OCCURRENCE_REGEX = re.compile(rf\"({NUMBER_PATTERN})\\s*({MEASURE_PATTERN})\\b\")\n",
+ "PY_PLAIN_MULTIPACK_ANY_REGEX = re.compile(\n",
+ " rf\"(? 1 AND descriptor_multipack_inner_value_numeric IS NOT NULL AND descriptor_multipack_factor IS NOT NULL THEN descriptor_multipack_inner_value_numeric * descriptor_multipack_factor END AS descriptor_multipack_inner_normalized_value,\n",
+ " CASE WHEN descriptor_multipack_count_numeric IS NOT NULL AND descriptor_multipack_count_numeric > 1 AND descriptor_multipack_inner_value_numeric IS NOT NULL AND descriptor_multipack_factor IS NOT NULL THEN descriptor_multipack_count_numeric * descriptor_multipack_inner_value_numeric * descriptor_multipack_factor END AS descriptor_multipack_total_normalized_value,\n",
+ " CASE WHEN total_plus_value_numeric IS NOT NULL AND total_plus_factor IS NOT NULL THEN total_plus_value_numeric * total_plus_factor END AS total_plus_total_normalized_value,\n",
+ " CASE WHEN total_plus_inner_total_value_numeric IS NOT NULL AND total_plus_inner_total_factor IS NOT NULL THEN total_plus_inner_total_value_numeric * total_plus_inner_total_factor END AS total_plus_inner_total_normalized_value,\n",
+ " CASE WHEN total_plus_inner_value_numeric IS NOT NULL AND total_plus_inner_factor IS NOT NULL THEN total_plus_inner_value_numeric * total_plus_inner_factor END AS total_plus_inner_normalized_value,\n",
+ " CASE WHEN total_plus_inner_count_numeric IS NOT NULL AND total_plus_inner_value_numeric IS NOT NULL AND total_plus_inner_factor IS NOT NULL THEN total_plus_inner_count_numeric * total_plus_inner_value_numeric * total_plus_inner_factor END AS total_plus_inner_derived_total_normalized_value,\n",
+ " CASE WHEN descriptor_measure_value_numeric IS NOT NULL AND descriptor_measure_factor IS NOT NULL THEN descriptor_measure_value_numeric * descriptor_measure_factor END AS descriptor_measure_normalized_value,\n",
+ " CASE WHEN simple_value_numeric IS NOT NULL AND simple_factor IS NOT NULL THEN simple_value_numeric * simple_factor END AS simple_normalized_value,\n",
+ " resolve_mixed_measure_json(\n",
+ " measure_value_1_text,\n",
+ " measure_unit_1_token,\n",
+ " measure_value_2_text,\n",
+ " measure_unit_2_token,\n",
+ " measure_value_3_text,\n",
+ " measure_unit_3_token,\n",
+ " measure_value_4_text,\n",
+ " measure_unit_4_token\n",
+ " ) AS mixed_measure_resolution_json,\n",
+ " resolve_complex_quantity_json(quantity_normalized) AS complex_quantity_resolution_json\n",
+ " FROM joined\n",
+ "),\n",
+ "calculated AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " NULLIF(json_extract_string(complex_quantity_resolution_json, '$.status'), '') AS complex_resolution_status,\n",
+ " NULLIF(json_extract_string(complex_quantity_resolution_json, '$.category'), '') AS complex_resolution_category,\n",
+ " TRY_CAST(json_extract_string(complex_quantity_resolution_json, '$.normalized_value') AS DOUBLE) AS complex_resolution_value,\n",
+ " NULLIF(json_extract_string(complex_quantity_resolution_json, '$.normalized_unit'), '') AS complex_resolution_unit,\n",
+ " TRY_CAST(json_extract_string(complex_quantity_resolution_json, '$.inner_normalized_value') AS DOUBLE) AS complex_resolution_inner_value,\n",
+ " TRY_CAST(json_extract_string(complex_quantity_resolution_json, '$.pack_count') AS DOUBLE) AS complex_resolution_pack_count,\n",
+ " NULLIF(json_extract_string(complex_quantity_resolution_json, '$.item_descriptor'), '') AS complex_resolution_item_descriptor,\n",
+ " NULLIF(json_extract_string(complex_quantity_resolution_json, '$.note'), '') AS complex_resolution_note,\n",
+ " NULLIF(json_extract_string(mixed_measure_resolution_json, '$.status'), '') AS mixed_resolution_status,\n",
+ " NULLIF(json_extract_string(mixed_measure_resolution_json, '$.category'), '') AS mixed_resolution_category,\n",
+ " TRY_CAST(json_extract_string(mixed_measure_resolution_json, '$.normalized_value') AS DOUBLE) AS mixed_resolution_value,\n",
+ " NULLIF(json_extract_string(mixed_measure_resolution_json, '$.normalized_unit'), '') AS mixed_resolution_unit,\n",
+ " NULLIF(json_extract_string(mixed_measure_resolution_json, '$.note'), '') AS mixed_resolution_note,\n",
+ " NULLIF(json_extract_string(mixed_measure_resolution_json, '$.system_used'), '') AS mixed_resolution_system,\n",
+ " CASE\n",
+ " WHEN total_plus_inner_total_normalized_value IS NOT NULL\n",
+ " AND total_plus_inner_derived_total_normalized_value IS NOT NULL\n",
+ " AND total_plus_inner_total_base_unit = total_plus_inner_base_unit\n",
+ " THEN abs(total_plus_inner_total_normalized_value - total_plus_inner_derived_total_normalized_value)\n",
+ " <= greatest(\n",
+ " 0.01,\n",
+ " {VALUE_MATCH_TOLERANCE} * greatest(\n",
+ " abs(total_plus_inner_total_normalized_value),\n",
+ " abs(total_plus_inner_derived_total_normalized_value)\n",
+ " )\n",
+ " )\n",
+ " ELSE FALSE\n",
+ " END AS total_plus_inner_matches_total\n",
+ " FROM typed\n",
+ "),\n",
+ "classified AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN NULL\n",
+ " WHEN quantity_normalized = '' THEN 'blank'\n",
+ " WHEN quantity_is_placeholder THEN 'placeholder_or_unknown'\n",
+ " WHEN quantity_has_household_token THEN 'household_unit'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category IN ('mass', 'volume') THEN 'simple_measure'\n",
+ " WHEN simple_normalized_value IS NOT NULL AND simple_category = 'energy' THEN 'simple_energy'\n",
+ " WHEN multipack_normalized_value IS NOT NULL AND multipack_category IN ('mass', 'volume') THEN 'plain_multipack'\n",
+ " WHEN descriptor_multipack_total_normalized_value IS NOT NULL AND descriptor_multipack_category IN ('mass', 'volume') THEN 'descriptor_multipack'\n",
+ " WHEN total_plus_inner_matches_total THEN 'total_plus_inner_multipack'\n",
+ " WHEN total_plus_total_normalized_value IS NOT NULL\n",
+ " AND total_plus_measure_category IN ('mass', 'volume')\n",
+ " AND total_plus_count_numeric IS NOT NULL\n",
+ " AND total_plus_descriptor_category IN ('count', 'packaging_only')\n",
+ " THEN 'total_plus_count'\n",
+ " WHEN complex_resolution_status IS NOT NULL THEN complex_resolution_status\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category IN ('mass', 'volume') THEN 'descriptor_measure'\n",
+ " WHEN descriptor_measure_normalized_value IS NOT NULL AND descriptor_measure_category = 'energy' THEN 'descriptor_measure_energy'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'count' THEN 'count_descriptor'\n",
+ " WHEN descriptor_only_item_descriptor IS NOT NULL AND descriptor_only_category = 'packaging_only' THEN 'packaging_only'\n",
+ " WHEN per_pack_item_descriptor IS NOT NULL THEN 'per_packaging'\n",
+ " WHEN mixed_resolution_status = 'mixed_conflict' AND quantity_has_contextual_mixed_token THEN 'contextual_mixed_unresolved'\n",
+ " WHEN mixed_resolution_status IS NOT NULL THEN mixed_resolution_status\n",
+ " WHEN quantity_has_non_ascii_alpha THEN 'unsupported_foreign_unit'\n",
+ " WHEN quantity_has_ocr_like_numeric_token THEN 'ocr_like_numeric_token'\n",
+ " WHEN quantity_is_number_only THEN 'number_only'\n",
+ " WHEN NOT quantity_has_digits THEN 'noise_or_non_quantity'\n",
+ " WHEN quantity_has_measure_token THEN 'mixed_unresolved'\n",
+ " ELSE 'unparsed'\n",
+ " END AS text_subtype\n",
+ " FROM calculated\n",
+ ")\n",
+ "SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN NULL\n",
+ " WHEN text_subtype IN (\n",
+ " 'plain_multipack',\n",
+ " 'descriptor_multipack',\n",
+ " 'embedded_plain_multipack',\n",
+ " 'embedded_descriptor_multipack',\n",
+ " 'total_plus_count',\n",
+ " 'total_plus_inner_multipack',\n",
+ " 'dual_unit_equivalent',\n",
+ " 'compound_imperial_equivalent',\n",
+ " 'descriptor_measure',\n",
+ " 'simple_measure',\n",
+ " 'count_descriptor'\n",
+ " ) THEN 'resolved'\n",
+ " WHEN text_subtype IN ('descriptor_measure_energy', 'simple_energy', 'packaging_only', 'per_packaging') THEN 'partial'\n",
+ " WHEN text_subtype = 'mixed_conflict' THEN 'conflict'\n",
+ " ELSE 'unresolved'\n",
+ " END AS text_status,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN NULL\n",
+ " WHEN text_subtype = 'placeholder_or_unknown' THEN 'placeholder_or_unknown'\n",
+ " WHEN text_subtype = 'household_unit' THEN 'household_unit'\n",
+ " WHEN text_subtype IN ('plain_multipack', 'descriptor_multipack', 'embedded_plain_multipack', 'embedded_descriptor_multipack', 'total_plus_count', 'total_plus_inner_multipack') THEN 'multipack_measure'\n",
+ " WHEN text_subtype IN ('dual_unit_equivalent', 'compound_imperial_equivalent') THEN mixed_resolution_category\n",
+ " WHEN text_subtype IN ('contextual_mixed_unresolved', 'mixed_conflict', 'mixed_unresolved') THEN 'mixed_measure'\n",
+ " WHEN text_subtype IN ('descriptor_measure', 'simple_measure') THEN coalesce(descriptor_measure_category, simple_category)\n",
+ " WHEN text_subtype IN ('descriptor_measure_energy', 'simple_energy') THEN 'energy'\n",
+ " WHEN text_subtype = 'count_descriptor' THEN 'count'\n",
+ " WHEN text_subtype IN ('packaging_only', 'per_packaging') THEN 'packaging_only'\n",
+ " WHEN text_subtype = 'noise_or_non_quantity' THEN 'noise_or_non_quantity'\n",
+ " ELSE NULL\n",
+ " END AS text_category,\n",
+ " CASE\n",
+ " WHEN text_subtype = 'plain_multipack' THEN multipack_normalized_value\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN descriptor_multipack_total_normalized_value\n",
+ " WHEN text_subtype IN ('embedded_plain_multipack', 'embedded_descriptor_multipack') THEN complex_resolution_value\n",
+ " WHEN text_subtype = 'total_plus_count' THEN total_plus_total_normalized_value\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN total_plus_inner_total_normalized_value\n",
+ " WHEN text_subtype IN ('dual_unit_equivalent', 'compound_imperial_equivalent') THEN mixed_resolution_value\n",
+ " WHEN text_subtype IN ('descriptor_measure', 'descriptor_measure_energy') THEN descriptor_measure_normalized_value\n",
+ " WHEN text_subtype IN ('simple_measure', 'simple_energy') THEN simple_normalized_value\n",
+ " WHEN text_subtype = 'count_descriptor' THEN coalesce(descriptor_only_count_numeric, 1.0)\n",
+ " ELSE NULL\n",
+ " END AS text_normalized_value,\n",
+ " CASE\n",
+ " WHEN text_subtype = 'plain_multipack' THEN multipack_base_unit\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN descriptor_multipack_base_unit\n",
+ " WHEN text_subtype IN ('embedded_plain_multipack', 'embedded_descriptor_multipack') THEN complex_resolution_unit\n",
+ " WHEN text_subtype = 'total_plus_count' THEN total_plus_base_unit\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN total_plus_inner_total_base_unit\n",
+ " WHEN text_subtype IN ('dual_unit_equivalent', 'compound_imperial_equivalent') THEN mixed_resolution_unit\n",
+ " WHEN text_subtype IN ('descriptor_measure', 'descriptor_measure_energy') THEN descriptor_measure_base_unit\n",
+ " WHEN text_subtype IN ('simple_measure', 'simple_energy') THEN simple_base_unit\n",
+ " WHEN text_subtype = 'count_descriptor' THEN 'count'\n",
+ " ELSE NULL\n",
+ " END AS text_normalized_unit,\n",
+ " CASE\n",
+ " WHEN text_subtype = 'plain_multipack' THEN multipack_inner_normalized_value\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN descriptor_multipack_inner_normalized_value\n",
+ " WHEN text_subtype IN ('embedded_plain_multipack', 'embedded_descriptor_multipack') THEN complex_resolution_inner_value\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN total_plus_inner_normalized_value\n",
+ " ELSE NULL\n",
+ " END AS text_inner_normalized_value,\n",
+ " CASE\n",
+ " WHEN text_subtype = 'plain_multipack' THEN multipack_count_numeric\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN descriptor_multipack_count_numeric\n",
+ " WHEN text_subtype IN ('embedded_plain_multipack', 'embedded_descriptor_multipack') THEN complex_resolution_pack_count\n",
+ " WHEN text_subtype = 'total_plus_count' THEN total_plus_count_numeric\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN total_plus_inner_count_numeric\n",
+ " WHEN text_subtype = 'count_descriptor' THEN 1.0\n",
+ " WHEN text_subtype IN ('packaging_only', 'per_packaging') THEN coalesce(descriptor_only_count_numeric, per_pack_count_numeric, 1.0)\n",
+ " WHEN text_subtype IN ('descriptor_measure', 'descriptor_measure_energy', 'simple_measure', 'simple_energy', 'dual_unit_equivalent', 'compound_imperial_equivalent') THEN 1.0\n",
+ " ELSE NULL\n",
+ " END AS text_pack_count,\n",
+ " CASE\n",
+ " WHEN text_subtype = 'total_plus_count' THEN total_plus_item_descriptor\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN total_plus_inner_item_descriptor\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN descriptor_multipack_item_descriptor\n",
+ " WHEN text_subtype = 'embedded_descriptor_multipack' THEN complex_resolution_item_descriptor\n",
+ " WHEN text_subtype IN ('descriptor_measure', 'descriptor_measure_energy') THEN descriptor_measure_item_descriptor\n",
+ " WHEN text_subtype IN ('count_descriptor', 'packaging_only') THEN descriptor_only_item_descriptor\n",
+ " WHEN text_subtype = 'per_packaging' THEN per_pack_item_descriptor\n",
+ " ELSE NULL\n",
+ " END AS text_item_descriptor,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN 'quantity missing'\n",
+ " WHEN text_subtype = 'blank' THEN 'blank quantity string'\n",
+ " WHEN text_subtype = 'placeholder_or_unknown' THEN 'placeholder or unknown text'\n",
+ " WHEN text_subtype = 'household_unit' THEN 'household unit not used for consolidation'\n",
+ " WHEN text_subtype = 'plain_multipack' THEN 'derived total from quantity multipack'\n",
+ " WHEN text_subtype = 'descriptor_multipack' THEN 'derived total from quantity descriptor multipack'\n",
+ " WHEN text_subtype IN ('embedded_plain_multipack', 'embedded_descriptor_multipack', 'contextual_mixed_unresolved') THEN complex_resolution_note\n",
+ " WHEN text_subtype = 'total_plus_count' THEN 'total size plus count metadata from quantity'\n",
+ " WHEN text_subtype = 'total_plus_inner_multipack' THEN 'explicit total matches descriptor-based inner multipack math'\n",
+ " WHEN text_subtype = 'compound_imperial_equivalent' THEN 'compound imperial text normalized against metric total'\n",
+ " WHEN text_subtype = 'dual_unit_equivalent' AND quantity_ocr_cleanup_applied THEN 'equivalent dual-unit text normalized after OCR cleanup'\n",
+ " WHEN text_subtype = 'dual_unit_equivalent' THEN 'equivalent dual-unit text normalized'\n",
+ " WHEN text_subtype = 'mixed_conflict' THEN 'mixed expression contradictory'\n",
+ " WHEN text_subtype = 'mixed_unresolved' THEN 'mixed expression not safely reducible'\n",
+ " WHEN text_subtype = 'descriptor_measure_energy' THEN 'energy text captured but not used for consolidation'\n",
+ " WHEN text_subtype = 'descriptor_measure' AND quantity_ocr_cleanup_applied THEN 'descriptor plus measure from quantity after OCR cleanup'\n",
+ " WHEN text_subtype = 'descriptor_measure' THEN 'descriptor plus measure from quantity'\n",
+ " WHEN text_subtype = 'simple_energy' THEN 'energy text captured but not used for consolidation'\n",
+ " WHEN text_subtype = 'simple_measure' AND quantity_ocr_cleanup_applied THEN 'simple measure from quantity after OCR cleanup'\n",
+ " WHEN text_subtype = 'simple_measure' THEN 'simple measure from quantity'\n",
+ " WHEN text_subtype = 'count_descriptor' THEN 'count descriptor from quantity'\n",
+ " WHEN text_subtype = 'packaging_only' THEN 'packaging only, no comparable size'\n",
+ " WHEN text_subtype = 'per_packaging' THEN 'per packaging phrase, no comparable size'\n",
+ " WHEN text_subtype = 'unsupported_foreign_unit' THEN 'unsupported foreign unit token'\n",
+ " WHEN text_subtype = 'ocr_like_numeric_token' THEN 'ocr-like numeric token'\n",
+ " WHEN text_subtype = 'number_only' THEN 'number only, unit missing'\n",
+ " WHEN text_subtype = 'noise_or_non_quantity' THEN 'non-quantity text in quantity field'\n",
+ " ELSE 'unparsed quantity text'\n",
+ " END AS text_note\n",
+ "FROM classified;\n",
+ "'''\n",
+ "\n",
+ "con.execute(quantity_features_sql)\n",
+ "\n",
+ "show_query(\n",
+ " \"Current parsed text signal summary\",\n",
+ " '''\n",
+ " SELECT COALESCE(text_status, 'null') AS text_status,\n",
+ " COALESCE(text_category, 'null') AS text_category,\n",
+ " COALESCE(text_subtype, 'null') AS text_subtype,\n",
+ " COUNT(*) AS rows\n",
+ " FROM quantity_cleaning_features\n",
+ " GROUP BY 1, 2, 3\n",
+ " ORDER BY rows DESC, text_status, text_category, text_subtype\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d03c74a2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# This cell applies the final cross-field decision rules and produces the single cleaned output view.\n",
+ "# It keeps one user-facing view named `quantity_cleaned` and one internal debug view.\n",
+ "\n",
+ "quantity_cleaned_sql = f'''\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_cleaned AS\n",
+ "WITH staged AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN product_normalized_value IS NOT NULL AND product_category IN ('mass', 'volume') THEN 'resolved'\n",
+ " WHEN product_normalized_value IS NOT NULL AND product_category = 'energy' THEN 'partial'\n",
+ " ELSE NULL\n",
+ " END AS product_status,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume', 'energy')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND text_normalized_unit IS NOT NULL\n",
+ " THEN product_base_unit = text_normalized_unit\n",
+ " ELSE FALSE\n",
+ " END AS comparable_unit_match,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume', 'energy')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " AND text_normalized_unit IS NOT NULL\n",
+ " AND product_base_unit = text_normalized_unit\n",
+ " THEN abs(product_normalized_value - text_normalized_value)\n",
+ " <= CASE\n",
+ " WHEN text_subtype IN ('dual_unit_equivalent', 'compound_imperial_equivalent')\n",
+ " THEN greatest(1.0, 0.045 * greatest(abs(product_normalized_value), abs(text_normalized_value)))\n",
+ " ELSE greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_normalized_value), abs(text_normalized_value)))\n",
+ " END\n",
+ " ELSE FALSE\n",
+ " END AS comparable_value_match,\n",
+ " CASE\n",
+ " WHEN product_category IN ('mass', 'volume')\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_inner_normalized_value IS NOT NULL\n",
+ " AND product_base_unit = text_normalized_unit\n",
+ " THEN abs(product_normalized_value - text_inner_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_normalized_value), abs(text_inner_normalized_value)))\n",
+ " ELSE FALSE\n",
+ " END AS multipack_inner_value_match,\n",
+ " CASE\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " THEN coalesce(\n",
+ " abs(product_quantity_numeric - measure_1_numeric)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(measure_1_numeric))),\n",
+ " FALSE\n",
+ " ) OR coalesce(\n",
+ " abs(product_quantity_numeric - measure_2_numeric)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(measure_2_numeric))),\n",
+ " FALSE\n",
+ " ) OR coalesce(\n",
+ " abs(product_quantity_numeric - measure_3_numeric)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(measure_3_numeric))),\n",
+ " FALSE\n",
+ " ) OR coalesce(\n",
+ " abs(product_quantity_numeric - measure_4_numeric)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(measure_4_numeric))),\n",
+ " FALSE\n",
+ " )\n",
+ " ELSE FALSE\n",
+ " END AS raw_numeric_matches_any_text_measure,\n",
+ " CASE\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN (\n",
+ " abs(product_quantity_numeric - text_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_normalized_value)))\n",
+ " ) OR (\n",
+ " text_inner_normalized_value IS NOT NULL\n",
+ " AND abs(product_quantity_numeric - text_inner_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_inner_normalized_value)))\n",
+ " )\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_category = 'count'\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN product_quantity_numeric = text_normalized_value\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN abs(product_quantity_numeric - text_normalized_value)\n",
+ " <= greatest(0.01, {VALUE_MATCH_TOLERANCE} * greatest(abs(product_quantity_numeric), abs(text_normalized_value)))\n",
+ " ELSE FALSE\n",
+ " END AS raw_numeric_matches_text\n",
+ " FROM quantity_cleaning_features\n",
+ ")\n",
+ "SELECT\n",
+ " row_id,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN 'conflict'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN 'conflict'\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'partial'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND (multipack_inner_value_match OR comparable_value_match)\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN 'resolved'\n",
+ " WHEN product_status = 'resolved' THEN 'resolved'\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND (text_normalized_value IS NOT NULL OR text_category = 'packaging_only')\n",
+ " THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " THEN 'conflict'\n",
+ " WHEN text_status IS NOT NULL THEN text_status\n",
+ " WHEN product_quantity_numeric IS NOT NULL AND product_base_unit IS NULL THEN 'unresolved'\n",
+ " ELSE 'unresolved'\n",
+ " END AS quantity_status,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN coalesce(text_category, product_category)\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND (multipack_inner_value_match OR comparable_value_match)\n",
+ " THEN 'multipack_measure'\n",
+ " WHEN product_status = 'resolved' THEN product_category\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'energy'\n",
+ " WHEN text_status IS NOT NULL THEN text_category\n",
+ " ELSE product_category\n",
+ " END AS quantity_category,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND (multipack_inner_value_match OR comparable_value_match)\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN product_normalized_value\n",
+ " WHEN product_status = 'resolved' THEN product_normalized_value\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN product_normalized_value\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'count'\n",
+ " THEN text_normalized_value\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'packaging_only'\n",
+ " THEN NULL\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " AND text_category <> 'packaging_only'\n",
+ " THEN text_normalized_value\n",
+ " WHEN text_status = 'resolved'\n",
+ " AND text_category <> 'packaging_only'\n",
+ " THEN text_normalized_value\n",
+ " WHEN text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " THEN text_normalized_value\n",
+ " ELSE NULL\n",
+ " END AS normalized_value,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND (multipack_inner_value_match OR comparable_value_match)\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('count', 'packaging_only')\n",
+ " THEN product_base_unit\n",
+ " WHEN product_status = 'resolved' THEN product_base_unit\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN product_base_unit\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'count'\n",
+ " THEN text_normalized_unit\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " AND text_category <> 'packaging_only'\n",
+ " THEN text_normalized_unit\n",
+ " WHEN text_status = 'resolved'\n",
+ " AND text_category <> 'packaging_only'\n",
+ " THEN text_normalized_unit\n",
+ " WHEN text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " THEN text_normalized_unit\n",
+ " ELSE NULL\n",
+ " END AS normalized_unit,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category = 'count'\n",
+ " THEN greatest(coalesce(text_normalized_value, 1.0), 1.0)\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category = 'packaging_only'\n",
+ " THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN product_status = 'resolved' THEN 1.0\n",
+ " WHEN text_status = 'resolved' AND text_category = 'count' THEN 1.0\n",
+ " WHEN text_status = 'partial' AND text_category = 'packaging_only' THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN text_status = 'resolved' AND text_category = 'multipack_measure' THEN coalesce(text_pack_count, 1.0)\n",
+ " WHEN text_status IN ('resolved', 'partial') THEN 1.0\n",
+ " ELSE NULL\n",
+ " END AS pack_count,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN NULL\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_item_descriptor IS NOT NULL\n",
+ " THEN text_item_descriptor\n",
+ " WHEN text_status IN ('resolved', 'partial') THEN text_item_descriptor\n",
+ " ELSE NULL\n",
+ " END AS item_descriptor,\n",
+ "    -- quantity_conflict_flag follows the same rule order as quantity_status above, so it is TRUE\n",
+ "    -- only for rows whose quantity_status is conflict. The two FALSE guard branches stop rows that\n",
+ "    -- an earlier status rule already settled (structured fields resolved, or structured zero\n",
+ "    -- ignored in favour of the quantity text) from being flagged by the raw numeric mismatch rule.\n",
+ "    CASE\n",
+ "        WHEN text_status = 'conflict' THEN TRUE\n",
+ "        WHEN product_status = 'resolved'\n",
+ "             AND text_status IN ('resolved', 'partial')\n",
+ "             AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ "             AND NOT comparable_value_match\n",
+ "             AND NOT multipack_inner_value_match\n",
+ "        THEN TRUE\n",
+ "        WHEN product_status = 'resolved' THEN FALSE\n",
+ "        WHEN product_quantity_numeric = 0\n",
+ "             AND text_status IN ('resolved', 'partial')\n",
+ "             AND (text_normalized_value IS NOT NULL OR text_category = 'packaging_only')\n",
+ "        THEN FALSE\n",
+ "        WHEN product_quantity_numeric IS NOT NULL\n",
+ "             AND product_base_unit IS NULL\n",
+ "             AND text_status IN ('resolved', 'partial')\n",
+ "             AND NOT (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ "        THEN TRUE\n",
+ "        ELSE FALSE\n",
+ "    END AS quantity_conflict_flag,\n",
+ " CASE\n",
+ " WHEN text_status = 'conflict' THEN text_note\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_subtype = 'total_plus_inner_multipack'\n",
+ " AND comparable_value_match\n",
+ " THEN 'structured total matches quantity total-plus-inner multipack'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND text_subtype = 'total_plus_count'\n",
+ " AND comparable_value_match\n",
+ " THEN 'structured total matches quantity total-plus-count'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND multipack_inner_value_match\n",
+ " THEN 'product fields match inner quantity; total derived from quantity multipack'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'multipack_measure'\n",
+ " AND comparable_value_match\n",
+ " THEN 'structured total matches quantity multipack'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_subtype IN ('dual_unit_equivalent', 'compound_imperial_equivalent')\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN 'product fields agree with equivalent mixed-unit text'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category IN ('mass', 'volume')\n",
+ " AND comparable_value_match\n",
+ " THEN 'product fields and quantity agree'\n",
+ " WHEN product_status = 'partial'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'energy'\n",
+ " AND comparable_value_match\n",
+ " THEN 'energy captured consistently but not used for consolidation'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'resolved'\n",
+ " AND text_category = 'count'\n",
+ " THEN 'structured product fields used with count metadata from quantity'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'packaging_only'\n",
+ " THEN 'structured product fields used with packaging metadata from quantity'\n",
+ " WHEN product_status = 'resolved'\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_category IN ('mass', 'volume', 'multipack_measure', 'energy')\n",
+ " AND NOT comparable_value_match\n",
+ " AND NOT multipack_inner_value_match\n",
+ " THEN 'conflict between product fields and quantity text'\n",
+ " WHEN product_status = 'resolved'\n",
+ " THEN 'structured product fields used'\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status = 'partial'\n",
+ " AND text_category = 'packaging_only'\n",
+ " THEN 'structured zero ignored; packaging text kept as partial metadata'\n",
+ " WHEN product_quantity_numeric = 0\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND text_normalized_value IS NOT NULL\n",
+ " THEN 'structured zero ignored; quantity text used'\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " THEN 'filled missing structure from quantity'\n",
+ " WHEN product_quantity_numeric IS NOT NULL\n",
+ " AND product_base_unit IS NULL\n",
+ " AND text_status IN ('resolved', 'partial')\n",
+ " AND NOT (raw_numeric_matches_text OR raw_numeric_matches_any_text_measure)\n",
+ " THEN 'structured numeric value disagrees with quantity text'\n",
+ " WHEN text_status IS NOT NULL THEN text_note\n",
+ " WHEN product_quantity_numeric IS NOT NULL AND product_base_unit IS NULL THEN 'structured numeric value present, but no usable unit found'\n",
+ " ELSE 'no usable quantity signal found'\n",
+ " END AS quantity_note\n",
+ "FROM staged;\n",
+ "'''\n",
+ "\n",
+ "con.execute(quantity_cleaned_sql)\n",
+ "\n",
+ "review_sql = '''\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_cleaning_debug AS\n",
+ "SELECT\n",
+ " q.row_id,\n",
+ " q.code,\n",
+ " q.product_quantity_unit,\n",
+ " q.product_quantity,\n",
+ " q.quantity,\n",
+ " q.quantity_status,\n",
+ " q.quantity_category,\n",
+ " q.normalized_value,\n",
+ " q.normalized_unit,\n",
+ " q.pack_count,\n",
+ " q.item_descriptor,\n",
+ " q.quantity_conflict_flag,\n",
+ " q.quantity_note,\n",
+ " f.quantity_normalized_raw AS raw_cleaned_quantity_text,\n",
+ " f.quantity_normalized AS cleaned_quantity_text,\n",
+ " f.quantity_ocr_cleanup_applied AS used_ocr_cleanup,\n",
+ " f.quantity_has_ocr_like_numeric_token AS has_ocr_like_token,\n",
+ " f.quantity_has_non_ascii_alpha AS has_non_ascii_token,\n",
+ " f.quantity_uses_new_multilingual_alias AS used_multilingual_alias,\n",
+ " f.mixed_resolution_system AS equivalence_system_used,\n",
+ " f.text_subtype AS parse_path,\n",
+ " b.quantity_status AS reference_status,\n",
+ " b.quantity_category AS reference_category,\n",
+ " b.quantity_note AS reference_note\n",
+ "FROM quantity_cleaned q\n",
+ "JOIN quantity_cleaning_features f USING (row_id)\n",
+ "LEFT JOIN quantity_cleaned_reference b USING (row_id);\n",
+ "'''\n",
+ "\n",
+ "con.execute(review_sql)\n",
+ "\n",
+ "show_query(\"Final cleaned schema\", \"DESCRIBE quantity_cleaned\")\n",
+ "show_query(\n",
+ " \"Final status counts\",\n",
+ " '''\n",
+ " SELECT quantity_status, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, quantity_status\n",
+ " '''\n",
+ ")\n",
+ "show_query(\n",
+ " \"Final category counts\",\n",
+ " '''\n",
+ " SELECT coalesce(quantity_category, 'null') AS quantity_category, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, quantity_category\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6b322aff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell shows real row examples from the final cleaned output view.\n",
+ "# It keeps the focus on the columns a downstream user actually needs.\n",
+ "\n",
+ "show_query(\n",
+ " \"Sample cleaned rows by status\",\n",
+ " '''\n",
+ " WITH ranked AS (\n",
+ " SELECT\n",
+ " quantity_status,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " quantity_category,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor,\n",
+ " quantity_conflict_flag,\n",
+ " quantity_note,\n",
+ " row_number() OVER (PARTITION BY quantity_status ORDER BY row_id) AS rn\n",
+ " FROM quantity_cleaned\n",
+ " )\n",
+ " SELECT\n",
+ " quantity_status,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " quantity_category,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor,\n",
+ " quantity_conflict_flag,\n",
+ " quantity_note\n",
+ " FROM ranked\n",
+ " WHERE rn <= 8\n",
+ " ORDER BY quantity_status, rn\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Sample rows where quantity text helped the final result\",\n",
+ " '''\n",
+ " SELECT\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " quantity_status,\n",
+ " quantity_category,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor,\n",
+ " quantity_note\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE quantity_note IN (\n",
+ " 'filled missing structure from quantity',\n",
+ " 'structured zero ignored; quantity text used',\n",
+ " 'derived total from quantity multipack',\n",
+ " 'count descriptor from quantity',\n",
+ " 'descriptor plus measure from quantity',\n",
+ " 'descriptor plus measure from quantity after OCR cleanup',\n",
+ " 'equivalent dual-unit text normalized',\n",
+ " 'equivalent dual-unit text normalized after OCR cleanup',\n",
+ " 'total size plus count metadata from quantity'\n",
+ " )\n",
+ " ORDER BY row_id\n",
+ " LIMIT 25\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3e8db6f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# This cell gives a compact QA summary for the most important rule families in the current cleaner.\n",
+ "\n",
+ "show_query(\n",
+ " \"QA summary\",\n",
+ " '''\n",
+ " SELECT 'rows_with_conflicts' AS metric, COUNT(*) AS value\n",
+ " FROM quantity_cleaned\n",
+ " WHERE quantity_status = 'conflict'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_dual_unit_equivalence', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'dual_unit_equivalent' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_embedded_plain_multipack', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'embedded_plain_multipack' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_embedded_descriptor_multipack', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'embedded_descriptor_multipack' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_compound_imperial_equivalence', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'compound_imperial_equivalent' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_total_plus_count', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'total_plus_count' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_from_total_plus_inner_multipack', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'total_plus_inner_multipack' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_after_safe_ocr_cleanup', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE used_ocr_cleanup AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_using_approved_multilingual_aliases', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE used_multilingual_alias\n",
+ " UNION ALL\n",
+ " SELECT 'rows_resolved_with_imperial_fl_oz_match', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE equivalence_system_used = 'imperial' AND quantity_status = 'resolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_left_unresolved_as_household_units', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE quantity_category = 'household_unit'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_left_unresolved_as_contextual_mixed', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'contextual_mixed_unresolved'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_with_unsupported_foreign_unit_reason', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'unsupported_foreign_unit'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_split_into_mixed_conflict', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'mixed_conflict'\n",
+ " UNION ALL\n",
+ " SELECT 'rows_split_into_mixed_unresolved', COUNT(*)\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE parse_path = 'mixed_unresolved'\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d8ddd666",
+ "metadata": {},
+ "source": [
+ "## Final Table EDA\n",
+ "\n",
+ "The earlier cells built the final cleaned view and the optional debug helpers. This section explores what the current cleaner is actually doing in practice:\n",
+ "\n",
+ "- how many rows are `resolved`, `partial`, `unresolved`, or `conflict`\n",
+ "- how the final categories are distributed\n",
+ "- which `quantity_note` messages and parse paths are actually being used\n",
+ "- how often the cleaned numeric/unit fields are populated\n",
+ "- what unresolved and conflict rows still look like after cleaning\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3019d06",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell gives the high-level shape of the final cleaned table.\n",
+ "# It shows:\n",
+ "# - output column coverage\n",
+ "# - final status counts\n",
+ "# - final category counts\n",
+ "# - how status and category combine together\n",
+ "\n",
+ "show_query(\n",
+ " \"Final table output coverage\",\n",
+ " '''\n",
+ " SELECT 'quantity_status' AS field,\n",
+ " COUNT(*) FILTER (WHERE quantity_status IS NOT NULL) AS non_null_count,\n",
+ " COUNT(DISTINCT quantity_status) FILTER (WHERE quantity_status IS NOT NULL) AS distinct_non_null_count\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'quantity_category',\n",
+ " COUNT(*) FILTER (WHERE quantity_category IS NOT NULL),\n",
+ " COUNT(DISTINCT quantity_category) FILTER (WHERE quantity_category IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'normalized_value',\n",
+ " COUNT(*) FILTER (WHERE normalized_value IS NOT NULL),\n",
+ " COUNT(DISTINCT normalized_value) FILTER (WHERE normalized_value IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'normalized_unit',\n",
+ " COUNT(*) FILTER (WHERE normalized_unit IS NOT NULL),\n",
+ " COUNT(DISTINCT normalized_unit) FILTER (WHERE normalized_unit IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'pack_count',\n",
+ " COUNT(*) FILTER (WHERE pack_count IS NOT NULL),\n",
+ " COUNT(DISTINCT pack_count) FILTER (WHERE pack_count IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'item_descriptor',\n",
+ " COUNT(*) FILTER (WHERE item_descriptor IS NOT NULL),\n",
+ " COUNT(DISTINCT item_descriptor) FILTER (WHERE item_descriptor IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'quantity_conflict_flag',\n",
+ " COUNT(*) FILTER (WHERE quantity_conflict_flag IS NOT NULL),\n",
+ " COUNT(DISTINCT quantity_conflict_flag) FILTER (WHERE quantity_conflict_flag IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " UNION ALL\n",
+ " SELECT 'quantity_note',\n",
+ " COUNT(*) FILTER (WHERE quantity_note IS NOT NULL),\n",
+ " COUNT(DISTINCT quantity_note) FILTER (WHERE quantity_note IS NOT NULL)\n",
+ " FROM quantity_cleaned\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Final status counts\",\n",
+ " '''\n",
+ " SELECT quantity_status, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, quantity_status\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Final category counts\",\n",
+ " '''\n",
+ " SELECT COALESCE(quantity_category, 'null') AS quantity_category, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, quantity_category\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Status by category\",\n",
+ " '''\n",
+ " SELECT quantity_status,\n",
+ " COALESCE(quantity_category, 'null') AS quantity_category,\n",
+ " COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1, 2\n",
+ " ORDER BY quantity_status, rows DESC, quantity_category\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f11eae03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell focuses on the explanation columns because they show how the cleaner reached its result.\n",
+ "# It answers:\n",
+ "# - how many distinct final messages we actually ended up using\n",
+ "# - which messages are most common\n",
+ "# - which internal parse paths are most common in the debug view\n",
+ "\n",
+ "show_query(\n",
+ " \"quantity_note distinct-count summary\",\n",
+ " '''\n",
+ " SELECT COUNT(DISTINCT quantity_note) AS distinct_quantity_note_messages\n",
+ " FROM quantity_cleaned\n",
+ " WHERE quantity_note IS NOT NULL\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"quantity_note counts\",\n",
+ " '''\n",
+ " SELECT quantity_note, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, quantity_note\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"quantity_note by final status and category\",\n",
+ " '''\n",
+ " SELECT quantity_note,\n",
+ " quantity_status,\n",
+ " COALESCE(quantity_category, 'null') AS quantity_category,\n",
+ " COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1, 2, 3\n",
+ " ORDER BY quantity_note, rows DESC, quantity_status, quantity_category\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"parse_path counts\",\n",
+ " '''\n",
+ " SELECT parse_path, COUNT(*) AS rows\n",
+ " FROM quantity_cleaning_debug\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, parse_path\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8de8a6cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell explores the structured outputs that downstream users would actually consume.\n",
+ "# It shows:\n",
+ "# - which normalized units are most common\n",
+ "# - which descriptors are being used\n",
+ "# - how pack_count behaves across categories\n",
+ "\n",
+ "show_query(\n",
+ " \"normalized_unit counts\",\n",
+ " '''\n",
+ " SELECT COALESCE(normalized_unit, 'null') AS normalized_unit, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, normalized_unit\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"item_descriptor counts\",\n",
+ " '''\n",
+ " SELECT COALESCE(item_descriptor, 'null') AS item_descriptor, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, item_descriptor\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"pack_count profile by category\",\n",
+ " '''\n",
+ " SELECT COALESCE(quantity_category, 'null') AS quantity_category,\n",
+ " COUNT(*) FILTER (WHERE pack_count IS NOT NULL) AS rows_with_pack_count,\n",
+ " COUNT(*) FILTER (WHERE pack_count = 1) AS rows_with_pack_count_1,\n",
+ " COUNT(*) FILTER (WHERE pack_count > 1) AS rows_with_pack_count_gt_1,\n",
+ " COUNT(DISTINCT pack_count) FILTER (WHERE pack_count IS NOT NULL) AS distinct_pack_count_values\n",
+ " FROM quantity_cleaned\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows_with_pack_count DESC, quantity_category\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Most common pack_count values\",\n",
+ " '''\n",
+ " SELECT pack_count, COUNT(*) AS rows\n",
+ " FROM quantity_cleaned\n",
+ " WHERE pack_count IS NOT NULL\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, pack_count\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "61230411",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# This cell drills into the rows that still need attention after cleaning.\n",
+ "# The note counts read `quantity_cleaned`; the row samples read the debug view so the cleaned text\n",
+ "# and parse path can be inspected next to the final columns.\n",
+ "\n",
+ "# Rows per final note, restricted to the unresolved and conflict statuses.\n",
+ "show_query(\n",
+ "    \"Unresolved and conflict note counts\",\n",
+ "    '''\n",
+ "    SELECT quantity_status,\n",
+ "           quantity_note,\n",
+ "           COUNT(*) AS rows\n",
+ "    FROM quantity_cleaned\n",
+ "    WHERE quantity_status IN ('unresolved', 'conflict')\n",
+ "    GROUP BY 1, 2\n",
+ "    ORDER BY quantity_status, rows DESC, quantity_note\n",
+ "    '''\n",
+ ")\n",
+ "\n",
+ "# Conflict sample in row_id order, capped at 25 rows so the display stays a sample instead of\n",
+ "# listing every conflict row.\n",
+ "show_query(\n",
+ "    \"Sample conflict rows\",\n",
+ "    '''\n",
+ "    SELECT\n",
+ "        code,\n",
+ "        product_quantity_unit,\n",
+ "        product_quantity,\n",
+ "        quantity,\n",
+ "        cleaned_quantity_text,\n",
+ "        equivalence_system_used,\n",
+ "        parse_path,\n",
+ "        quantity_category,\n",
+ "        quantity_conflict_flag,\n",
+ "        quantity_note\n",
+ "    FROM quantity_cleaning_debug\n",
+ "    WHERE quantity_status = 'conflict'\n",
+ "    ORDER BY row_id\n",
+ "    LIMIT 25\n",
+ "    '''\n",
+ ")\n",
+ "\n",
+ "# Partial sample in row_id order, capped at 15 rows.\n",
+ "show_query(\n",
+ "    \"Sample partial rows\",\n",
+ "    '''\n",
+ "    SELECT\n",
+ "        code,\n",
+ "        product_quantity_unit,\n",
+ "        product_quantity,\n",
+ "        quantity,\n",
+ "        cleaned_quantity_text,\n",
+ "        parse_path,\n",
+ "        quantity_category,\n",
+ "        normalized_value,\n",
+ "        normalized_unit,\n",
+ "        pack_count,\n",
+ "        item_descriptor,\n",
+ "        quantity_note\n",
+ "    FROM quantity_cleaning_debug\n",
+ "    WHERE quantity_status = 'partial'\n",
+ "    ORDER BY row_id\n",
+ "    LIMIT 15\n",
+ "    '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bdb3d6a6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# This cell performs the dataset-driven alias-mining pass from the rows that are still unresolved.\n",
+ "# It does not auto-approve anything. It only surfaces frequent unknown tokens for manual review.\n",
+ "\n",
+ "show_query(\n",
+ " \"Alias mining candidates from unresolved numeric-like rows\",\n",
+ " f'''\n",
+ " WITH candidate_rows AS (\n",
+ " SELECT row_id, cleaned_quantity_text\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE quantity_status = 'unresolved'\n",
+ " AND cleaned_quantity_text IS NOT NULL\n",
+ " AND (regexp_matches(cleaned_quantity_text, '\\\\d') OR parse_path IN ('mixed_unresolved', 'unsupported_foreign_unit', 'ocr_like_numeric_token'))\n",
+ " ),\n",
+ " stripped AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " regexp_replace(cleaned_quantity_text, '{MEASURE_ANY_REGEX}', ' ', 'g') AS no_measures\n",
+ " FROM candidate_rows\n",
+ " ),\n",
+ " tokenized AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " unnest(\n",
+ " string_split(\n",
+ " trim(\n",
+ " regexp_replace(\n",
+ " regexp_replace(\n",
+ " regexp_replace(no_measures, '{NUMBER_PATTERN}', ' ', 'g'),\n",
+ " '[^a-z]+',\n",
+ " ' ',\n",
+ " 'g'\n",
+ " ),\n",
+ " '\\\\s+',\n",
+ " ' ',\n",
+ " 'g'\n",
+ " )\n",
+ " ),\n",
+ " ' '\n",
+ " )\n",
+ " ) AS token\n",
+ " FROM stripped\n",
+ " )\n",
+ " SELECT token, COUNT(*) AS rows\n",
+ " FROM tokenized\n",
+ " WHERE token <> ''\n",
+ " AND token NOT IN (SELECT token FROM measure_aliases)\n",
+ " AND token NOT IN (SELECT token FROM descriptor_aliases)\n",
+ " AND token NOT IN (SELECT token FROM household_aliases)\n",
+ " AND token NOT IN ({ALIAS_MINING_STOPWORDS_SQL_LIST})\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows DESC, token\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Sample unresolved rows for top alias candidates\",\n",
+ " f'''\n",
+ " WITH candidate_rows AS (\n",
+ " SELECT row_id, cleaned_quantity_text\n",
+ " FROM quantity_cleaning_debug\n",
+ " WHERE quantity_status = 'unresolved'\n",
+ " AND cleaned_quantity_text IS NOT NULL\n",
+ " AND (regexp_matches(cleaned_quantity_text, '\\\\d') OR parse_path IN ('mixed_unresolved', 'unsupported_foreign_unit', 'ocr_like_numeric_token'))\n",
+ " ),\n",
+ " stripped AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " regexp_replace(cleaned_quantity_text, '{MEASURE_ANY_REGEX}', ' ', 'g') AS no_measures\n",
+ " FROM candidate_rows\n",
+ " ),\n",
+ " tokenized AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " unnest(\n",
+ " string_split(\n",
+ " trim(\n",
+ " regexp_replace(\n",
+ " regexp_replace(\n",
+ " regexp_replace(no_measures, '{NUMBER_PATTERN}', ' ', 'g'),\n",
+ " '[^a-z]+',\n",
+ " ' ',\n",
+ " 'g'\n",
+ " ),\n",
+ " '\\\\s+',\n",
+ " ' ',\n",
+ " 'g'\n",
+ " )\n",
+ " ),\n",
+ " ' '\n",
+ " )\n",
+ " ) AS token\n",
+ " FROM stripped\n",
+ " ),\n",
+ " top_tokens AS (\n",
+ " SELECT token\n",
+ " FROM tokenized\n",
+ " WHERE token <> ''\n",
+ " AND token NOT IN (SELECT token FROM measure_aliases)\n",
+ " AND token NOT IN (SELECT token FROM descriptor_aliases)\n",
+ " AND token NOT IN (SELECT token FROM household_aliases)\n",
+ " AND token NOT IN ({ALIAS_MINING_STOPWORDS_SQL_LIST})\n",
+ " GROUP BY 1\n",
+ " ORDER BY COUNT(*) DESC, token\n",
+ " LIMIT 10\n",
+ " ),\n",
+ " ranked AS (\n",
+ " SELECT\n",
+ " t.token,\n",
+ " r.code,\n",
+ " r.quantity,\n",
+ " r.cleaned_quantity_text,\n",
+ " r.quantity_note,\n",
+ " row_number() OVER (PARTITION BY t.token ORDER BY r.row_id) AS rn\n",
+ " FROM top_tokens t\n",
+ " JOIN quantity_cleaning_debug r\n",
+ " ON contains(' ' || r.cleaned_quantity_text || ' ', ' ' || t.token || ' ')\n",
+ " WHERE r.quantity_status = 'unresolved'\n",
+ " )\n",
+ " SELECT token, code, quantity, cleaned_quantity_text, quantity_note\n",
+ " FROM ranked\n",
+ " WHERE rn <= 5\n",
+ " ORDER BY token, rn\n",
+ " '''\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a530beb4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# This cell audits how the current cleaner differs from the internal reference snapshot.\n",
+ "# It is optional review material, not a second final output table.\n",
+ "\n",
+ "con.execute(\n",
+ " '''\n",
+ " CREATE OR REPLACE TEMP VIEW quantity_change_audit AS\n",
+ " WITH joined AS (\n",
+ " SELECT\n",
+ " row_id,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " raw_cleaned_quantity_text,\n",
+ " cleaned_quantity_text,\n",
+ " reference_status,\n",
+ " reference_category,\n",
+ " reference_note,\n",
+ " quantity_status,\n",
+ " quantity_category,\n",
+ " quantity_note,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor,\n",
+ " parse_path,\n",
+ " used_ocr_cleanup,\n",
+ " used_multilingual_alias,\n",
+ " equivalence_system_used\n",
+ " FROM quantity_cleaning_debug\n",
+ " )\n",
+ " SELECT 'dual_unit_equivalence' AS rule_name, * FROM joined WHERE parse_path = 'dual_unit_equivalent'\n",
+ " UNION ALL\n",
+ " SELECT 'embedded_plain_multipack', * FROM joined WHERE parse_path = 'embedded_plain_multipack'\n",
+ " UNION ALL\n",
+ " SELECT 'embedded_descriptor_multipack', * FROM joined WHERE parse_path = 'embedded_descriptor_multipack'\n",
+ " UNION ALL\n",
+ " SELECT 'compound_imperial_equivalence', * FROM joined WHERE parse_path = 'compound_imperial_equivalent'\n",
+ " UNION ALL\n",
+ " SELECT 'total_plus_count', * FROM joined WHERE parse_path = 'total_plus_count'\n",
+ " UNION ALL\n",
+ " SELECT 'total_plus_inner_multipack', * FROM joined WHERE parse_path = 'total_plus_inner_multipack'\n",
+ " UNION ALL\n",
+ " SELECT 'descriptor_multipack', * FROM joined WHERE parse_path = 'descriptor_multipack'\n",
+ " UNION ALL\n",
+ " SELECT 'contextual_mixed_unresolved', * FROM joined WHERE parse_path = 'contextual_mixed_unresolved'\n",
+ " UNION ALL\n",
+ " SELECT 'mixed_measure_conflict_split', * FROM joined WHERE parse_path = 'mixed_conflict'\n",
+ " UNION ALL\n",
+ " SELECT 'mixed_measure_unresolved_split', * FROM joined WHERE parse_path = 'mixed_unresolved'\n",
+ " UNION ALL\n",
+ " SELECT 'safe_ocr_cleanup', * FROM joined WHERE used_ocr_cleanup\n",
+ " UNION ALL\n",
+ " SELECT 'approved_multilingual_alias', * FROM joined WHERE used_multilingual_alias\n",
+ " UNION ALL\n",
+ " SELECT 'explicit_household_reason', * FROM joined WHERE parse_path = 'household_unit';\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Change-audit summary\",\n",
+ " '''\n",
+ " SELECT\n",
+ " rule_name,\n",
+ " COUNT(*) AS rows_in_rule,\n",
+ " COUNT(*) FILTER (WHERE reference_status IS DISTINCT FROM quantity_status) AS changed_rows,\n",
+ " COUNT(*) FILTER (WHERE reference_status = 'unresolved' AND quantity_status = 'resolved') AS unresolved_to_resolved,\n",
+ " COUNT(*) FILTER (WHERE reference_status = 'conflict' AND quantity_status = 'resolved') AS conflict_to_resolved,\n",
+ " COUNT(*) FILTER (WHERE reference_status = 'unresolved' AND quantity_status = 'conflict') AS unresolved_to_conflict\n",
+ " FROM quantity_change_audit\n",
+ " GROUP BY 1\n",
+ " ORDER BY rows_in_rule DESC, rule_name\n",
+ " '''\n",
+ ")\n",
+ "\n",
+ "show_query(\n",
+ " \"Change-audit samples\",\n",
+ " '''\n",
+ " WITH ranked AS (\n",
+ " SELECT\n",
+ " rule_name,\n",
+ " reference_status,\n",
+ " quantity_status,\n",
+ " parse_path,\n",
+ " reference_note,\n",
+ " quantity_note,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " cleaned_quantity_text,\n",
+ " quantity_category,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor,\n",
+ " equivalence_system_used,\n",
+ " row_number() OVER (PARTITION BY rule_name ORDER BY row_id) AS rn\n",
+ " FROM quantity_change_audit\n",
+ " )\n",
+ " SELECT\n",
+ " rule_name,\n",
+ " reference_status,\n",
+ " quantity_status,\n",
+ " parse_path,\n",
+ " reference_note,\n",
+ " quantity_note,\n",
+ " code,\n",
+ " product_quantity_unit,\n",
+ " product_quantity,\n",
+ " quantity,\n",
+ " cleaned_quantity_text,\n",
+ " equivalence_system_used,\n",
+ " quantity_category,\n",
+ " normalized_value,\n",
+ " normalized_unit,\n",
+ " pack_count,\n",
+ " item_descriptor\n",
+ " FROM ranked\n",
+ " WHERE rn <= 6\n",
+ " ORDER BY rule_name, rn\n",
+ " '''\n",
+ ")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/pipeline/quantity_exploration.ipynb b/pipeline/quantity_exploration.ipynb
new file mode 100644
index 0000000..7c689b3
--- /dev/null
+++ b/pipeline/quantity_exploration.ipynb
@@ -0,0 +1,3177 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "overview",
+ "metadata": {},
+ "source": [
+ "# Quantity Field Exploration\n",
+ "\n",
+ "This notebook explores the three Open Food Facts Canada quantity-related fields:\n",
+ "\n",
+ "- `product_quantity_unit`\n",
+ "- `product_quantity`\n",
+ "- `quantity`\n",
+ "\n",
+ "This notebook is intentionally DuckDB-first. The parquet loading, field profiling, regex pattern classification, cross-field consistency checks, and anomaly surfacing are all pushed into DuckDB SQL as much as possible."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "setup-note",
+ "metadata": {},
+ "source": [
+ "## 1. Setup\n",
+ "\n",
+ "This cell imports DuckDB and a few lightweight Python helpers. The data processing will happen in DuckDB, while Python is only used to assemble reusable SQL snippets and display query results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "70520de3",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting duckdb\n",
+ " Downloading duckdb-1.5.3-cp310-cp310-win_amd64.whl (13.1 MB)\n",
+ "Installing collected packages: duckdb\n",
+ "Successfully installed duckdb-1.5.3\n",
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING: You are using pip version 21.2.3; however, version 26.1.2 is available.\n",
+ "You should consider upgrading via the 'c:\\Users\\Sanjay H\\AppData\\Local\\Programs\\Python\\Python310\\python.exe -m pip install --upgrade pip' command.\n"
+ ]
+ }
+ ],
+ "source": [
+ "pip install duckdb"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "setup",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import DuckDB for the actual exploration work.\n",
+ "import duckdb\n",
+ "import re\n",
+ "from pathlib import Path\n",
+ "\n",
+ "# Import display so query results render nicely inside the notebook.\n",
+ "from IPython.display import display\n",
+ "\n",
+ "# These are the OFF Canada fields we are exploring in this notebook.\n",
+ "DATA_COLUMNS = [\"product_quantity_unit\", \"product_quantity\", \"quantity\"]\n",
+ "\n",
+ "# Surface very large numeric quantities separately because they can hide the real distribution.\n",
+ "LARGE_QUANTITY_THRESHOLD = 100000.0\n",
+ "\n",
+ "# Try both notebook-relative locations so the notebook works from repo root or the pipeline folder.\n",
+ "PARQUET_CANDIDATES = [\n",
+ " Path(\"data/raw/off-canada.parquet\"),\n",
+ " Path(\"../data/raw/off-canada.parquet\"),\n",
+ "]\n",
+ "\n",
+ "# Map raw unit spellings to a canonical base unit, quantity type, and conversion factor.\n",
+ "UNIT_ALIASES = [\n",
+ " (\"g\", \"g\", \"mass\", 1.0),\n",
+ " (\"gram\", \"g\", \"mass\", 1.0),\n",
+ " (\"grams\", \"g\", \"mass\", 1.0),\n",
+ " (\"gramme\", \"g\", \"mass\", 1.0),\n",
+ " (\"grammes\", \"g\", \"mass\", 1.0),\n",
+ " (\"kg\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilo\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilos\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilogram\", \"g\", \"mass\", 1000.0),\n",
+ " (\"kilograms\", \"g\", \"mass\", 1000.0),\n",
+ " (\"mg\", \"g\", \"mass\", 0.001),\n",
+ " (\"ml\", \"ml\", \"volume\", 1.0),\n",
+ " (\"m l\", \"ml\", \"volume\", 1.0),\n",
+ " (\"l\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"liter\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"liters\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"litre\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"litres\", \"ml\", \"volume\", 1000.0),\n",
+ " (\"cl\", \"ml\", \"volume\", 10.0),\n",
+ " (\"oz\", \"g\", \"mass\", 28.349523125),\n",
+ " (\"lb\", \"g\", \"mass\", 453.59237),\n",
+ " (\"lbs\", \"g\", \"mass\", 453.59237),\n",
+ " (\"fl oz\", \"ml\", \"volume\", 29.5735),\n",
+ " (\"pc\", \"count\", \"count\", 1.0),\n",
+ " (\"pcs\", \"count\", \"count\", 1.0),\n",
+ " (\"piece\", \"count\", \"count\", 1.0),\n",
+ " (\"pieces\", \"count\", \"count\", 1.0),\n",
+ " (\"tablet\", \"count\", \"count\", 1.0),\n",
+ " (\"tablets\", \"count\", \"count\", 1.0),\n",
+ " (\"caps\", \"count\", \"count\", 1.0),\n",
+ " (\"cap\", \"count\", \"count\", 1.0),\n",
+ " (\"capsule\", \"count\", \"count\", 1.0),\n",
+ " (\"capsules\", \"count\", \"count\", 1.0),\n",
+ " (\"egg\", \"count\", \"count\", 1.0),\n",
+ " (\"eggs\", \"count\", \"count\", 1.0),\n",
+ " (\"bag\", \"count\", \"count\", 1.0),\n",
+ " (\"bags\", \"count\", \"count\", 1.0),\n",
+ " (\"bar\", \"count\", \"count\", 1.0),\n",
+ " (\"bars\", \"count\", \"count\", 1.0),\n",
+ " (\"barre\", \"count\", \"count\", 1.0),\n",
+ " (\"barres\", \"count\", \"count\", 1.0),\n",
+ " (\"portion\", \"count\", \"count\", 1.0),\n",
+ " (\"portions\", \"count\", \"count\", 1.0),\n",
+ " (\"can\", \"count\", \"count\", 1.0),\n",
+ " (\"cans\", \"count\", \"count\", 1.0),\n",
+ " (\"mint\", \"count\", \"count\", 1.0),\n",
+ " (\"mints\", \"count\", \"count\", 1.0),\n",
+ " (\"sachet\", \"count\", \"count\", 1.0),\n",
+ " (\"sachets\", \"count\", \"count\", 1.0),\n",
+ " (\"bottle\", \"count\", \"count\", 1.0),\n",
+ " (\"bottles\", \"count\", \"count\", 1.0),\n",
+ " (\"bun\", \"count\", \"count\", 1.0),\n",
+ " (\"buns\", \"count\", \"count\", 1.0),\n",
+ " (\"yaourt\", \"count\", \"count\", 1.0),\n",
+ " (\"yaourts\", \"count\", \"count\", 1.0),\n",
+ " (\"tranche\", \"count\", \"count\", 1.0),\n",
+ " (\"tranches\", \"count\", \"count\", 1.0),\n",
+ " (\"serving\", \"count\", \"count\", 1.0),\n",
+ " (\"servings\", \"count\", \"count\", 1.0),\n",
+ "]\n",
+ "\n",
+ "# These raw tokens usually indicate the free-text quantity is not directly useful yet.\n",
+ "PLACEHOLDER_REGEX = r\"(unknown|bonne|good|bn batouta)\"\n",
+ "\n",
+ "# These count-like words help us separate measure rows from count-descriptor rows.\n",
+ "COUNT_DESCRIPTOR_REGEX = r\"\\b(count|pc|pcs|piece|pieces|tablet|tablets|caps|cap|capsule|capsules|egg|eggs|bag|bags|bar|bars|barre|barres|portion|portions|can|cans|mint|mints|sachet|sachets|serving|servings)\\b\"\n",
+ "\n",
+ "# Ignore these leftover tokens when surfacing unresolved words.\n",
+ "UNRESOLVED_TOKEN_STOPWORDS = [\n",
+ " \"and\", \"de\", \"des\", \"du\", \"en\", \"et\", \"in\", \"net\", \"wt\",\n",
+ " \"quot\", \"the\", \"of\", \"pour\", \"double\"\n",
+ "]\n",
+ "\n",
+ "\n",
+ "# Resolve the parquet path once so the notebook can run from multiple working directories.\n",
+ "def resolve_parquet_path(candidates):\n",
+ " for candidate in candidates:\n",
+ " if candidate.exists():\n",
+ " return candidate\n",
+ " raise FileNotFoundError(f\"Could not find OFF parquet. Tried: {candidates}\")\n",
+ "\n",
+ "\n",
+ "# Escape a Python string so it is safe to embed inside SQL string literals.\n",
+ "def sql_string(value):\n",
+ " return \"'\" + str(value).replace(\"'\", \"''\") + \"'\"\n",
+ "\n",
+ "\n",
+ "# Build one regex pattern covering all currently recognized unit tokens.\n",
+ "def build_unit_pattern(unit_aliases):\n",
+ " tokens = [token for token, _, _, _ in unit_aliases]\n",
+ " return \"(?:\" + \"|\".join(re.escape(token).replace(r\"\\ \", r\"\\s*\") for token in sorted(tokens, key=len, reverse=True)) + \")\"\n",
+ "\n",
+ "\n",
+ "# Build a reusable SQL CASE expression that maps a raw token to its canonical unit metadata.\n",
+ "def build_unit_case(expr, target):\n",
+ " target_index = {\"base_unit\": 1, \"quantity_type\": 2, \"factor\": 3}[target]\n",
+ " lines = [\"CASE\"]\n",
+ " for token, base_unit, quantity_type, factor in UNIT_ALIASES:\n",
+ " mapped_value = [token, base_unit, quantity_type, factor][target_index]\n",
+ " sql_value = str(mapped_value) if target == \"factor\" else sql_string(mapped_value)\n",
+ " lines.append(f\" WHEN {expr} = {sql_string(token)} THEN {sql_value}\")\n",
+ " lines.append(\" ELSE NULL\")\n",
+ " lines.append(\"END\")\n",
+ " return \"\\n\".join(lines)\n",
+ "\n",
+ "\n",
+ "# Small helper so every query cell can render a titled result table consistently.\n",
+ "def show_query(title, query):\n",
+ " print(title)\n",
+ " display(con.sql(query).df())\n",
+ "\n",
+ "\n",
+ "UNIT_PATTERN = build_unit_pattern(UNIT_ALIASES)\n",
+ "SIMPLE_MEASURE_REGEX = rf\"^([0-9]+(?:[.,][0-9]+)?)\\s*({UNIT_PATTERN})$\"\n",
+ "MULTIPACK_REGEX = rf\"\\b(\\d+)\\s*[*x]\\s*([0-9]+(?:[.,][0-9]+)?)\\s*({UNIT_PATTERN})\\b\"\n",
+ "GENERAL_UNIT_REGEX = rf\"({UNIT_PATTERN})\"\n",
+ "\n",
+ "RECOGNIZED_UNIT_TOKENS_SQL = \", \".join(sql_string(token) for token, _, _, _ in UNIT_ALIASES)\n",
+ "UNRESOLVED_STOPWORDS_SQL = \", \".join(sql_string(token) for token in UNRESOLVED_TOKEN_STOPWORDS)\n",
+ "\n",
+ "# Create the in-memory DuckDB connection once for the whole notebook.\n",
+ "con = duckdb.connect()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "load-note",
+ "metadata": {},
+ "source": [
+ "## 2. Load The Quantity Fields Into DuckDB\n",
+ "\n",
+ "This cell reads the parquet through DuckDB and creates two views:\n",
+ "\n",
+ "- `quantity_raw`: the three original OFF quantity fields only\n",
+ "- `quantity_features`: a DuckDB-derived profiling view with normalized text, parsed values, canonical units, and pattern labels\n",
+ "\n",
+ "The notebook will query these views directly instead of pushing the analysis into pandas."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "load-and-derive",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Source parquet: ..\\data\\raw\\off-canada.parquet\n",
+ "Raw schema\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " column_name | \n",
+ " column_type | \n",
+ " null | \n",
+ " key | \n",
+ " default | \n",
+ " extra | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " product_quantity_unit | \n",
+ " VARCHAR | \n",
+ " YES | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " product_quantity | \n",
+ " VARCHAR | \n",
+ " YES | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " quantity | \n",
+ " VARCHAR | \n",
+ " YES | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " column_name column_type null key default extra\n",
+ "0 product_quantity_unit VARCHAR YES None None None\n",
+ "1 product_quantity VARCHAR YES None None None\n",
+ "2 quantity VARCHAR YES None None None"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Row count\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " row_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 114453 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " row_count\n",
+ "0 114453"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample rows\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity_unit | \n",
+ " product_quantity | \n",
+ " quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " ml | \n",
+ " 946 | \n",
+ " 946 ml | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " None | \n",
+ " 118 | \n",
+ " 118 ml | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " None | \n",
+ " None | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " None | \n",
+ " 235 | \n",
+ " 235 g | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 50 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " g | \n",
+ " 50 | \n",
+ " 50g | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " g | \n",
+ " 192 | \n",
+ " 192 g | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " ml | \n",
+ " 480 | \n",
+ " 480 mL | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity_unit product_quantity quantity\n",
+ "0 ml 946 946 ml\n",
+ "1 None 118 118 ml\n",
+ "2 None None \n",
+ "3 None 235 235 g\n",
+ "4 None 0 50\n",
+ "5 g 50 50g\n",
+ "6 g 192 192 g\n",
+ "7 None None None\n",
+ "8 None None None\n",
+ "9 ml 480 480 mL"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Resolve the parquet path and expose only the three relevant fields as a DuckDB view.\n",
+ "PARQUET_PATH = resolve_parquet_path(PARQUET_CANDIDATES)\n",
+ "PARQUET_SQL_PATH = PARQUET_PATH.as_posix().replace(\"'\", \"''\")\n",
+ "\n",
+ "con.execute(\n",
+ " f\"\"\"\n",
+ " CREATE OR REPLACE TEMP VIEW quantity_raw AS\n",
+ " SELECT {', '.join(DATA_COLUMNS)}\n",
+ " FROM read_parquet('{PARQUET_SQL_PATH}')\n",
+ " \"\"\"\n",
+ ")\n",
+ "\n",
+ "# Build reusable SQL CASE expressions for unit normalization.\n",
+ "PRODUCT_UNIT_BASE_CASE = build_unit_case(\"product_quantity_unit_normalized\", \"base_unit\")\n",
+ "PRODUCT_UNIT_TYPE_CASE = build_unit_case(\"product_quantity_unit_normalized\", \"quantity_type\")\n",
+ "SIMPLE_UNIT_BASE_CASE = build_unit_case(\"simple_unit_token\", \"base_unit\")\n",
+ "SIMPLE_UNIT_TYPE_CASE = build_unit_case(\"simple_unit_token\", \"quantity_type\")\n",
+ "SIMPLE_UNIT_FACTOR_CASE = build_unit_case(\"simple_unit_token\", \"factor\")\n",
+ "PACK_UNIT_BASE_CASE = build_unit_case(\"pack_unit_token\", \"base_unit\")\n",
+ "PACK_UNIT_TYPE_CASE = build_unit_case(\"pack_unit_token\", \"quantity_type\")\n",
+ "PACK_UNIT_FACTOR_CASE = build_unit_case(\"pack_unit_token\", \"factor\")\n",
+ "FIRST_UNIT_BASE_CASE = build_unit_case(\"first_unit_token\", \"base_unit\")\n",
+ "FIRST_UNIT_TYPE_CASE = build_unit_case(\"first_unit_token\", \"quantity_type\")\n",
+ "\n",
+ "# Create one derived DuckDB view that holds the parsed and normalized exploration features.\n",
+ "quantity_features_sql = f\"\"\"\n",
+ "CREATE OR REPLACE TEMP VIEW quantity_features AS\n",
+ "WITH base AS (\n",
+ " SELECT\n",
+ " product_quantity_unit,\n",
+ " lower(trim(product_quantity_unit)) AS product_quantity_unit_normalized,\n",
+ " product_quantity,\n",
+ " TRY_CAST(product_quantity AS DOUBLE) AS product_quantity_numeric,\n",
+ " quantity,\n",
+ " lower(trim(quantity)) AS quantity_normalized\n",
+ " FROM quantity_raw\n",
+ "),\n",
+ "parsed AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{SIMPLE_MEASURE_REGEX}', 1), '') AS simple_value_text,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{SIMPLE_MEASURE_REGEX}', 2), '') AS simple_unit_token,\n",
+ " TRY_CAST(replace(NULLIF(regexp_extract(quantity_normalized, '{SIMPLE_MEASURE_REGEX}', 1), ''), ',', '.') AS DOUBLE) AS simple_value_raw,\n",
+ " TRY_CAST(NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_REGEX}', 1), '') AS DOUBLE) AS pack_count,\n",
+ " TRY_CAST(replace(NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_REGEX}', 2), ''), ',', '.') AS DOUBLE) AS pack_value_raw,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{MULTIPACK_REGEX}', 3), '') AS pack_unit_token,\n",
+ " NULLIF(regexp_extract(quantity_normalized, '{GENERAL_UNIT_REGEX}', 1), '') AS first_unit_token\n",
+ " FROM base\n",
+ "),\n",
+ "normalized AS (\n",
+ " SELECT\n",
+ " *,\n",
+ " {PRODUCT_UNIT_BASE_CASE} AS product_quantity_unit_base,\n",
+ " {PRODUCT_UNIT_TYPE_CASE} AS product_quantity_unit_type,\n",
+ " {SIMPLE_UNIT_BASE_CASE} AS simple_base_unit,\n",
+ " {SIMPLE_UNIT_TYPE_CASE} AS simple_quantity_type,\n",
+ " {SIMPLE_UNIT_FACTOR_CASE} AS simple_factor,\n",
+ " CASE\n",
+ " WHEN simple_value_raw IS NOT NULL AND {SIMPLE_UNIT_FACTOR_CASE} IS NOT NULL\n",
+ " THEN simple_value_raw * {SIMPLE_UNIT_FACTOR_CASE}\n",
+ " ELSE NULL\n",
+ " END AS simple_normalized_value,\n",
+ " {PACK_UNIT_BASE_CASE} AS pack_base_unit,\n",
+ " {PACK_UNIT_TYPE_CASE} AS pack_quantity_type,\n",
+ " {PACK_UNIT_FACTOR_CASE} AS pack_factor,\n",
+ " CASE\n",
+ " WHEN pack_count IS NOT NULL AND pack_value_raw IS NOT NULL AND {PACK_UNIT_FACTOR_CASE} IS NOT NULL\n",
+ " THEN pack_count * pack_value_raw * {PACK_UNIT_FACTOR_CASE}\n",
+ " ELSE NULL\n",
+ " END AS pack_normalized_total_value,\n",
+ " {FIRST_UNIT_BASE_CASE} AS first_unit_base,\n",
+ " {FIRST_UNIT_TYPE_CASE} AS first_quantity_type\n",
+ " FROM parsed\n",
+ ")\n",
+ "SELECT\n",
+ " *,\n",
+ " CASE\n",
+ " WHEN quantity IS NULL THEN 'null'\n",
+ " WHEN quantity_normalized = '' THEN 'blank'\n",
+ " WHEN regexp_matches(quantity_normalized, '{PLACEHOLDER_REGEX}') THEN 'placeholder_or_unknown'\n",
+ " WHEN simple_value_raw IS NOT NULL AND simple_factor IS NOT NULL THEN 'simple_measure'\n",
+ " WHEN pack_count IS NOT NULL AND pack_value_raw IS NOT NULL AND pack_factor IS NOT NULL THEN 'multipack_measure'\n",
+ " WHEN regexp_matches(quantity_normalized, '^[0-9]+(?:[.,][0-9]+)?$') THEN 'number_only'\n",
+ " WHEN regexp_matches(quantity_normalized, '\\\\b\\\\d+\\\\s*[*x]\\\\s*\\\\d+') THEN 'multipack_unparsed'\n",
+ " WHEN regexp_matches(quantity_normalized, '[()/|]') AND first_unit_token IS NOT NULL THEN 'mixed_measure_expression'\n",
+ " WHEN first_unit_token IS NOT NULL AND regexp_matches(quantity_normalized, '\\\\d') THEN 'text_with_measure'\n",
+ " WHEN regexp_matches(quantity_normalized, '{COUNT_DESCRIPTOR_REGEX}') THEN 'count_descriptor'\n",
+ " ELSE 'unparsed_text'\n",
+ " END AS quantity_pattern\n",
+ "FROM normalized\n",
+ "\"\"\"\n",
+ "\n",
+ "con.execute(quantity_features_sql)\n",
+ "\n",
+ "# Show the parquet source, row count, schema, and a few sample rows from the raw view.\n",
+ "print(f\"Source parquet: {PARQUET_PATH}\")\n",
+ "show_query(\"Raw schema\", \"DESCRIBE quantity_raw\")\n",
+ "show_query(\"Row count\", \"SELECT COUNT(*) AS row_count FROM quantity_raw\")\n",
+ "show_query(\"Sample rows\", \"SELECT * FROM quantity_raw LIMIT 10\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "coverage-note",
+ "metadata": {},
+ "source": [
+ "## 3. Field Coverage Profile\n",
+ "\n",
+ "This cell gives the completeness picture for all three fields: null counts, blank-string counts, non-blank counts, and distinct-value counts. Because the work is in DuckDB, these numbers come directly from SQL aggregations over the parquet-backed view."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "coverage",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Coverage summary\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " field | \n",
+ " row_count | \n",
+ " null_count | \n",
+ " non_null_count | \n",
+ " blank_string_count | \n",
+ " non_blank_count | \n",
+ " distinct_non_blank_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " product_quantity_unit | \n",
+ " 114453 | \n",
+ " 98638 | \n",
+ " 15815 | \n",
+ " 0 | \n",
+ " 15815 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " product_quantity | \n",
+ " 114453 | \n",
+ " 93022 | \n",
+ " 21431 | \n",
+ " 0 | \n",
+ " 21431 | \n",
+ " 1652 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " quantity | \n",
+ " 114453 | \n",
+ " 90285 | \n",
+ " 24168 | \n",
+ " 2416 | \n",
+ " 21752 | \n",
+ " 4362 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " field row_count null_count non_null_count blank_string_count non_blank_count distinct_non_blank_count\n",
+ "0 product_quantity_unit 114453 98638 15815 0 15815 3\n",
+ "1 product_quantity 114453 93022 21431 0 21431 1652\n",
+ "2 quantity 114453 90285 24168 2416 21752 4362"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Compare the three quantity fields side by side using one SQL query.\n",
+ "coverage_query = \"\"\"\n",
+ "SELECT 'product_quantity_unit' AS field,\n",
+ " COUNT(*) AS row_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity_unit IS NULL) AS null_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity_unit IS NOT NULL) AS non_null_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity_unit IS NOT NULL AND trim(product_quantity_unit) = '') AS blank_string_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity_unit IS NOT NULL AND trim(product_quantity_unit) <> '') AS non_blank_count,\n",
+ " COUNT(DISTINCT product_quantity_unit) FILTER (WHERE product_quantity_unit IS NOT NULL AND trim(product_quantity_unit) <> '') AS distinct_non_blank_count\n",
+ "FROM quantity_raw\n",
+ "UNION ALL\n",
+ "SELECT 'product_quantity' AS field,\n",
+ " COUNT(*) AS row_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity IS NULL) AS null_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity IS NOT NULL) AS non_null_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity IS NOT NULL AND trim(product_quantity) = '') AS blank_string_count,\n",
+ " COUNT(*) FILTER (WHERE product_quantity IS NOT NULL AND trim(product_quantity) <> '') AS non_blank_count,\n",
+ " COUNT(DISTINCT product_quantity) FILTER (WHERE product_quantity IS NOT NULL AND trim(product_quantity) <> '') AS distinct_non_blank_count\n",
+ "FROM quantity_raw\n",
+ "UNION ALL\n",
+ "SELECT 'quantity' AS field,\n",
+ " COUNT(*) AS row_count,\n",
+ " COUNT(*) FILTER (WHERE quantity IS NULL) AS null_count,\n",
+ " COUNT(*) FILTER (WHERE quantity IS NOT NULL) AS non_null_count,\n",
+ " COUNT(*) FILTER (WHERE quantity IS NOT NULL AND trim(quantity) = '') AS blank_string_count,\n",
+ " COUNT(*) FILTER (WHERE quantity IS NOT NULL AND trim(quantity) <> '') AS non_blank_count,\n",
+ " COUNT(DISTINCT quantity) FILTER (WHERE quantity IS NOT NULL AND trim(quantity) <> '') AS distinct_non_blank_count\n",
+ "FROM quantity_raw\n",
+ "\"\"\"\n",
+ "\n",
+ "show_query(\"Coverage summary\", coverage_query)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "unit-note",
+ "metadata": {},
+ "source": [
+ "## 4. Explore `product_quantity_unit`\n",
+ "\n",
+ "This cell focuses only on the normalized unit field. We want to know how sparse it is, which values dominate it, and whether any unexpected unit tokens are present."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "unit-exploration",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top raw product_quantity_unit values\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " value | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " <NA> | \n",
+ " 98638 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " g | \n",
+ " 11813 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " ml | \n",
+ " 4001 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " kj | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value count\n",
+ "0 98638\n",
+ "1 g 11813\n",
+ "2 ml 4001\n",
+ "3 kj 1"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Canonical base unit counts from product_quantity_unit\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " canonical_base_unit | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " g | \n",
+ " 11813 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " ml | \n",
+ " 4001 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " __unrecognized__ | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " canonical_base_unit count\n",
+ "0 g 11813\n",
+ "1 ml 4001\n",
+ "2 __unrecognized__ 1"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Unrecognized product_quantity_unit values\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " unrecognized_unit | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " kj | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " unrecognized_unit\n",
+ "0 kj"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Show the most common raw values in product_quantity_unit.\n",
+    "# NULLs are labelled '<NA>' so they stay distinct from genuinely empty strings.\n",
+    "show_query(\n",
+    "    \"Top raw product_quantity_unit values\",\n",
+    "    \"\"\"\n",
+    "    SELECT COALESCE(product_quantity_unit, '<NA>') AS value, COUNT(*) AS count\n",
+    "    FROM quantity_raw\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, value\n",
+    "    LIMIT 20\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Map the raw unit tokens to canonical base units so we can see the real unit distribution.\n",
+    "# Only non-blank raw units are counted; those with a NULL product_quantity_unit_base\n",
+    "# are grouped under the '__unrecognized__' label.\n",
+    "show_query(\n",
+    "    \"Canonical base unit counts from product_quantity_unit\",\n",
+    "    \"\"\"\n",
+    "    SELECT COALESCE(product_quantity_unit_base, '__unrecognized__') AS canonical_base_unit,\n",
+    "           COUNT(*) AS count\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_unit IS NOT NULL AND trim(product_quantity_unit) <> ''\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, canonical_base_unit\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Surface any raw unit tokens that we are not recognizing yet.\n",
+    "# This lists the distinct raw values behind the '__unrecognized__' bucket above.\n",
+    "show_query(\n",
+    "    \"Unrecognized product_quantity_unit values\",\n",
+    "    \"\"\"\n",
+    "    SELECT DISTINCT product_quantity_unit AS unrecognized_unit\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_unit IS NOT NULL\n",
+    "      AND trim(product_quantity_unit) <> ''\n",
+    "      AND product_quantity_unit_base IS NULL\n",
+    "    ORDER BY unrecognized_unit\n",
+    "    \"\"\"\n",
+    ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "product-quantity-note",
+ "metadata": {},
+ "source": [
+ "## 5. Explore `product_quantity`\n",
+ "\n",
+ "This cell profiles the numeric quantity field. The query checks how often it is populated, whether it is numeric when present, how often zero appears, what the typical range looks like, and whether a few extreme rows distort the mean."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "product-quantity-exploration",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Numeric profile for product_quantity\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " non_null_count | \n",
+ " numeric_cast_count | \n",
+ " numeric_cast_failure_count | \n",
+ " zero_count | \n",
+ " negative_count | \n",
+ " positive_count | \n",
+ " integer_like_count | \n",
+ " fractional_count | \n",
+ " min_value | \n",
+ " positive_min_value | \n",
+ " p5 | \n",
+ " median | \n",
+ " mean | \n",
+ " trimmed_mean_excluding_top_1_percent | \n",
+ " p95 | \n",
+ " p99 | \n",
+ " max_value | \n",
+ " rows_above_100000 | \n",
+ " rows_above_p99 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 21431 | \n",
+ " 21431 | \n",
+ " 0 | \n",
+ " 738 | \n",
+ " 0 | \n",
+ " 20693 | \n",
+ " 20609 | \n",
+ " 822 | \n",
+ " 0.0 | \n",
+ " 0.000062 | \n",
+ " 24.0 | \n",
+ " 354.0 | \n",
+ " 4.666138e+13 | \n",
+ " 484.149003 | \n",
+ " 1750.0 | \n",
+ " 4056.0 | \n",
+ " 1.000000e+18 | \n",
+ " 3 | \n",
+ " 215 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " non_null_count numeric_cast_count numeric_cast_failure_count zero_count negative_count positive_count integer_like_count fractional_count \\\n",
+ "0 21431 21431 0 738 0 20693 20609 822 \n",
+ "\n",
+ " min_value positive_min_value p5 median mean trimmed_mean_excluding_top_1_percent p95 p99 max_value rows_above_100000 \\\n",
+ "0 0.0 0.000062 24.0 354.0 4.666138e+13 484.149003 1750.0 4056.0 1.000000e+18 3 \n",
+ "\n",
+ " rows_above_p99 \n",
+ "0 215 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top raw product_quantity values\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " value | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " <NA> | \n",
+ " 93022 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 500 | \n",
+ " 725 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 1000 | \n",
+ " 572 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 518 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 400 | \n",
+ " 452 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 200 | \n",
+ " 417 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 100 | \n",
+ " 409 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 300 | \n",
+ " 390 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 250 | \n",
+ " 375 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 454 | \n",
+ " 278 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 500.0 | \n",
+ " 265 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 2000 | \n",
+ " 222 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 0.0 | \n",
+ " 220 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 1000.0 | \n",
+ " 216 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 473 | \n",
+ " 210 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 355 | \n",
+ " 208 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 150 | \n",
+ " 207 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 750 | \n",
+ " 207 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 340 | \n",
+ " 196 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 450 | \n",
+ " 196 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value count\n",
+ "0 93022\n",
+ "1 500 725\n",
+ "2 1000 572\n",
+ "3 0 518\n",
+ "4 400 452\n",
+ "5 200 417\n",
+ "6 100 409\n",
+ "7 300 390\n",
+ "8 250 375\n",
+ "9 454 278\n",
+ "10 500.0 265\n",
+ "11 2000 222\n",
+ "12 0.0 220\n",
+ "13 1000.0 216\n",
+ "14 473 210\n",
+ "15 355 208\n",
+ "16 150 207\n",
+ "17 750 207\n",
+ "18 340 196\n",
+ "19 450 196"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Large product_quantity examples\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity | \n",
+ " quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 999999999999999999 | \n",
+ " 999999999999999999 g | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 1250000 | \n",
+ " 1250 kg | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 400000 | \n",
+ " 1 400 kg | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity quantity\n",
+ "0 999999999999999999 999999999999999999 g\n",
+ "1 1250000 1250 kg\n",
+ "2 400000 1 400 kg"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Build a robust numeric profile directly in SQL.\n",
+    "# The outer query intentionally has no product_quantity_numeric IS NOT NULL filter:\n",
+    "# such a filter removes failed casts before aggregation, which would force\n",
+    "# numeric_cast_failure_count to 0 and make non_null_count equal numeric_cast_count.\n",
+    "# Every other aggregate ignores NULL inputs or fails its FILTER comparison on NULL,\n",
+    "# so the remaining columns are unaffected by keeping those rows.\n",
+    "# NOTE(review): the rows_above_100000 alias hard-codes the threshold value; keep it in\n",
+    "# sync with LARGE_QUANTITY_THRESHOLD (defined outside this cell).\n",
+    "product_quantity_profile_query = f\"\"\"\n",
+    "    WITH numeric_base AS (\n",
+    "        SELECT product_quantity,\n",
+    "               product_quantity_numeric\n",
+    "        FROM quantity_features\n",
+    "    ),\n",
+    "    thresholds AS (\n",
+    "        SELECT quantile_cont(product_quantity_numeric, 0.99) AS p99\n",
+    "        FROM numeric_base\n",
+    "        WHERE product_quantity_numeric IS NOT NULL\n",
+    "    )\n",
+    "    SELECT\n",
+    "        COUNT(*) FILTER (WHERE product_quantity IS NOT NULL) AS non_null_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric IS NOT NULL) AS numeric_cast_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity IS NOT NULL AND product_quantity_numeric IS NULL) AS numeric_cast_failure_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric = 0) AS zero_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric < 0) AS negative_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric > 0) AS positive_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric IS NOT NULL AND product_quantity_numeric = floor(product_quantity_numeric)) AS integer_like_count,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric IS NOT NULL AND product_quantity_numeric <> floor(product_quantity_numeric)) AS fractional_count,\n",
+    "        MIN(product_quantity_numeric) AS min_value,\n",
+    "        MIN(product_quantity_numeric) FILTER (WHERE product_quantity_numeric > 0) AS positive_min_value,\n",
+    "        quantile_cont(product_quantity_numeric, 0.05) AS p5,\n",
+    "        median(product_quantity_numeric) AS median,\n",
+    "        AVG(product_quantity_numeric) AS mean,\n",
+    "        AVG(product_quantity_numeric) FILTER (WHERE product_quantity_numeric <= p99) AS trimmed_mean_excluding_top_1_percent,\n",
+    "        quantile_cont(product_quantity_numeric, 0.95) AS p95,\n",
+    "        MAX(p99) AS p99,\n",
+    "        MAX(product_quantity_numeric) AS max_value,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric > {LARGE_QUANTITY_THRESHOLD}) AS rows_above_100000,\n",
+    "        COUNT(*) FILTER (WHERE product_quantity_numeric > p99) AS rows_above_p99\n",
+    "    FROM numeric_base\n",
+    "    CROSS JOIN thresholds\n",
+    "\"\"\"\n",
+    "\n",
+    "show_query(\"Numeric profile for product_quantity\", product_quantity_profile_query)\n",
+    "\n",
+    "# Show the most common raw product_quantity values because repeated sizes and repeated zeros matter later.\n",
+    "# NULLs are labelled '<NA>' so they stay distinct from genuinely empty strings.\n",
+    "show_query(\n",
+    "    \"Top raw product_quantity values\",\n",
+    "    \"\"\"\n",
+    "    SELECT COALESCE(product_quantity, '<NA>') AS value, COUNT(*) AS count\n",
+    "    FROM quantity_raw\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, value\n",
+    "    LIMIT 20\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Surface the largest rows explicitly so extreme outliers do not stay hidden in the summary stats.\n",
+    "show_query(\n",
+    "    \"Large product_quantity examples\",\n",
+    "    f\"\"\"\n",
+    "    SELECT product_quantity, quantity\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric > {LARGE_QUANTITY_THRESHOLD}\n",
+    "    ORDER BY product_quantity_numeric DESC\n",
+    "    \"\"\"\n",
+    ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "quantity-note",
+ "metadata": {},
+ "source": [
+ "## 6. Explore `quantity`\n",
+ "\n",
+ "This cell profiles the free-text quantity field. The derived view has already labeled each row into one pattern bucket, so here we can inspect the variation directly from SQL: top raw values, pattern counts, implied base units, and example rows by pattern."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "quantity-exploration",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top raw quantity values\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " value | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " <NA> | \n",
+ " 90285 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " | \n",
+ " 2416 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 500 g | \n",
+ " 421 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 100 g | \n",
+ " 419 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 200 g | \n",
+ " 367 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 400 g | \n",
+ " 337 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 300 g | \n",
+ " 316 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 454 g | \n",
+ " 306 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 1 kg | \n",
+ " 280 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 250 g | \n",
+ " 234 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 355 mL | \n",
+ " 180 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 473 mL | \n",
+ " 180 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 150 g | \n",
+ " 178 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 1 | \n",
+ " 177 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 340 g | \n",
+ " 177 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 500 mL | \n",
+ " 173 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 120 g | \n",
+ " 170 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 1 L | \n",
+ " 169 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 600 g | \n",
+ " 167 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 50 g | \n",
+ " 164 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value count\n",
+ "0 90285\n",
+ "1 2416\n",
+ "2 500 g 421\n",
+ "3 100 g 419\n",
+ "4 200 g 367\n",
+ "5 400 g 337\n",
+ "6 300 g 316\n",
+ "7 454 g 306\n",
+ "8 1 kg 280\n",
+ "9 250 g 234\n",
+ "10 355 mL 180\n",
+ "11 473 mL 180\n",
+ "12 150 g 178\n",
+ "13 1 177\n",
+ "14 340 g 177\n",
+ "15 500 mL 173\n",
+ "16 120 g 170\n",
+ "17 1 L 169\n",
+ "18 600 g 167\n",
+ "19 50 g 164"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Quantity pattern counts\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quantity_pattern | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " null | \n",
+ " 90285 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " simple_measure | \n",
+ " 17373 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " multipack_measure | \n",
+ " 2734 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " blank | \n",
+ " 2416 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " number_only | \n",
+ " 577 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " text_with_measure | \n",
+ " 569 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " mixed_measure_expression | \n",
+ " 284 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " unparsed_text | \n",
+ " 191 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " multipack_unparsed | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " placeholder_or_unknown | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " count_descriptor | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quantity_pattern count\n",
+ "0 null 90285\n",
+ "1 simple_measure 17373\n",
+ "2 multipack_measure 2734\n",
+ "3 blank 2416\n",
+ "4 number_only 577\n",
+ "5 text_with_measure 569\n",
+ "6 mixed_measure_expression 284\n",
+ "7 unparsed_text 191\n",
+ "8 multipack_unparsed 12\n",
+ "9 placeholder_or_unknown 9\n",
+ "10 count_descriptor 3"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Recognized base units from quantity\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " recognized_base_unit | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " <NA> | \n",
+ " 93438 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " g | \n",
+ " 15785 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " ml | \n",
+ " 4932 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " count | \n",
+ " 298 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " recognized_base_unit count\n",
+ "0 93438\n",
+ "1 g 15785\n",
+ "2 ml 4932\n",
+ "3 count 298"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample quantity values by pattern\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quantity_pattern | \n",
+ " quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " blank | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " blank | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " blank | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " blank | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " blank | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 87 | \n",
+ " unparsed_text | \n",
+ " 068488 075279 | \n",
+ "
\n",
+ " \n",
+ " | 88 | \n",
+ " unparsed_text | \n",
+ " 1 23 i puł | \n",
+ "
\n",
+ " \n",
+ " | 89 | \n",
+ " unparsed_text | \n",
+ " 1 Dry Pint | \n",
+ "
\n",
+ " \n",
+ " | 90 | \n",
+ " unparsed_text | \n",
+ " 1 EA | \n",
+ "
\n",
+ " \n",
+ " | 91 | \n",
+ " unparsed_text | \n",
+ " 1 Tbsp | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
92 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quantity_pattern quantity\n",
+ "0 blank \n",
+ "1 blank \n",
+ "2 blank \n",
+ "3 blank \n",
+ "4 blank \n",
+ ".. ... ...\n",
+ "87 unparsed_text 068488 075279\n",
+ "88 unparsed_text 1 23 i puł\n",
+ "89 unparsed_text 1 Dry Pint\n",
+ "90 unparsed_text 1 EA\n",
+ "91 unparsed_text 1 Tbsp\n",
+ "\n",
+ "[92 rows x 2 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Show the most common raw values in the free-text quantity field.\n",
+    "# NULLs are labelled '<NA>' so they are counted separately from blank strings;\n",
+    "# coalescing to '' would merge the two groups into a single bucket.\n",
+    "show_query(\n",
+    "    \"Top raw quantity values\",\n",
+    "    \"\"\"\n",
+    "    SELECT COALESCE(quantity, '<NA>') AS value, COUNT(*) AS count\n",
+    "    FROM quantity_raw\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, value\n",
+    "    LIMIT 20\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Count how many rows fall into each derived pattern bucket.\n",
+    "show_query(\n",
+    "    \"Quantity pattern counts\",\n",
+    "    \"\"\"\n",
+    "    SELECT quantity_pattern, COUNT(*) AS count\n",
+    "    FROM quantity_features\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, quantity_pattern\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Count the implied canonical base units coming from the free-text quantity field.\n",
+    "# Precedence is simple_base_unit, then pack_base_unit, then first_unit_base;\n",
+    "# rows where all three are NULL are labelled '<NA>'.\n",
+    "show_query(\n",
+    "    \"Recognized base units from quantity\",\n",
+    "    \"\"\"\n",
+    "    SELECT COALESCE(\n",
+    "               CASE\n",
+    "                   WHEN simple_base_unit IS NOT NULL THEN simple_base_unit\n",
+    "                   WHEN pack_base_unit IS NOT NULL THEN pack_base_unit\n",
+    "                   ELSE first_unit_base\n",
+    "               END,\n",
+    "               '<NA>'\n",
+    "           ) AS recognized_base_unit,\n",
+    "           COUNT(*) AS count\n",
+    "    FROM quantity_features\n",
+    "    GROUP BY 1\n",
+    "    ORDER BY count DESC, recognized_base_unit\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Keep a few example rows per pattern so the variation is visible without scanning the whole dataset.\n",
+    "# Up to 10 rows are kept per pattern, taken in ascending order of the raw quantity string.\n",
+    "show_query(\n",
+    "    \"Sample quantity values by pattern\",\n",
+    "    \"\"\"\n",
+    "    WITH ranked AS (\n",
+    "        SELECT quantity_pattern,\n",
+    "               quantity,\n",
+    "               ROW_NUMBER() OVER (PARTITION BY quantity_pattern ORDER BY quantity) AS rn\n",
+    "        FROM quantity_features\n",
+    "        WHERE quantity IS NOT NULL\n",
+    "    )\n",
+    "    SELECT quantity_pattern, quantity\n",
+    "    FROM ranked\n",
+    "    WHERE rn <= 10\n",
+    "    ORDER BY quantity_pattern, rn\n",
+    "    \"\"\"\n",
+    ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cross-note",
+ "metadata": {},
+ "source": [
+ "## 7. Cross-Field Relationships\n",
+ "\n",
+ "This cell compares the three fields together. It shows which fields co-occur, how many missing-unit rows look recoverable from the free-text quantity, and how often the numeric quantity agrees with the parseable raw quantity strings."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "cross-field",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Presence combinations across the three fields\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity_unit_present | \n",
+ " product_quantity_present | \n",
+ " quantity_present | \n",
+ " rows | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 90285 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " True | \n",
+ " True | \n",
+ " True | \n",
+ " 15815 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " 5616 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " 2737 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity_unit_present product_quantity_present quantity_present rows\n",
+ "0 False False False 90285\n",
+ "1 True True True 15815\n",
+ "2 False True True 5616\n",
+ "3 False False True 2737"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cross-field summary metrics\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " metric | \n",
+ " value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " rows_with_blank_quantity_string | \n",
+ " 2416 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " rows_with_product_quantity_zero | \n",
+ " 738 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " rows_with_numeric_and_missing_unit | \n",
+ " 5616 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " rows_with_numeric_and_missing_unit_recoverable_from_quantity | \n",
+ " 5159 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " rows_with_only_quantity | \n",
+ " 321 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " rows_with_only_quantity_simple_or_multipack_parse | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " simple_or_multipack_rows | \n",
+ " 20107 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " simple_or_multipack_value_match_count | \n",
+ " 19851 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " simple_or_multipack_unit_match_count | \n",
+ " 15117 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " simple_or_multipack_value_mismatch_count | \n",
+ " 240 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " metric value\n",
+ "0 rows_with_blank_quantity_string 2416\n",
+ "1 rows_with_product_quantity_zero 738\n",
+ "2 rows_with_numeric_and_missing_unit 5616\n",
+ "3 rows_with_numeric_and_missing_unit_recoverable_from_quantity 5159\n",
+ "4 rows_with_only_quantity 321\n",
+ "5 rows_with_only_quantity_simple_or_multipack_parse 16\n",
+ "6 simple_or_multipack_rows 20107\n",
+ "7 simple_or_multipack_value_match_count 19851\n",
+ "8 simple_or_multipack_unit_match_count 15117\n",
+ "9 simple_or_multipack_value_mismatch_count 240"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Recoverable missing-unit examples\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity | \n",
+ " quantity | \n",
+ " inferred_base_unit | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 118 | \n",
+ " 118 ml | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 235 | \n",
+ " 235 g | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " 3 grammes | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0 | \n",
+ " 180 g | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0.0 | \n",
+ " 1pcs | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " 175 | \n",
+ " 175 grammes | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 96 | \n",
+ " 1560 | \n",
+ " 1.56 kg | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 97 | \n",
+ " 750 | \n",
+ " 750 g | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 98 | \n",
+ " 480 | \n",
+ " 4 x 120 g | \n",
+ " g | \n",
+ "
\n",
+ " \n",
+ " | 99 | \n",
+ " 162 | \n",
+ " 162ml | \n",
+ " ml | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity quantity inferred_base_unit\n",
+ "0 118 118 ml ml\n",
+ "1 235 235 g g\n",
+ "2 3 3 grammes g\n",
+ "3 0 180 g g\n",
+ "4 0.0 1pcs count\n",
+ ".. ... ... ...\n",
+ "95 175 175 grammes g\n",
+ "96 1560 1.56 kg g\n",
+ "97 750 750 g g\n",
+ "98 480 4 x 120 g g\n",
+ "99 162 162ml ml\n",
+ "\n",
+ "[100 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Simple or multipack value mismatch examples\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity_unit | \n",
+ " product_quantity | \n",
+ " quantity | \n",
+ " parsed_normalized_quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 180 g | \n",
+ " 180.00000 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 1pcs | \n",
+ " 1.00000 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 32 portions | \n",
+ " 32.00000 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " None | \n",
+ " 1359.9999999999998 | \n",
+ " 1.36kg | \n",
+ " 1360.00000 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 12 yaourts | \n",
+ " 12.00000 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 10 bags | \n",
+ " 10.00000 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " None | \n",
+ " 1890.0000000000002 | \n",
+ " 1.89 l | \n",
+ " 1890.00000 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 100 tablets | \n",
+ " 100.00000 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 1 can | \n",
+ " 1.00000 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 20 sachets | \n",
+ " 20.00000 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " None | \n",
+ " 3000 | \n",
+ " 3 lb | \n",
+ " 1360.77711 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 5pcs | \n",
+ " 5.00000 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 8 barres | \n",
+ " 8.00000 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 5 barres | \n",
+ " 5.00000 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " None | \n",
+ " 0.0 | \n",
+ " 24 bars | \n",
+ " 24.00000 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 5 barres | \n",
+ " 5.00000 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " None | \n",
+ " 1.6600000000000001 | \n",
+ " 1.66 g | \n",
+ " 1.66000 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 1 kilo | \n",
+ " 1000.00000 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " None | \n",
+ " 0 | \n",
+ " 1.81 kg | \n",
+ " 1810.00000 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " None | \n",
+ " 1890.0000000000002 | \n",
+ " 1,89L | \n",
+ " 1890.00000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity_unit product_quantity quantity parsed_normalized_quantity\n",
+ "0 None 0 180 g 180.00000\n",
+ "1 None 0.0 1pcs 1.00000\n",
+ "2 None 0.0 32 portions 32.00000\n",
+ "3 None 1359.9999999999998 1.36kg 1360.00000\n",
+ "4 None 0.0 12 yaourts 12.00000\n",
+ "5 None 0.0 10 bags 10.00000\n",
+ "6 None 1890.0000000000002 1.89 l 1890.00000\n",
+ "7 None 0 100 tablets 100.00000\n",
+ "8 None 0.0 1 can 1.00000\n",
+ "9 None 0 20 sachets 20.00000\n",
+ "10 None 3000 3 lb 1360.77711\n",
+ "11 None 0.0 5pcs 5.00000\n",
+ "12 None 0.0 8 barres 8.00000\n",
+ "13 None 0 5 barres 5.00000\n",
+ "14 None 0.0 24 bars 24.00000\n",
+ "15 None 0 5 barres 5.00000\n",
+ "16 None 1.6600000000000001 1.66 g 1.66000\n",
+ "17 None 0 1 kilo 1000.00000\n",
+ "18 None 0 1.81 kg 1810.00000\n",
+ "19 None 1890.0000000000002 1,89L 1890.00000"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+    "# Show the overlap pattern across the three quantity fields.\n",
+    "# Presence here means IS NOT NULL, so a blank quantity string still counts as present.\n",
+    "show_query(\n",
+    "    \"Presence combinations across the three fields\",\n",
+    "    \"\"\"\n",
+    "    SELECT\n",
+    "        product_quantity_unit IS NOT NULL AS product_quantity_unit_present,\n",
+    "        product_quantity IS NOT NULL AS product_quantity_present,\n",
+    "        quantity IS NOT NULL AS quantity_present,\n",
+    "        COUNT(*) AS rows\n",
+    "    FROM quantity_raw\n",
+    "    GROUP BY 1, 2, 3\n",
+    "    ORDER BY rows DESC\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Value comparisons use a small absolute tolerance instead of exact equality, because\n",
+    "# floating-point artifacts (for example 1359.9999999999998 against a parsed 1360) are\n",
+    "# not real disagreements and would otherwise be reported as mismatches.\n",
+    "VALUE_MATCH_TOLERANCE = 1e-6\n",
+    "parsed_value_sql = \"CAST(COALESCE(simple_normalized_value, pack_normalized_total_value) AS DOUBLE)\"\n",
+    "value_diff_sql = f\"abs(product_quantity_numeric - {parsed_value_sql})\"\n",
+    "\n",
+    "# Summarize the key relationship counts we need before defining cleaning rules.\n",
+    "# Each UNION ALL branch contributes exactly one (metric, value) row.\n",
+    "cross_field_summary_query = f\"\"\"\n",
+    "    SELECT 'rows_with_blank_quantity_string' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity IS NOT NULL AND quantity_normalized = ''\n",
+    "    UNION ALL\n",
+    "    SELECT 'rows_with_product_quantity_zero' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric = 0\n",
+    "    UNION ALL\n",
+    "    SELECT 'rows_with_numeric_and_missing_unit' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric IS NOT NULL\n",
+    "      AND (product_quantity_unit IS NULL OR trim(product_quantity_unit) = '')\n",
+    "    UNION ALL\n",
+    "    SELECT 'rows_with_numeric_and_missing_unit_recoverable_from_quantity' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric IS NOT NULL\n",
+    "      AND (product_quantity_unit IS NULL OR trim(product_quantity_unit) = '')\n",
+    "      AND (simple_base_unit IS NOT NULL OR pack_base_unit IS NOT NULL OR first_unit_base IS NOT NULL)\n",
+    "    UNION ALL\n",
+    "    SELECT 'rows_with_only_quantity' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric IS NULL\n",
+    "      AND quantity IS NOT NULL\n",
+    "      AND quantity_normalized <> ''\n",
+    "    UNION ALL\n",
+    "    SELECT 'rows_with_only_quantity_simple_or_multipack_parse' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric IS NULL\n",
+    "      AND quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "    UNION ALL\n",
+    "    SELECT 'simple_or_multipack_rows' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "    UNION ALL\n",
+    "    SELECT 'simple_or_multipack_value_match_count' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "      AND product_quantity_numeric IS NOT NULL\n",
+    "      AND {value_diff_sql} <= {VALUE_MATCH_TOLERANCE}\n",
+    "    UNION ALL\n",
+    "    SELECT 'simple_or_multipack_unit_match_count' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "      AND product_quantity_unit_base IS NOT NULL\n",
+    "      AND product_quantity_unit_base = COALESCE(simple_base_unit, pack_base_unit)\n",
+    "    UNION ALL\n",
+    "    SELECT 'simple_or_multipack_value_mismatch_count' AS metric,\n",
+    "           COUNT(*) AS value\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "      AND product_quantity_numeric IS NOT NULL\n",
+    "      AND {value_diff_sql} > {VALUE_MATCH_TOLERANCE}\n",
+    "\"\"\"\n",
+    "\n",
+    "show_query(\"Cross-field summary metrics\", cross_field_summary_query)\n",
+    "\n",
+    "# Show recoverable missing-unit rows so we can verify what a future backfill would look like.\n",
+    "# The explicit ORDER BY keeps the 100-row sample stable across re-runs.\n",
+    "show_query(\n",
+    "    \"Recoverable missing-unit examples\",\n",
+    "    \"\"\"\n",
+    "    SELECT product_quantity, quantity,\n",
+    "           COALESCE(simple_base_unit, pack_base_unit, first_unit_base) AS inferred_base_unit\n",
+    "    FROM quantity_features\n",
+    "    WHERE product_quantity_numeric IS NOT NULL\n",
+    "      AND (product_quantity_unit IS NULL OR trim(product_quantity_unit) = '')\n",
+    "      AND COALESCE(simple_base_unit, pack_base_unit, first_unit_base) IS NOT NULL\n",
+    "    ORDER BY quantity, product_quantity\n",
+    "    LIMIT 100\n",
+    "    \"\"\"\n",
+    ")\n",
+    "\n",
+    "# Show mismatches so later cleaning logic can treat them carefully instead of assuming all rows agree.\n",
+    "# Uses the same tolerance as the summary counts, and an explicit ORDER BY so the\n",
+    "# 20-row sample is stable across re-runs.\n",
+    "show_query(\n",
+    "    \"Simple or multipack value mismatch examples\",\n",
+    "    f\"\"\"\n",
+    "    SELECT product_quantity_unit,\n",
+    "           product_quantity,\n",
+    "           quantity,\n",
+    "           COALESCE(simple_normalized_value, pack_normalized_total_value) AS parsed_normalized_quantity\n",
+    "    FROM quantity_features\n",
+    "    WHERE quantity_pattern IN ('simple_measure', 'multipack_measure')\n",
+    "      AND product_quantity_numeric IS NOT NULL\n",
+    "      AND {value_diff_sql} > {VALUE_MATCH_TOLERANCE}\n",
+    "    ORDER BY quantity, product_quantity\n",
+    "    LIMIT 20\n",
+    "    \"\"\"\n",
+    ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "anomaly-note",
+ "metadata": {},
+ "source": [
+ "## 8. Anomalies And Unresolved Tokens\n",
+ "\n",
+ "This cell surfaces the rows and leftover raw text fragments that still need explicit cleaning rules later. The goal here is not to clean them yet, but to make sure the next step starts from concrete evidence instead of guesswork."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "anomalies",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Zero product_quantity examples\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " product_quantity | \n",
+ " quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 0 | \n",
+ " 50 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 0 | \n",
+ " 180 g | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 0.0 | \n",
+ " 1pcs | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 0.0 | \n",
+ " 6 Tablets / Tablillas | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 0.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 0.0 | \n",
+ " 32 portions | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 0.0 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 0.0 | \n",
+ " 450 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 0 | \n",
+ " Unknown Quantity | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 0.0 | \n",
+ " 12 yaourts | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 0.0 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 0 | \n",
+ " Unknown Quantity | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 0 | \n",
+ " 12 morceaux | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 0.0 | \n",
+ " 10 bags | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 0.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 0.0 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " product_quantity quantity\n",
+ "0 0 50\n",
+ "1 0 180 g\n",
+ "2 0.0 1pcs\n",
+ "3 0.0 6 Tablets / Tablillas\n",
+ "4 0 1 \n",
+ "5 0.0 1\n",
+ "6 0.0 32 portions\n",
+ "7 0 6\n",
+ "8 0.0 4\n",
+ "9 0.0 450\n",
+ "10 0 1\n",
+ "11 0 Unknown Quantity\n",
+ "12 0.0 12 yaourts\n",
+ "13 0.0 8\n",
+ "14 0 Unknown Quantity\n",
+ "15 0 12 morceaux\n",
+ "16 0.0 10 bags\n",
+ "17 0.0 6\n",
+ "18 0 1\n",
+ "19 0.0 12"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top unresolved tokens from hard-to-parse quantity rows\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " value | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " per | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " qty | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " comprim | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " morceaux | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " quantity | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " unknown | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " chips | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " dry | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " gr | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " pack | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " pint | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " pound | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " unit | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " varies | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " good | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " halal | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " original | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " pains | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " paquets | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " par | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " value count\n",
+ "0 per 9\n",
+ "1 qty 6\n",
+ "2 comprim 5\n",
+ "3 morceaux 5\n",
+ "4 quantity 5\n",
+ "5 unknown 5\n",
+ "6 chips 3\n",
+ "7 dry 3\n",
+ "8 gr 3\n",
+ "9 pack 3\n",
+ "10 pint 3\n",
+ "11 pound 3\n",
+ "12 unit 3\n",
+ "13 varies 3\n",
+ "14 good 2\n",
+ "15 halal 2\n",
+ "16 original 2\n",
+ "17 pains 2\n",
+ "18 paquets 2\n",
+ "19 par 2"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample unresolved quantity rows\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " quantity_pattern | \n",
+ " quantity | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " placeholder_or_unknown | \n",
+ " Unknown Quantity | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " placeholder_or_unknown | \n",
+ " Unknown Quantity | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " unparsed_text | \n",
+ " 12 morceaux | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " unparsed_text | \n",
+ " 1/2 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " unparsed_text | \n",
+ " 14 tasses | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " unparsed_text | \n",
+ " 1.13 k | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " unparsed_text | \n",
+ " 1 tsp | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " placeholder_or_unknown | \n",
+ " Bonne | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " placeholder_or_unknown | \n",
+ " bn batouta | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " placeholder_or_unknown | \n",
+ " Good | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " unparsed_text | \n",
+ " 1 boite | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " unparsed_text | \n",
+ " Oikos triple zéro yaourts par Danone | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " unparsed_text | \n",
+ " 90 + 30 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " unparsed_text | \n",
+ " 49 г | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " multipack_unparsed | \n",
+ " 5X2 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " count_descriptor | \n",
+ " Un sachet | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " unparsed_text | \n",
+ " 8 pains | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " unparsed_text | \n",
+ " 12 morceaux | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " unparsed_text | \n",
+ " 1/2 tasse | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " unparsed_text | \n",
+ " 455 m' | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " unparsed_text | \n",
+ " 12 biscuits | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " unparsed_text | \n",
+ " Que Pasa Tortilla Chips croustilles | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " unparsed_text | \n",
+ " 6 paquets | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " unparsed_text | \n",
+ " 6 par boîte | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " multipack_unparsed | \n",
+ " 12 * 4 paquets | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " unparsed_text | \n",
+ " 40 morceaux | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " unparsed_text | \n",
+ " 40 morceaux | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " multipack_unparsed | \n",
+ " 6x228 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " unparsed_text | \n",
+ " 5 pack | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " unparsed_text | \n",
+ " 250 г | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " quantity_pattern quantity\n",
+ "0 placeholder_or_unknown Unknown Quantity\n",
+ "1 placeholder_or_unknown Unknown Quantity\n",
+ "2 unparsed_text 12 morceaux\n",
+ "3 unparsed_text 1/2\n",
+ "4 unparsed_text 14 tasses\n",
+ "5 unparsed_text 1.13 k\n",
+ "6 unparsed_text 1 tsp\n",
+ "7 placeholder_or_unknown Bonne\n",
+ "8 placeholder_or_unknown bn batouta\n",
+ "9 placeholder_or_unknown Good \n",
+ "10 unparsed_text 1 boite\n",
+ "11 unparsed_text Oikos triple zéro yaourts par Danone\n",
+ "12 unparsed_text 90 + 30\n",
+ "13 unparsed_text 49 г\n",
+ "14 multipack_unparsed 5X2\n",
+ "15 count_descriptor Un sachet\n",
+ "16 unparsed_text 8 pains\n",
+ "17 unparsed_text 12 morceaux\n",
+ "18 unparsed_text 1/2 tasse\n",
+ "19 unparsed_text 455 m'\n",
+ "20 unparsed_text 12 biscuits\n",
+ "21 unparsed_text Que Pasa Tortilla Chips croustilles\n",
+ "22 unparsed_text 6 paquets\n",
+ "23 unparsed_text 6 par boîte\n",
+ "24 multipack_unparsed 12 * 4 paquets\n",
+ "25 unparsed_text 40 morceaux\n",
+ "26 unparsed_text 40 morceaux\n",
+ "27 multipack_unparsed 6x228\n",
+ "28 unparsed_text 5 pack\n",
+ "29 unparsed_text 250 г"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Quantity patterns treated as unresolved. Defined once so the token query and the sample-row query\n",
+ "# below always filter on exactly the same set.\n",
+ "anomaly_unresolved_patterns_sql = \"'unparsed_text', 'multipack_unparsed', 'placeholder_or_unknown', 'count_descriptor'\"\n",
+ "\n",
+ "# Tokenize on runs of Unicode letters instead of the ASCII-only class '[a-z]+'. The ASCII class cuts a\n",
+ "# word at the first accented or non-Latin letter, which is the likely source of truncated fragments such\n",
+ "# as 'comprim' in the token table. Kept as a raw string and interpolated so the braces in \\p{L} do not\n",
+ "# clash with f-string placeholder syntax.\n",
+ "anomaly_token_regex = r\"\\p{L}+\"\n",
+ "\n",
+ "# Show zero-valued product_quantity rows with their raw quantity text.\n",
+ "# NOTE(review): LIMIT without ORDER BY returns an arbitrary subset, so the rows shown can differ between runs.\n",
+ "show_query(\n",
+ " \"Zero product_quantity examples\",\n",
+ " \"\"\"\n",
+ " SELECT product_quantity, quantity\n",
+ " FROM quantity_features\n",
+ " WHERE product_quantity_numeric = 0\n",
+ " AND quantity IS NOT NULL\n",
+ " LIMIT 20\n",
+ " \"\"\"\n",
+ ")\n",
+ "\n",
+ "# Surface the most common leftover tokens from unresolved quantity strings.\n",
+ "# Single-character tokens, recognized unit tokens, and stopwords are excluded from the ranking.\n",
+ "show_query(\n",
+ " \"Top unresolved tokens from hard-to-parse quantity rows\",\n",
+ " f\"\"\"\n",
+ " WITH tokens AS (\n",
+ " SELECT token\n",
+ " FROM quantity_features,\n",
+ " UNNEST(regexp_extract_all(COALESCE(quantity_normalized, ''), '{anomaly_token_regex}')) AS token_table(token)\n",
+ " WHERE quantity_pattern IN ({anomaly_unresolved_patterns_sql})\n",
+ " )\n",
+ " SELECT token AS value, COUNT(*) AS count\n",
+ " FROM tokens\n",
+ " WHERE length(token) > 1\n",
+ " AND token NOT IN ({RECOGNIZED_UNIT_TOKENS_SQL})\n",
+ " AND token NOT IN ({UNRESOLVED_STOPWORDS_SQL})\n",
+ " GROUP BY 1\n",
+ " ORDER BY count DESC, value\n",
+ " LIMIT 20\n",
+ " \"\"\"\n",
+ ")\n",
+ "\n",
+ "# Show unresolved quantity rows directly so later cleaning rules can be based on real examples.\n",
+ "# NOTE(review): this sample is also an unordered LIMIT, so it is not stable across runs.\n",
+ "show_query(\n",
+ " \"Sample unresolved quantity rows\",\n",
+ " f\"\"\"\n",
+ " SELECT quantity_pattern, quantity\n",
+ " FROM quantity_features\n",
+ " WHERE quantity_pattern IN ({anomaly_unresolved_patterns_sql})\n",
+ " LIMIT 30\n",
+ " \"\"\"\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ready-note",
+ "metadata": {},
+ "source": [
+ "## 9. Ready For The Cleaning Stage\n",
+ "\n",
+ "After running the notebook, we should have the main information needed for the next stage:\n",
+ "\n",
+ "- completeness and sparsity of each field\n",
+ "- actual unit vocabulary and unrecognized unit outliers\n",
+ "- numeric behavior and outliers in `product_quantity`\n",
+ "- raw text patterns and unresolved variants in `quantity`\n",
+ "- overlap and consistency across all three fields\n",
+ "- real examples that will drive the cleaning rules later\n",
+ "\n",
+ "The next step can now focus on cleaning design instead of first-pass discovery."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}