diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6092c40..0dc6afd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -32,6 +32,9 @@ jobs:
           python -m pip install --upgrade pip
           pip install ".[aimodel,huggingface]" --group test
 
+      - name: Build licenseid database
+        run: licenseid update
+
       - name: Run tests
         run: |
           pytest tests/ -v --tb=short
diff --git a/docs/implementation/license-pipeline.md b/docs/implementation/license-pipeline.md
index cf213d8..edf5ea7 100644
--- a/docs/implementation/license-pipeline.md
+++ b/docs/implementation/license-pipeline.md
@@ -134,7 +134,13 @@ relationships.
 
 1. **Card YAML** -- reads `license:` from the model card frontmatter. If
    the value is not a vague sentinel (`other`, `custom`, `proprietary`,
-   `unknown`, `unlicensed`), it is accepted as-is and stored in
+   `unknown`, `unlicensed`), it is passed through `_canonicalize_license_id()`,
+   which calls `AggregatedLicenseMatcher.match(license_id=raw)` from the
+   `licenseid` library for a direct database lookup. Recognised SPDX
+   License IDs are returned in canonical casing (e.g. `"apache-2.0"` →
+   `"Apache-2.0"`). Values not recognised — proprietary or non-SPDX
+   identifiers such as `"gemma"`, `"llama3.2"`, or deprecated bare
+   copyleft forms — are returned verbatim. The result is stored in
    `AiModelMetadata.license`.
 2. **File detection** -- when the card YAML value is absent or vague,
    `_detect_license_from_hf_files()` iterates through candidate files in
@@ -148,15 +154,18 @@ relationships.
 ### `licenseid` dependency
 
 Text-based licence detection (`detect_license_from_text()` in
-`_license.py`) relies on the optional `licenseid` package. When the
-package is not installed or its database has not been built, detection
-is silently skipped and the function returns `None`. To enable it:
+`_license.py`) uses the `licenseid` package, which is a mandatory
+pitloom dependency. The database must be built before detection is
+possible:
 
 ```shell
-pip install pitloom[license]
 licenseid update
 ```
 
+When the database has not been built, `detect_license_from_text()`
+logs a warning and returns `None`; card YAML values are stored verbatim
+(not canonicalised), and `CITATION.cff` / `codemeta.json` are unaffected.
+
 The database is stored at
 `~/.local/share/licenseid/licenses.db`. Detection uses cosine similarity
 against vectorised licence texts with a default threshold of 0.85.
diff --git a/pyproject.toml b/pyproject.toml
index 682a9b7..449d318 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ classifiers = [
 ]
 dependencies = [
     "hatchling>=1.28.0",
+    "licenseid>=0.2.3",
     "pyproject-metadata>=0.11.0",
     "rfc8785>=0.1.4",
     "spdx-python-model==0.0.4",
@@ -63,7 +64,6 @@ aimodel = [
     "safetensors[numpy]>=0.7.0",
 ]
 huggingface = ["huggingface_hub>=1.14.0"]
-license = ["licenseid>=0.2.2"]
 
 fasttext = ["fasttext>=0.9.3"]                # or fasttext-community>=0.11.7
 gguf = ["gguf>=0.10.0"]
@@ -124,6 +124,7 @@ exclude = [
     "tests/fixtures/aimodels/safetensors/*.safetensors",
     "tests/fixtures/croissant/*.json",
     "tests/fixtures/fragments/*.json",
+    "tests/fixtures/huggingface-hub/*.txt",
 ]
 
 [tool.hatch.build.targets.wheel]
diff --git a/src/pitloom/assemble/spdx3/deps.py b/src/pitloom/assemble/spdx3/deps.py
index 2e81d0a..6230c82 100644
--- a/src/pitloom/assemble/spdx3/deps.py
+++ b/src/pitloom/assemble/spdx3/deps.py
@@ -111,12 +111,12 @@ def _enrich_from_installed(
     provenance_source = f"Source: installed metadata | Package: {dep_name}"
 
     # description
-    summary = pkg_meta["Summary"] or ""
+    summary = pkg_meta.get("Summary") or ""
     if summary and summary != "UNKNOWN":
         dep_package.description = summary
 
     # homePage -- core field first, then well-known Project-URL labels
-    home_page = pkg_meta["Home-page"] or ""
+    home_page = pkg_meta.get("Home-page") or ""
     if not home_page or home_page == "UNKNOWN":
         for label in _HOMEPAGE_LABELS:
             if label in project_urls:
@@ -126,7 +126,7 @@ def _enrich_from_installed(
         dep_package.software_homePage = home_page
 
     # downloadLocation -- core field first, then well-known Project-URL labels
-    download_url = pkg_meta["Download-URL"] or ""
+    download_url = pkg_meta.get("Download-URL") or ""
     if not download_url or download_url == "UNKNOWN":
         for label in _DOWNLOAD_LABELS:
             if label in project_urls:
@@ -145,7 +145,7 @@ def _enrich_from_installed(
         dep_package.software_packageUrl = f"pkg:pypi/{purl_name}@{version}"
 
     # hasDeclaredLicense -- prefer PEP 639 License-Expression over legacy License
-    license_id = pkg_meta["License-Expression"] or pkg_meta["License"] or ""
+    license_id = pkg_meta.get("License-Expression") or pkg_meta.get("License") or ""
     if license_id and license_id != "UNKNOWN":
         rel_declared, _ = build_license_elements(
             license_id=license_id,
diff --git a/src/pitloom/extract/_huggingface.py b/src/pitloom/extract/_huggingface.py
index 59af49e..875067f 100644
--- a/src/pitloom/extract/_huggingface.py
+++ b/src/pitloom/extract/_huggingface.py
@@ -102,6 +102,29 @@
     {"other", "custom", "proprietary", "unknown", "unlicensed"}
 )
 
+
+def _canonicalize_license_id(raw: str) -> str:
+    """Map a model-card license value onto its canonical SPDX License ID.
+
+    Thin wrapper around
+    :func:`pitloom.extract._license.canonicalize_license_id`, which queries
+    ``AggregatedLicenseMatcher.match(license_id=raw)`` from ``licenseid``.
+    A recognised ID comes back in canonical casing
+    (e.g. ``"bsd-3-clause"`` → ``"BSD-3-Clause"``).
+
+    Anything the lookup does not recognise is handed back exactly as given;
+    pitloom adds no ``LicenseRef-`` prefix and applies no other rewriting,
+    leaving that to ``licenseid`` or downstream SBOM tooling.
+
+    The ``licenseid`` database must be populated (``licenseid update``);
+    otherwise *raw* is returned unchanged.
+    """
+    # pylint: disable=import-outside-toplevel
+    from pitloom.extract import _license
+
+    return _license.canonicalize_license_id(raw)
+
+
 # Filenames (case-sensitive, root of repo) considered license candidates.
 # Listed in priority order: no extension first, then common suffixes.
 _HF_LICENSE_FILENAMES: tuple[str, ...] = (
@@ -436,7 +459,7 @@ def _resolve_license(
         provenance["license"] = (
             "Source: Hugging Face Hub | Field: model card YAML (license)"
         )
-        return raw_license_str, None
+        return _canonicalize_license_id(raw_license_str), None
 
     vague_raw = (
         raw_license_str
diff --git a/src/pitloom/extract/_license.py b/src/pitloom/extract/_license.py
index 7276109..7497655 100644
--- a/src/pitloom/extract/_license.py
+++ b/src/pitloom/extract/_license.py
@@ -4,13 +4,12 @@
 
 """License text detection utilities using the licenseid library.
 
-Provides best-effort SPDX license ID detection from license text found in
-project files.  All detection is optional and degrades gracefully when the
-``licenseid`` package is not installed or its database has not been built.
+Provides SPDX license ID detection from license text and metadata found in
+project files.  Text detection and ID canonicalization need a populated
+database; other sources (``CITATION.cff``, ``codemeta.json``) work without it.
 
-To enable detection, install the package and build the database::
+Build the database before first use::
 
-    pip install licenseid
     licenseid update
 """
 
@@ -21,6 +20,8 @@
 import re
 from pathlib import Path
 
+from licenseid import AggregatedLicenseMatcher
+
 _logger = logging.getLogger(__name__)
 
 # Heuristic: single-token SPDX License IDs and expressions like "GPL-3.0-or-later"
@@ -56,45 +57,59 @@ def _looks_like_spdx_license_expression(value: str) -> bool:
     return bool(_SPDX_LICENSE_EXPR_KEYWORDS_RE.search(stripped))
 
 
-def _get_licenseid_db_path() -> Path:
-    return Path.home() / ".local" / "share" / "licenseid" / "licenses.db"
-
-
 def detect_license_from_text(text: str, threshold: float = 0.85) -> str | None:
     """Detect SPDX License ID from *text* using the licenseid library.
 
     Returns the top-ranked SPDX License ID when its score meets *threshold*, or
-    ``None`` when the database is absent, the library is not installed, or no
-    match exceeds the threshold.
+    ``None`` when the database is not populated, detection fails, or no match does.
 
     The database must be built before detection is possible::
 
         licenseid update
     """
-    db_path = _get_licenseid_db_path()
-    if not db_path.exists():
-        _logger.warning(
-            "licenseid database not found at %s -- "
-            "run 'licenseid update' to enable license text detection",
-            db_path,
-        )
-        return None
     try:
-        # pylint: disable=import-outside-toplevel
-        from licenseid import AggregatedLicenseMatcher
-    except ImportError:
-        _logger.debug("licenseid not installed; skipping license text detection")
-        return None
-    try:
-        matcher = AggregatedLicenseMatcher(str(db_path))
+        matcher = AggregatedLicenseMatcher()
+        # Probe with a well-known license ID to confirm the database is populated.
+        if not matcher.match(license_id="MIT"):
+            _logger.warning(
+                "licenseid database appears empty -- "
+                "run 'licenseid update' to enable license text detection"
+            )
+            return None
         results = matcher.match(text)
         filtered = [r for r in results if r["score"] >= threshold]
-        return filtered[0]["license_id"] if filtered else None
+        return str(filtered[0]["license_id"]) if filtered else None
     except Exception as exc:  # pylint: disable=broad-exception-caught
         _logger.debug("licenseid detection failed: %s", exc)
         return None
 
 
+def canonicalize_license_id(raw: str) -> str:
+    """Return the canonical SPDX License ID for *raw*, or *raw* unchanged.
+
+    Uses ``AggregatedLicenseMatcher.match(license_id=raw)`` for a direct
+    database lookup.  Returns the canonical casing when *raw* is a recognised
+    SPDX License ID (e.g. ``"bsd-3-clause"`` → ``"BSD-3-Clause"``).
+
+    Unrecognised values — non-SPDX identifiers, deprecated bare copyleft
+    forms (``"agpl-3.0"``, ``"gpl-3.0"``), or vendor-specific strings
+    (``"gemma"``, ``"llama3.2"``) — are returned verbatim.  pitloom leaves
+    further interpretation (e.g. adding a ``LicenseRef-`` prefix) to the
+    ``licenseid`` library or downstream SBOM tooling.
+
+    Requires a populated database (``licenseid update``).  If the lookup
+    raises, the failure is logged at debug level and *raw* is returned.
+    """
+    try:
+        results = AggregatedLicenseMatcher().match(license_id=raw)
+        if results:
+            return str(results[0]["license_id"])
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        # Best-effort: keep extraction alive, but leave a trace for debugging.
+        _logger.debug("licenseid canonicalization failed for %r: %s", raw, exc)
+    return raw
+
+
 def find_license_files(project_dir: Path) -> list[Path]:
     """Return existing license files in *project_dir* in priority order.
 
diff --git a/tests/fixtures/huggingface-hub/README.md b/tests/fixtures/huggingface-hub/README.md
index 7df0fbb..21499c6 100644
--- a/tests/fixtures/huggingface-hub/README.md
+++ b/tests/fixtures/huggingface-hub/README.md
@@ -224,32 +224,79 @@ The `hf.language` extractor passes non-standard values through unchanged
 | `"code"` | Programming-language content | `huggingface/CodeBERTa-small-v1` |
 | `False` (bool) | ISO code `"no"` parsed by YAML 1.1 | `openai/whisper-large-v3` — filtered out |
 
-### 8. License handling: passthrough, vague, and no-license patterns
+### 8. License handling: sources, canonicalization, vague values, and no-license patterns
+
+pitloom's role is to **find license information** from available sources.
+Once found, it delegates identification to the `licenseid` library.
+
+#### License sources (priority order)
+
+1. **Model card YAML `license` field** — the primary source; a license
+   identifier string supplied by the model author.
+2. **License files** (`LICENSE`, `COPYING`, etc.) — full license text;
+   used when the card value is vague (see below).  Detection performed by
+   `_detect_license_from_hf_files()` via `licenseid`.
+
+#### Canonicalization: `_canonicalize_license_id()`
+
+HF Hub stores license identifiers in **lowercase** (e.g. `apache-2.0`,
+`bsd-3-clause`), while SPDX requires specific mixed casing.
+`_canonicalize_license_id()` delegates to `canonicalize_license_id()` in
+`pitloom.extract._license`, which calls
+`AggregatedLicenseMatcher.match(license_id=raw)` from the `licenseid`
+library.  When that lookup returns a result, the `license_id` of its first
+entry (the canonical casing) is recorded; when it returns nothing or
+raises, the value is returned **verbatim**.
+
+pitloom does **not** add `LicenseRef-` prefixes or otherwise interpret
+unrecognized values — that is the responsibility of `licenseid` or downstream
+SBOM tooling.
+
+| HF card value | `meta.license` recorded | Source |
+| :--- | :--- | :--- |
+| `apache-2.0` | `Apache-2.0` | recognized by licenseid — canonical SPDX License ID |
+| `mit` | `MIT` | recognized by licenseid |
+| `bsd-3-clause` | `BSD-3-Clause` | recognized by licenseid |
+| `cc-by-4.0` | `CC-BY-4.0` | recognized by licenseid |
+| `cc-by-nc-4.0` | `CC-BY-NC-4.0` | recognized by licenseid |
+| `cc-by-nc-nd-4.0` | `CC-BY-NC-ND-4.0` | recognized by licenseid |
+| `cc-by-sa-4.0` | `CC-BY-SA-4.0` | recognized by licenseid |
+| `gemma` | `gemma` | not recognized by licenseid — recorded verbatim |
+| `llama3.2` | `llama3.2` | not recognized by licenseid — recorded verbatim |
+| `llama3` | `llama3` | not recognized by licenseid — recorded verbatim |
+| `apple-amlr` | `apple-amlr` | not recognized by licenseid — recorded verbatim |
+| `bigscience-bloom-rail-1.0` | `bigscience-bloom-rail-1.0` | not recognized by licenseid — recorded verbatim |
+| `bigcode-openrail-m` | `bigcode-openrail-m` | not recognized by licenseid — recorded verbatim |
+| `openrail++` | `openrail++` | not recognized by licenseid — recorded verbatim |
+| `agpl-3.0` | `agpl-3.0` | deprecated SPDX License ID, not recognized by licenseid — recorded verbatim |
+| `gpl-3.0` | `gpl-3.0` | deprecated SPDX License ID, not recognized by licenseid — recorded verbatim |
+
+Deprecated bare copyleft forms (`agpl-3.0`, `gpl-3.0`, `lgpl-2.1`, etc.) are
+not recognized by `licenseid` (SPDX replaced them with `-only` /
+`-or-later` variants requiring explicit intent).  They are recorded verbatim;
+downstream tooling can decide whether to map them to `-only`, `-or-later`, or
+add a `LicenseRef-` prefix.
+
+Requires `licenseid update` to populate the database.  When the database is
+unavailable all card values are recorded verbatim (no canonicalization).
+
+#### Vague license values
 
 `_VAGUE_LICENSE_VALUES` = `{"other", "custom", "proprietary", "unknown", "unlicensed"}`.
-Anything outside this set is stored as-is in `meta.license`.  This includes
-non-SPDX HuggingFace custom identifiers:
+When the card YAML `license` field matches one of these, the extractor:
 
-| Custom identifier | Example model(s) |
-| :--- | :--- |
-| `gemma` | `google/gemma-2b`, `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF`, `lerobot/pi05_base`, `bakrianoo/arabic-legal-documents-ocr-1.0` |
-| `llama3.2` | `meta-llama/Llama-3.2-1B`, `meta-llama/Llama-3.2-3B`, `meta-llama/Llama-3.2-3B-Instruct` |
-| `llama3` | `NousResearch/Hermes-3-Llama-3.2-3B` |
-| `apple-amlr` | `apple/DepthPro-hf`, `apple/OpenELM-270M`, `apple/Sharp` |
-| `kanana-license` | `kakaobank/kanana-1.5-v-3b-instruct` |
-| `bigscience-bloom-rail-1.0` | `bigscience/bloom`, `bigscience/bloomz-7b1` |
-| `bigcode-openrail-m` | `bigcode/starcoder2-3b` |
-| `openrail++` | `tum-nlp/Deberta_Human_Value_Detector` |
-| `bsd-3-clause` | `Salesforce/blip-vqa-base` |
-| `cc-by-nc-4.0` | `geolocal/StreetCLIP`, `MahmoodLab/UNI2-h`, `facebook/seamless-m4t-v2-large`, `HKUSTAudio/Llasa-3B` |
-| `cc-by-4.0` | `pyannote/speaker-diarization-community-1`, `TildeAI/TildeOpen-30b-64k`, `TildeAI/TildeOpen-30b` |
-| `cc-by-sa-4.0` | `sonoisa/sentence-bert-base-ja-mean-tokens`, `pythainlp/wangchanglm-7.5B-sft-enth` |
-| `cc-by-nc-nd-4.0` | `MahmoodLab/UNI2-h` |
-| `qwen` | `openthaigpt/openthaigpt-r1-32b-instruct` (via `license_name`) |
-| `llava2` | `llava-hf/LLaVA-NeXT-Video-7B-hf` |
-
-**The `license_name` secondary field** appears when the primary `license` field
-is vague or unrecognised.  It is stored in `extra_data["hf.license_name"]`:
+1. Saves the raw value in `extra_data["hf.license_raw"]`
+2. Calls `_detect_license_from_hf_files()` — downloads license files and uses
+   `licenseid` text detection to identify the SPDX License ID
+
+When the `license` field is absent entirely, `meta.license` is `None` and
+`hf.license_raw` is not set (distinct from the vague-value path).
+
+#### `license_name` secondary field
+
+Some model cards include a `license_name` key alongside the primary `license`
+field — typically a human-readable or more specific name.  pitloom records it
+verbatim in `extra_data["hf.license_name"]` without canonicalization.
 
 | `license_name` value | Example model | Notes |
 | :--- | :--- | :--- |
@@ -257,14 +304,7 @@ is vague or unrecognised.  It is stored in `extra_data["hf.license_name"]`:
 | `sai-nc-community` | `stabilityai/stable-zero123` | Stability AI non-commercial |
 | `tencent-hunyuan-community` | `tencent/HY-Motion-1.0` | Tencent Hunyuan community |
 | `open-aleph-license` | `Aleph-Alpha/Pharia-1-LLM-7B-control` | Aleph Alpha open licence |
-| `bsd-3-clause` | `TencentARC/TimeLens-8B` | SPDX ID used as `license_name` when HF has no SPDX slot |
-
-When the card YAML contains a vague `license` value, the raw string is saved in
-`extra_data["hf.license_raw"]` and `_detect_license_from_hf_files` is called
-to look for a real SPDX ID in license files (`LICENSE`, `COPYING`, etc.).
-
-When the `license` field is absent entirely, `meta.license` is `None` and
-`hf.license_raw` is not set (distinct from the vague-value path).
+| `bsd-3-clause` | `TencentARC/TimeLens-8B` | SPDX License ID used as secondary name |
 
 ### 9. BLOOM architecture: non-standard config key names (known gap)
 
@@ -618,50 +658,50 @@ come from `config.json`.
 
 | Model ID | Pattern | Notable |
 | :--- | :--- | :--- |
-| `mistralai/Mistral-7B-v0.1` | Baseline transformer | Standard LLM: GQA, apache-2.0, `text-generation` pipeline |
-| `Qwen/Qwen3-235B-A22B` | MoE, `qwen` license passthrough, generation config | `qwen3_moe` arch; `Qwen3MoeForCausalLM`; thinking-mode temperature+top_p |
-| `Qwen/Qwen3.5-27B` | Dense Qwen3.5, GQA (8 KV heads), apache-2.0 | `qwen3` arch; `Qwen3ForCausalLM`; 40 attention heads / 8 KV heads |
+| `mistralai/Mistral-7B-v0.1` | Baseline transformer | Standard LLM: GQA, `Apache-2.0`, `text-generation` pipeline |
+| `Qwen/Qwen3-235B-A22B` | MoE, `qwen` license (verbatim), generation config | `qwen3_moe` arch; `Qwen3MoeForCausalLM`; thinking-mode temperature+top_p |
+| `Qwen/Qwen3.5-27B` | Dense Qwen3.5, GQA (8 KV heads), `Apache-2.0` | `qwen3` arch; `Qwen3ForCausalLM`; 40 attention heads / 8 KV heads |
 | `openthaigpt/openthaigpt-r1-32b-instruct` | Vague license + file detection | `license=other`; `license_name=qwen` secondary field; Thai |
 | `hexgrad/Kokoro-82M` | No `model_type` / `architectures` | Custom config schema → `type_of_model=None`, `architecture=None` |
 | `moonshotai/Kimi-K2.6` | Vague license + file detection | `license=other` → `hf.license_raw`; `_detect_license_from_hf_files` triggered |
-| `google/gemma-2b` | Gated config, custom license | 401 on config.json; `gemma` license in card |
-| `meta-llama/Llama-3.2-1B` | Gated config, custom license, 8 languages | `llama3.2` license; config inaccessible → no arch |
-| `meta-llama/Llama-3.2-3B` | Gated base, no architecture | Config gated → `type_of_model=None`; llama3.2 license |
+| `google/gemma-2b` | Gated config, custom license | 401 on config.json; `gemma` recorded verbatim |
+| `meta-llama/Llama-3.2-1B` | Gated config, custom license, 8 languages | `llama3.2` recorded verbatim; config inaccessible → no arch |
+| `meta-llama/Llama-3.2-3B` | Gated base, no architecture | Config gated → `type_of_model=None`; `llama3.2` recorded verbatim |
 | `meta-llama/Llama-3.2-3B-Instruct` | Gated instruct, base_model finetune | Config gated; `base_model_relation=finetune` from 3B base |
-| `NousResearch/Hermes-3-Llama-3.2-3B` | Not gated, llama3 license, finetune | `LlamaForCausalLM`; `llama3` license; finetune from Llama-3.2-3B |
+| `NousResearch/Hermes-3-Llama-3.2-3B` | Not gated, llama3 license, finetune | `LlamaForCausalLM`; `llama3` recorded verbatim; finetune from Llama-3.2-3B |
 | `deepseek-ai/DeepSeek-R1` | MIT license, no pipeline_tag, MoE | Empty `usage.domains`; standard SPDX MIT |
-| `bigcode/starcoder2-3b` | `"code"` tag → domain, dataset ref | `code` → `usage.domains` not `extra_lists["hf.tags"]`; training dataset |
+| `bigcode/starcoder2-3b` | `"code"` tag → domain, dataset ref, custom license | `code` → `usage.domains`; `bigcode-openrail-m` recorded verbatim |
 | `SeaLLMs/SeaLLMs-v3-7B-Chat` | Vague license, 12 SEA/Asian languages | `license=other`; no pipeline_tag; qwen2 base |
-| `typhoon-ai/typhoon-7b` | Thai-only, GQA | `["th"]`; `num_key_value_heads=8`; apache-2.0 |
-| `iapp/chinda-qwen3-4b` | Base_model finetune, DOI | Thai LLM; Qwen3-4B base; `doi:10.57967/hf/5709`; apache-2.0 |
+| `typhoon-ai/typhoon-7b` | Thai-only, GQA | `["th"]`; `num_key_value_heads=8`; Apache-2.0 |
+| `iapp/chinda-qwen3-4b` | Base_model finetune, DOI | Thai LLM; Qwen3-4B base; `doi:10.57967/hf/5709`; Apache-2.0 |
 | `iapp/chinda-qwen3-4b-gguf` | GGUF-only, base_model quantized, scalar base_model | `base_model` as scalar string in card YAML; no config.json |
 | `talkie-lm/talkie-1930-13b-it` | No config.json, finetune, no domain | No pipeline_tag → empty `usage.domains` |
-| `pythainlp/wangchanglm-7.5B-sft-enth` | Multi-dataset, tokenizer sentinel | 3 datasets; `model_max_length` sentinel filtered; cc-by-sa-4.0 |
+| `pythainlp/wangchanglm-7.5B-sft-enth` | Multi-dataset, tokenizer sentinel | 3 datasets; `model_max_length` sentinel filtered; `CC-BY-SA-4.0` |
 | `mesolitica/mallam-1.1B-4096` | No license, Malay only | `license=None`; `language=["ms"]`; mistral base |
-| `llm-jp/llm-jp-3-1.8b` | Large JP vocab LLaMA | 99 584-token vocab; apache-2.0; Japanese+English |
+| `llm-jp/llm-jp-3-1.8b` | Large JP vocab LLaMA | 99 584-token vocab; Apache-2.0; Japanese+English |
 | `mistralai/Mistral-Medium-3.5-128B` | 22 languages, vague license, no pipeline_tag | `usage.domains==[]`; `license=other` |
 | `poolside/Laguna-XS.2` | Custom `model_type` and architecture | `model_type=laguna`; `LagunaForCausalLM`; custom tags preserved |
 | `abeja/gpt-neox-japanese-2.7b` | Language scalar, multi-dataset | `language: ja` scalar → `["ja"]`; cc100+wikipedia datasets |
-| `ibm-granite/granite-4.1-8b` | GQA (8 KV heads), 12 languages, finetune | granite arch; finetune from granite-4.1-8b-base; apache-2.0 |
+| `ibm-granite/granite-4.1-8b` | GQA (8 KV heads), 12 languages, finetune | granite arch; finetune from granite-4.1-8b-base; Apache-2.0 |
 | `Crownelius/Crow-9B-HERETIC-4.6` | `base_model_relation=merge`, 26 languages | Qwen3.5; merged/distilled from Claude |
 | `SamsungSAILMontreal/Qwen3-Coder-Next-REAP` | `base_model_relation=merge`, MoE | Qwen3-Next 80B→60B expert pruning; merge relation |
 | `facebook/opt-2.7b` | Vague license (other), OPT arch | `opt` arch; Meta non-commercial → `hf.license_raw=other` |
 | `facebook/opt-iml-max-1.3b` | Vague license, arxiv, instruction-tuned OPT | `arxiv:2212.12017`; instruction-tuned on ~2000 NLP tasks |
-| `EleutherAI/gpt-neo-2.7B` | gpt_neo arch, standard SPDX license | `GPTNeoForCausalLM`; 32 layers; apache-2.0 |
-| `stabilityai/stablelm-2-zephyr-1_6b` | stablelm_epoch arch, 12 languages | `StableLMEpochForCausalLM`; 100 352-token vocab; apache-2.0 |
-| `TinyLlama/TinyLlama-1.1B-Chat-v1.0` | Shallow LLaMA (22 layers) | Shallower than standard 7B (32 layers); apache-2.0 |
+| `EleutherAI/gpt-neo-2.7B` | gpt_neo arch, standard SPDX license | `GPTNeoForCausalLM`; 32 layers; Apache-2.0 |
+| `stabilityai/stablelm-2-zephyr-1_6b` | stablelm_epoch arch, 12 languages | `StableLMEpochForCausalLM`; 100 352-token vocab; Apache-2.0 |
+| `TinyLlama/TinyLlama-1.1B-Chat-v1.0` | Shallow LLaMA (22 layers) | Shallower than standard 7B (32 layers); Apache-2.0 |
 | `microsoft/phi-2` | phi arch, `"code"` tag → domain | `code` in `_DOMAIN_TAGS`; MIT; `code` not in `hf.tags` |
-| `tokyotech-llm/Qwen3-Swallow-8B-SFT-v0.2` | Qwen3 finetune, Japanese+English | SFT from CPT stage; apache-2.0 |
+| `tokyotech-llm/Qwen3-Swallow-8B-SFT-v0.2` | Qwen3 finetune, Japanese+English | SFT from CPT stage; Apache-2.0 |
 | `aisingapore/Gemma-SEA-LION-v4-27B-IT` | `image-text-to-text` in tags → extra domain | Gemma3 27B; 11 SEA languages; `gemma` license |
 | `FINAL-Bench/Darwin-28B-KR-Legal` | Korean legal LLM, finetune | `qwen3_5` arch; 64 layers; Korean+English |
-| `Intelligent-Internet/II-Medical-8B` | Qwen3 finetune, empty card tags | Medical domain; `hidden_size=4096`; no pipeline_tag; apache-2.0 |
-| `THUDM/GLM-4.5-Air-REAP` | MoE, `base_model_relation=merge`, apache-2.0 | `glm4_moe` arch; `Glm4MoeForCausalLM`; Samsung REAP merge from GLM-4.5-Air |
-| `Fujitsu/Fujitsu-LLM-KG-8x7B` | Gated config, NeMo library | Config 401; `library_name=nemo` → `hf.library_name`; apache-2.0 |
+| `Intelligent-Internet/II-Medical-8B` | Qwen3 finetune, empty card tags | Medical domain; `hidden_size=4096`; no pipeline_tag; Apache-2.0 |
+| `THUDM/GLM-4.5-Air-REAP` | MoE, `base_model_relation=merge`, `Apache-2.0` | `glm4_moe` arch; `Glm4MoeForCausalLM`; Samsung REAP merge from GLM-4.5-Air |
+| `Fujitsu/Fujitsu-LLM-KG-8x7B` | Gated config, NeMo library | Config 401; `library_name=nemo` → `hf.library_name`; Apache-2.0 |
 | `mistralai/Voxtral-Mini-4B-Realtime-2602` | Multimodal audio+text (ASR), `vllm` library | `voxtral_realtime` arch; audio encoder + text decoder; `library_name=vllm` |
-| `TildeAI/TildeOpen-30b-64k` | YaRN RoPE context extension, 7 datasets, cc-by-4.0 | 8 192 → 65 536 tokens via YaRN; `rope_scaling` not in `_HYPER_KEYS`; `tokenizer_max_length=65536` |
+| `TildeAI/TildeOpen-30b-64k` | YaRN RoPE context extension, 7 datasets, `CC-BY-4.0` | 8 192 → 65 536 tokens via YaRN; `rope_scaling` not in `_HYPER_KEYS`; `tokenizer_max_length=65536` |
 | `TildeAI/TildeOpen-30b` | Base 30B, unlimited tokenizer sentinel | Same 7 corpora; LlamaTokenizer sentinel filtered; no YaRN |
 | `openeurollm/datamix-9b-80-20` | Gemma-3 tokenizer (262K vocab), no GQA, no pipeline_tag | `vocab_size=262400`; `num_kv_heads=num_attn_heads=32`; empty `usage.domains` |
-| `bigscience/bloom` | BLOOM 176B, ALiBi, custom key names, custom license | `n_layer`/`n_head` not in `_HYPER_KEYS` → layers skipped; `bigscience-bloom-rail-1.0` passthrough |
+| `bigscience/bloom` | BLOOM 176B, ALiBi, custom key names, custom license | `n_layer`/`n_head` not in `_HYPER_KEYS` → layers skipped; `bigscience-bloom-rail-1.0` recorded verbatim |
 | `bigscience/bloomz-7b1` | BLOOM 7B, `seq_length` captured, xP3 finetune | `seq_length=2048` (new `_HYPER_KEYS` entry); finetune from bloom-7b1; `bigscience/xP3` dataset |
 | `CohereLabs/aya-23-8B` | Fully gated (card + config 401) | Same pattern as `CohereLabs/aya-vision-8b`; only `hf.author` from `model_info` |
 | `occiglot/occiglot-7b-eu5-instruct` | Mistral, `sliding_window` captured, 5 EU langs | `sliding_window=4096` in `_HYPER_KEYS`; finetune from occiglot-7b-eu5 |
@@ -671,8 +711,8 @@ come from `config.json`.
 | `FreedomIntelligence/BlenderLLM` | text-to-3d pipeline, Qwen2 LLM | Standard Qwen2 decoder fine-tuned for Blender script generation; `pipeline_tag=text-to-3d` |
 | `hellork/BlenderLLM-IQ3_XXS-GGUF` | GGUF of BlenderLLM, text-to-3d, quantized | No config.json; `base_model_relation=quantized`; `text-to-3d` domain inherited |
 | `MiniMaxAI/MiniMax-M2.7` | minimax_m2 arch, MoE with MTP, 1M context, vague license | `MiniMaxM2ForCausalLM`; `max_position_embeddings=1_000_000`; `license=other` |
-| `apple/OpenELM-270M` | openelm arch, apple-amlr passthrough, head_dim captured | Custom efficient arch; `head_dim=64` in `_HYPER_KEYS` → captured; non-standard keys skipped |
-| `sail/Sailor2-20B` | Qwen2, 10 SEA languages, apache-2.0 | Covers Thai, Khmer, Lao, Malay, Burmese, Filipino; `num_key_value_heads=8` |
+| `apple/OpenELM-270M` | openelm arch, `apple-amlr` license (verbatim), head_dim captured | Custom efficient arch; `head_dim=64` in `_HYPER_KEYS` → captured; non-standard keys skipped |
+| `sail/Sailor2-20B` | Qwen2, 10 SEA languages, `Apache-2.0` | Covers Thai, Khmer, Lao, Malay, Burmese, Filipino; `num_key_value_heads=8` |
 | `huggingface/CodeBERTa-small-v1` | RoBERTa, fill-mask, no license, `language=["code"]` | Pre-trained on The Stack; `language="code"` preserved (non-ISO identifier) |
 | `Bencode92/tradepulse-finbert-sentiment` | BERT, text-classification, finetune from finbert | `BertForSequenceClassification`; financial sentiment; `base_model_relation=finetune` |
 | `OpenVINO/Mixtral-8x7B-Instruct-v0.1-int8-ov` | OpenVINO int8 quant, config accessible | Config.json present (unlike GGUF); `torch_dtype=int8` captured; `library_name=openvino` |
@@ -682,7 +722,7 @@ come from `config.json`.
 
 | Model ID | Pattern | Notable |
 | :--- | :--- | :--- |
-| `sonoisa/sentence-bert-base-ja-mean-tokens` | Language scalar string fix | `language: ja` → `["ja"]`; sentence-similarity; cc-by-sa-4.0 |
+| `sonoisa/sentence-bert-base-ja-mean-tokens` | Language scalar string fix | `language: ja` → `["ja"]`; sentence-similarity; `CC-BY-SA-4.0` |
 | `cl-nagoya/ruri-v3-310m` | ModernBERT, base_model finetune, arxiv | `arxiv:2409.07737`; Japanese embedding; sentence-similarity |
 | `nomic-ai/nomic-embed-text-v1.5-GGUF` | GGUF-only, base_model quantized | No config.json; `base_model_relation=quantized`; nomic-embed |
 | `ibm-granite/granite-embedding-97m-multilingual-r2` | ModernBERT, sentence-transformers library | 200+ languages; `hf.library_name=sentence-transformers`; feature-extraction |
@@ -692,11 +732,11 @@ come from `config.json`.
 | `airesearch/WangchanX-Legal-ThaiCCL-Retriever` | Base_model finetune, MIT, dataset ref | Fine-tuned from BAAI/bge-m3; xlm-roberta arch; Thai legal |
 | `jinaai/jina-embeddings-v4` | visual-document-retrieval domain, no license | 131 072 token context; `language=["multilingual"]` keyword preserved; `license=None` |
 | `HuggingFaceFW/fineweb-edu-classifier` | text-classification, base_model finetune | Fine-tuned from Snowflake arctic-embed; educational quality 0–5 |
-| `tum-nlp/Deberta_Human_Value_Detector` | text-classification, `openrail++` passthrough | `openrail++` ∉ `_VAGUE_LICENSE_VALUES`; 20 value categories |
+| `tum-nlp/Deberta_Human_Value_Detector` | text-classification, `openrail++` verbatim | `openrail++` ∉ `_VAGUE_LICENSE_VALUES`; not recognized by licenseid → recorded verbatim; 20 value categories |
 | `nlp-chula/aspect-finnlp-th` | text-classification, Thai financial, no license | CamemBERT-based; fine-tuned from wangchanberta; `license=None` |
 | `openai/privacy-filter` | token-classification, 128 K context | `hf.tokenizer_max_length=128000` captured; custom arch |
-| `line-corporation/line-distilbert-base-japanese` | fill-mask, DistilBERT (6 layers) | Japanese BERT distilled to 6 layers; `DistilBertForMaskedLM`; apache-2.0 |
-| `line-corporation/clip-japanese-base-v2` | feature-extraction, custom `clyp` model_type | Line Corp CLIP variant; `CLYPModel` arch; apache-2.0; Japanese |
+| `line-corporation/line-distilbert-base-japanese` | fill-mask, DistilBERT (6 layers) | Japanese BERT distilled to 6 layers; `DistilBertForMaskedLM`; Apache-2.0 |
+| `line-corporation/clip-japanese-base-v2` | feature-extraction, custom `clyp` model_type | Line Corp CLIP variant; `CLYPModel` arch; Apache-2.0; Japanese |
 | `Alibaba-NLP/gte-multilingual-reranker-base` | text-ranking domain, `model_type="new"` placeholder | `NewForSequenceClassification`; `model_type="new"` is a literal string, not a typo |
 | `Alibaba-NLP/gte-modernbert-base` | modernbert arch, sentence-similarity, multilingual | `ModernBertModel` (base encoder); `max_position_embeddings=8192` captured |
 | `dbmdz/bert-base-turkish-cased` | bert, no `architectures` field, Turkish, no pipeline_tag | `model_type=bert` present; `architectures` absent → `architecture=None`; minimal card (2 fields) |
@@ -706,26 +746,26 @@ come from `config.json`.
 
 | Model ID | Pattern | Notable |
 | :--- | :--- | :--- |
-| `apple/DepthPro-hf` | depth-estimation domain, custom license | `apple-amlr` ∉ `_VAGUE_LICENSE_VALUES` → stored as-is; DepthPro arch |
+| `apple/DepthPro-hf` | depth-estimation domain, custom license | `apple-amlr` recorded verbatim; DepthPro arch |
 | `prs-eth/marigold-depth-v1-0` | depth-estimation, diffusers, no config | `library_name=diffusers`; no config.json → no arch |
 | `usyd-community/vitpose-plus-huge` | keypoint-detection domain | ViTPose arch; human pose estimation |
 | `laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg` | zero-shot-image-classification, no config | No config.json → no arch; `library_name=open_clip` |
-| `geolocal/StreetCLIP` | zero-shot-image-classification, CLIP, cc-by-nc-4.0 | CLIP arch; geo-localisation tags in extra_lists |
+| `geolocal/StreetCLIP` | zero-shot-image-classification, CLIP, `CC-BY-NC-4.0` | CLIP arch; geo-localisation tags in extra_lists |
 | `microsoft/swin-tiny-patch4-window7-224` | No pipeline_tag, domain from card tags | `"image-classification"` in `tags` → domain; imagenet-1k dataset |
-| `microsoft/resnet-18` | image-classification from card tags | `resnet` arch; apache-2.0; same tag-domain pattern as Swin |
-| `facebook/dinov2-small` | image-feature-extraction domain | DINOv2 self-supervised ViT; apache-2.0 |
+| `microsoft/resnet-18` | image-classification from card tags | `resnet` arch; Apache-2.0; same tag-domain pattern as Swin |
+| `facebook/dinov2-small` | image-feature-extraction domain | DINOv2 self-supervised ViT; Apache-2.0 |
 | `microsoft/rad-dino` | image-feature-extraction, no license | DINOv2 fine-tuned on radiology; `license=None` |
-| `MahmoodLab/UNI2-h` | Gated config, `cc-by-nc-nd-4.0` | Pathology/histology ViT; restrictive NC+ND license; tags in extra_lists |
+| `MahmoodLab/UNI2-h` | Gated config, `CC-BY-NC-ND-4.0` | Pathology/histology ViT; restrictive NC+ND license; tags in extra_lists |
 | `timm/convnext_large.dinov3_lvd1689m` | Vague license, timm library, no config | `license=other`; `library_name=timm`; no config.json |
 | `briaai/RMBG-1.4` | Vague license, image-segmentation | `license=other` → `hf.license_raw`; custom tags |
 | `briaai/RMBG-2.0` | Gated config, vague license | Config gated → no type_of_model; domain from card; `license=other` |
 | `ibm-granite/granite-geospatial-uki-flooddetection` | image-segmentation, TerraTorch, HF dataset refs | No transformers config; two `/datasets/` repos as `DatasetReference` |
 | `prithivMLmods/Flood-Image-Detection` | image-classification, siglip, arxiv, finetune | Fine-tuned from google/siglip2-base-patch16-512; `arxiv:2502.14786` |
 | `LGAI-EXAONE/EXAONE-Path-2.0-rev-EGFR` | Gated config, non-standard pipeline tag | Config 401; `pathology-image-analysis` captured as domain (pipeline_tag, not tags); `license=other` |
-| `windowseat-ai/windowseat-reflection` | No config, PEFT library, image-to-image | Config 404; `library_name=peft` → `hf.library_name`; apache-2.0 |
+| `windowseat-ai/windowseat-reflection` | No config, PEFT library, image-to-image | Config 404; `library_name=peft` → `hf.library_name`; Apache-2.0 |
 | `stabilityai/stable-zero123` | text-to-3d, diffusers, vague license + `license_name` | No config.json; `hf.license_name=sai-nc-community`; `library_name=diffusers` |
 | `openai/shap-e` | text-to-3d, MIT, no config | Generates 3D assets from text/images; no config.json; MIT |
-| `apple/Sharp` | image-to-3d, apple-amlr passthrough, ml-sharp library | Single-image 3D generation; `library_name=ml-sharp`; no config.json |
+| `apple/Sharp` | image-to-3d, `apple-amlr` license (verbatim), ml-sharp library | Single-image 3D generation; `library_name=ml-sharp`; no config.json |
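-| `FireRedTeam/FireRedVAD` | voice-activity-detection, no config, apache-2.0 | VAD; new `_DOMAIN_TAGS` entry; no config.json → `type_of_model=None` |
+| `FireRedTeam/FireRedVAD` | voice-activity-detection, no config, Apache-2.0 | VAD; new `_DOMAIN_TAGS` entry; no config.json → `type_of_model=None` |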
 | `ETH-CVG/lightglue_superpoint` | keypoint-detection, lightglue arch, vague license | Feature matching; non-standard config keys → `hyperparameters={}`; `license=other` |
 | `qualcomm/HRNetPose` | keypoint-detection, pytorch library, vague license | Native PyTorch format; `library_name=pytorch`; no config.json; `license=other` |
@@ -738,30 +778,30 @@ come from `config.json`.
 | :--- | :--- | :--- |
 | `dandelin/vilt-b32-finetuned-vqa` | visual-question-answering, base_model finetune, arxiv | ViLT on VQAv2; `arxiv:2102.03334`; finetune from vilt-b32 |
 | `google/deplot` | visual-question-answering + `image-text-to-text` in tags | pix2struct; both pipeline tag and card tag → two domains; `arxiv:2212.10505` |
-| `Salesforce/blip-vqa-base` | visual-question-answering, `bsd-3-clause` passthrough | `bsd-3-clause` ∉ `_VAGUE_LICENSE_VALUES`; blip arch |
+| `Salesforce/blip-vqa-base` | visual-question-answering, `BSD-3-Clause` | `bsd-3-clause` → `BSD-3-Clause` (canonical SPDX); blip arch |
 | `naver-clova-ix/donut-base-finetuned-docvqa` | document-question-answering, vision-encoder-decoder | `image-to-text` also captured via card tags; donut arch |
 | `impira/layoutlm-document-qa` | document-question-answering, language scalar | `language: en` scalar → `["en"]`; layoutlm arch; MIT |
 | `google/tapas-large-finetuned-wtq` | table-question-answering, language scalar, dataset ref | `language: en` scalar; dataset ref; tapas arch |
-| `llava-hf/LLaVA-NeXT-Video-7B-hf` | video-text-to-text + image-text-to-text (two domains) | Both pipeline tag and card tag → two domain entries; llava2 license |
-| `aisingapore/Gemma-SEA-LION-v4-4B-VL` | image-text-to-text, gemma license, SEA, finetune | Gemma3 multimodal; 9 SEA languages; finetune from google/gemma-3-4b-it |
+| `llava-hf/LLaVA-NeXT-Video-7B-hf` | video-text-to-text + image-text-to-text (two domains) | Both pipeline tag and card tag → two domain entries; `llava2` recorded verbatim |
+| `aisingapore/Gemma-SEA-LION-v4-4B-VL` | image-text-to-text, `gemma` license (verbatim), SEA, finetune | Gemma3 multimodal; 9 SEA languages; finetune from google/gemma-3-4b-it |
 | `openvla/openvla-7b` | robotics + image-text-to-text (two domains), MIT | VLA policy; pipeline tag and card tag → two domains |
 | `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16` | any-to-any domain, vague license | Reasoning + audio+video+text; `license=other`; card dataset takes priority |
 | `briaai/Fibo-Edit-RMBG` | image-to-image, arxiv, base_model finetune | `arxiv:2511.06876`; finetune from briaai/Fibo-Edit; diffusers |
-| `baidu/ERNIE-Image-Turbo` | text-to-image, diffusers, Chinese+English | Distilled DiT; `library_name=diffusers`; apache-2.0 |
+| `baidu/ERNIE-Image-Turbo` | text-to-image, diffusers, Chinese+English | Distilled DiT; `library_name=diffusers`; Apache-2.0 |
 | `Doses-AI/boba-0.8b-food-GGUF` | image-text-to-text, GGUF, food domain | No config.json → `type_of_model=None`; finetune from Qwen3.5-0.8B |
-| `bakrianoo/arabic-legal-documents-ocr-1.0` | image-text-to-text, gemma license, Arabic OCR | Gemma3; `license=gemma`; scanned Arabic legal documents |
-| `kakaobank/kanana-1.5-v-3b-instruct` | image-text-to-text, `kanana-license` passthrough | `kanana-1.5-v` arch; `KananaVForConditionalGeneration`; Korean VLM |
+| `bakrianoo/arabic-legal-documents-ocr-1.0` | image-text-to-text, `gemma` license (verbatim), Arabic OCR | Gemma3; `gemma` recorded verbatim; scanned Arabic legal documents |
+| `kakaobank/kanana-1.5-v-3b-instruct` | image-text-to-text, `kanana-license` (verbatim) | `kanana-1.5-v` arch; `KananaVForConditionalGeneration`; Korean VLM |
 | `LGAI-EXAONE/EXAONE-4.5-33B` | image-text-to-text, vague license, 6 languages | `exaone4_5` arch; Korean+multilingual; `license=other` → `hf.license_raw` |
 | `LGAI-EXAONE/EXAONE-4.5-33B-AWQ` | AWQ quantized, config accessible (unlike GGUF) | Config present; `base_model_relation=quantized`; `license=other` |
 | `LGAI-EXAONE/EXAONE-4.5-33B-FP8` | FP8 quantized, `torch_dtype=float8_e4m3fn` | `torch_dtype` in `_HYPER_KEYS` → captured in hyperparameters |
 | `LGAI-EXAONE/EXAONE-4.5-33B-GGUF` | GGUF, no config.json, vague license | `type_of_model=None`; `base_model_relation=quantized`; `license=other` |
-| `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF` | GGUF-only VLM, `gemma` license, SEA langs | No config.json; `image-text-to-text`; 9 SEA languages; `gemma` license passthrough |
+| `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF` | GGUF-only VLM, `gemma` license (verbatim), SEA langs | No config.json; `image-text-to-text`; 9 SEA languages; `gemma` recorded verbatim |
 | `Gen-Verse/MMaDA-8B-Base` | llada arch, ALiBi positional bias, any-to-any, MIT | ALiBi: no `max_position_embeddings`; `vocab_size=32000` captured; masked-token diffusion |
 | `mlx-community/gemma-4-e2b-it-4bit` | MLX 4-bit quant, gemma4 arch, any-to-any | `library_name=mlx`; config.json accessible; `base_model_relation=quantized` |
 | `onnx-community/gemma-4-E2B-it-ONNX` | ONNX export, gemma4 arch, transformers.js | `library_name=transformers.js`; config.json accessible; `base_model_relation=quantized` |
 | `ByteDance-Seed/BAGEL-7B-MoT` | bagel arch, any-to-any, nested config | Mixture-of-Tokens multimodal; `hyperparameters={}`; `library_name=bagel-mot` |
 | `sensenova/SenseNova-U1-8B-MoT` | neo_chat arch, any-to-any, nested config | `NEOChatModel`; Chinese+English; `hyperparameters={}` |
-| `inclusionAI/LLaDA2.0-Uni` | llada2_moe arch, discrete diffusion, any-to-any | Masked-token diffusion model; `LLaDA2MoeModelLM`; apache-2.0 |
+| `inclusionAI/LLaDA2.0-Uni` | llada2_moe arch, discrete diffusion, any-to-any | Masked-token diffusion model; `LLaDA2MoeModelLM`; Apache-2.0 |
 | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | qwen2 base + MiMoAudioModel wrapper, any-to-any | Architecture field captures custom wrapper; `model_type=qwen2` (base) preserved |
 | `tencent/HY-Motion-1.0` | text-to-3d, custom config, vague license + `license_name` | `library_name=HY-Motion-1.0`; non-standard config → `type_of_model=None`; `hf.license_name=tencent-hunyuan-community` |
 | `TencentARC/TimeLens-8B` | qwen3_vl, nested text_config, video-text-to-text, `dtype` key | All LM keys inside `text_config` → `hyperparameters={}`; `dtype` (not `torch_dtype`); `license_name=bsd-3-clause` |
@@ -771,8 +811,8 @@ come from `config.json`.
 
 | Model ID | Pattern | Notable |
 | :--- | :--- | :--- |
-| `openai/whisper-large-v3` | 99-language ASR, YAML 1.1 boolean hazard | ISO code `"no"` parsed as `False` → filtered; apache-2.0 |
-| `facebook/seamless-m4t-v2-large` | ASR pipeline + `audio-to-audio` + `text-to-speech` from tags | Three domains captured; cc-by-nc-4.0 |
+| `openai/whisper-large-v3` | 99-language ASR, YAML 1.1 boolean hazard | ISO code `"no"` parsed as `False` → filtered; Apache-2.0 |
+| `facebook/seamless-m4t-v2-large` | ASR pipeline + `audio-to-audio` + `text-to-speech` from tags | Three domains captured; `CC-BY-NC-4.0` |
 | `ibm-granite/granite-speech-4.1-2b` | ASR, base_model finetune, 6 languages | Conformer + Q-Former + granite LM; finetune from granite-4.0-1b-base |
 | `ai4bharat/indic-conformer-600m-multilingual` | Gated ASR, 22 Indian language codes | MIT; config gated; 22 ISO language codes extracted from card |
 | `cstr/mimo-asr-GGUF` | GGUF ASR, base_model quantized | Qwen2-based; quantized from XiaomiMiMo/MiMo-V2.5-ASR; zh+en |
@@ -782,8 +822,8 @@ come from `config.json`.
 | `jonatasgrosman/wav2vec2-large-xlsr-53-japanese` | Language scalar, DOI | `language: ja` scalar; `doi:10.57967/hf/3568`; ASR domain |
 | `k2-fsa/OmniVoice` | text-to-speech domain, arxiv, base_model finetune | 646 languages as `["multilingual"]`; `arxiv:2604.00688`; Qwen3-0.6B base |
 | `drbaph/OmniVoice-bf16` | text-to-speech domain, finetune | BF16 conversion of k2-fsa/OmniVoice; same TTS domain |
-| `pyannote/speaker-diarization-community-1` | speaker-diarization domain, gated, pyannote.audio | cc-by-4.0 (permissive, despite gating); no config.json; `library_name=pyannote.audio` |
-| `HKUSTAudio/Llasa-3B` | text-to-speech, LLaMA arch, large vocab | `LlamaForCausalLM` repurposed for TTS; `vocab_size=193800` (speech tokens); cc-by-nc-4.0 |
+| `pyannote/speaker-diarization-community-1` | speaker-diarization domain, gated, pyannote.audio | `CC-BY-4.0` (permissive, despite gating); no config.json; `library_name=pyannote.audio` |
+| `HKUSTAudio/Llasa-3B` | text-to-speech, LLaMA arch, large vocab | `LlamaForCausalLM` repurposed for TTS; `vocab_size=193800` (speech tokens); `CC-BY-NC-4.0` |
 
 ### Translation, seq2seq, and domain-specific
 
@@ -795,16 +835,16 @@ come from `config.json`.
 | `tencent/Hy-MT1.5-1.8B-2bit-GGUF` | GGUF quantized, `"multilingual"` language keyword | No config.json; `language=["multilingual"]`; `base_model_relation=quantized` |
 | `tencent/Hunyuan-MT-7B` | Translation from tag, no license | Same hunyuan arch as HY-MT1.5; `license=None` |
 | `protonx-models/protonx-legal-tc` | text2text-generation, NC license → other, Vietnamese | T5; proprietary non-commercial → `license=other` → `hf.license_raw` |
-| `ReDiX/Legal-Embedding-ita-0.6B` | sentence-similarity, Italian legal, cc-by-nc-4.0 | Qwen3 base; Italian legal corpus |
+| `ReDiX/Legal-Embedding-ita-0.6B` | sentence-similarity, Italian legal, `CC-BY-NC-4.0` | Qwen3 base; Italian legal corpus |
 | `lmg-anon/vntl-llama3-8b-v2-gguf` | GGUF, base_model quantized, llama3 license | Quantized from rinna/llama-3-youko-8b; translation domain |
 | `sugoitoolkit/Sugoi-14B-Ultra-GGUF` | GGUF, base_model as list | `base_model: ["sugoitoolkit/Sugoi-14B-Ultra-HF"]` → first entry extracted |
 | `Falconsai/medical_summarization` | T5 summarization, tokenizer max length | `model_type=t5`; `hf.tokenizer_max_length=512` captured |
 | `UBC-NLP/serengeti-E250` | No model card, 250 K-vocab Electra, tokenizer sentinel | Domains/languages only in `model_info.tags` → not captured; sentinel filtered |
-| `CohereLabs/aya-vision-8b` | Fully gated, license not captured | Card + config 401; `cc-by-nc-4.0` only in `model_info` object |
+| `CohereLabs/aya-vision-8b` | Fully gated, license not captured | Card + config 401; `CC-BY-NC-4.0` only in `model_info` object |
 | `lelapa/InkubaLM-0.4B` | Fully gated, dataset captured via tag fallback | Card + config 401; `dataset:lelapa/Inkuba-Mono` captured from `model_info` tags |
 | `nvidia/GR00T-N1.7-3B` | Robotics domain, no license | Humanoid robot foundation model; `pipeline_tag=robotics`; `license=None` |
 | `lerobot/pi05_base` | Robotics, lerobot library, custom license, no config | `license=gemma`; `library_name=lerobot`; no config.json; Pi0.5 policy |
-| `Salesforce/moirai-2.0-R-small` | `time-series-forecasting` domain, custom config keys | New `_DOMAIN_TAGS` entry; config keys (`d_model`, `patch_sizes`) not in `_HYPER_KEYS` → empty hyperparameters; cc-by-nc-4.0 |
+| `Salesforce/moirai-2.0-R-small` | `time-series-forecasting` domain, custom config keys | New `_DOMAIN_TAGS` entry; config keys (`d_model`, `patch_sizes`) not in `_HYPER_KEYS` → empty hyperparameters; `CC-BY-NC-4.0` |
 | `stanfordnlp/stanza-fi` | stanza library, no config, Finnish, empty domains | `library_name=stanza`; no pipeline_tag → empty `usage.domains`; language=`["fi"]` |
 | `stanfordnlp/stanza-de` | stanza library, no config, German | Same pattern as stanza-fi; language=`["de"]`; no config.json |
 | `SAP/sap-rpt-1-oss` | tabular-classification, gated config, self-referential `library_name` | Uses `.pt` files; `library_name=sap-rpt-1-oss` (same as model slug); `arxiv:2506.10707` |
diff --git a/tests/test_extract_huggingface.py b/tests/test_extract_huggingface.py
index d478f3a..071bb90 100644
--- a/tests/test_extract_huggingface.py
+++ b/tests/test_extract_huggingface.py
@@ -258,7 +258,7 @@ def test_read_huggingface_hyperparameters_include_vocab_size() -> None:
 def test_read_huggingface_license_from_card() -> None:
     with _patch_hf_calls():
         meta = read_huggingface("mistralai/Mistral-7B-v0.1")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_read_huggingface_domain_from_pipeline_tag_via_usage() -> None:
@@ -573,7 +573,7 @@ def test_license_from_file_when_card_says_other() -> None:
 
 
 def test_vague_license_raw_not_stored_when_card_has_real_spdx_id() -> None:
-    # A proper SPDX ID in the card YAML should NOT create hf.license_raw.
+    # A proper SPDX License ID in the card YAML should NOT create hf.license_raw.
     with _patch_hf_calls():  # uses apache-2.0 card data
         meta = read_huggingface("mistralai/Mistral-7B-v0.1")
     assert "hf.license_raw" not in meta.extra_data
@@ -694,7 +694,7 @@ def test_kokoro_name() -> None:
 def test_kokoro_license() -> None:
     with _patch_kokoro():
         meta = read_huggingface("hexgrad/Kokoro-82M")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_kokoro_tts_domain() -> None:
@@ -960,7 +960,7 @@ def test_kimi_multimodal_domain() -> None:
 # ---------------------------------------------------------------------------
 
 _GEMMA_CARD_DATA = _make_card_data(
-    license="gemma",  # Non-standard but passes SPDX ID regex -> not vague
+    license="gemma",  # Non-standard but passes SPDX License ID regex -> not vague
     pipeline_tag=None,
     tags=None,
     language=None,
@@ -1027,7 +1027,9 @@ def _patch_llama() -> Any:
 
 
 def test_llama_custom_license_used_directly() -> None:
-    # "llama3.2" matches SPDX ID regex - treated as a license identifier
+    # "llama3.2" is not in _VAGUE_LICENSE_VALUES, so it is taken from the card.
+    # Not recognized by licenseid matcher → _canonicalize_license_id returns
+    # it unchanged.
     with _patch_llama():
         meta = read_huggingface("meta-llama/Llama-3.2-1B")
     assert meta.license == "llama3.2"
@@ -1098,7 +1100,7 @@ def test_deepseek_architecture() -> None:
 def test_deepseek_mit_license() -> None:
     with _patch_deepseek():
         meta = read_huggingface("deepseek-ai/DeepSeek-R1")
-    assert meta.license == "mit"
+    assert meta.license == "MIT"
 
 
 def test_deepseek_no_domain_when_no_pipeline_tag() -> None:
@@ -1295,7 +1297,7 @@ def test_typhoon_thai_language() -> None:
 def test_typhoon_license() -> None:
     with _patch_typhoon():
         meta = read_huggingface("typhoon-ai/typhoon-7b")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_typhoon_grouped_query_attention_hyperparameter() -> None:
@@ -2756,7 +2758,7 @@ def _patch_uni2() -> Any:
 def test_uni2_nc_nd_license() -> None:
     with _patch_uni2():
         meta = read_huggingface("MahmoodLab/UNI2-h")
-    assert meta.license == "cc-by-nc-nd-4.0"
+    assert meta.license == "CC-BY-NC-ND-4.0"
 
 
 def test_uni2_pathology_tags_in_extra_lists() -> None:
@@ -3577,7 +3579,7 @@ def test_deplot_arxiv() -> None:
 
 
 # Salesforce/blip-vqa-base
-# BLIP for VQA; bsd-3-clause license (non-SPDX passthrough).
+# BLIP for VQA; bsd-3-clause license (normalized to BSD-3-Clause).
 def _patch_blip_vqa() -> Any:
     return _patch_hf_calls(
         config={
@@ -3604,11 +3606,12 @@ def test_blip_vqa_domain() -> None:
     assert "visual-question-answering" in meta.usage.domains
 
 
-def test_blip_vqa_bsd_license_passthrough() -> None:
-    # bsd-3-clause not in _VAGUE_LICENSE_VALUES -- passed through as-is.
+def test_blip_vqa_bsd_license_normalized() -> None:
+    # bsd-3-clause not in _VAGUE_LICENSE_VALUES; _canonicalize_license_id maps
+    # it to the canonical SPDX License ID BSD-3-Clause via licenseid matcher.
     with _patch_blip_vqa():
         meta = read_huggingface("Salesforce/blip-vqa-base")
-    assert meta.license == "bsd-3-clause"
+    assert meta.license == "BSD-3-Clause"
 
 
 # ---------------------------------------------------------------------------
@@ -3762,7 +3765,7 @@ def test_seamless_audio_to_audio_tag_in_domain() -> None:
 def test_seamless_nc_license() -> None:
     with _patch_seamless_m4t():
         meta = read_huggingface("facebook/seamless-m4t-v2-large")
-    assert meta.license == "cc-by-nc-4.0"
+    assert meta.license == "CC-BY-NC-4.0"
 
 
 # ---------------------------------------------------------------------------
@@ -4609,7 +4612,7 @@ def test_phi2_phi_architecture_mit_license() -> None:
     with _patch_phi2():
         meta = read_huggingface("microsoft/phi-2")
     assert meta.type_of_model == "phi"
-    assert meta.license == "mit"
+    assert meta.license == "MIT"
 
 
 def test_phi2_code_tag_in_domain() -> None:
@@ -4927,7 +4930,7 @@ def _patch_legal_embed_ita() -> Any:
 def test_legal_embed_ita_nc_license_italian() -> None:
     with _patch_legal_embed_ita():
         meta = read_huggingface("ReDiX/Legal-Embedding-ita-0.6B")
-    assert meta.license == "cc-by-nc-4.0"
+    assert meta.license == "CC-BY-NC-4.0"
     assert meta.extra_lists.get("hf.language") == ["it"]
     assert "sentence-similarity" in meta.usage.domains
 
@@ -5170,7 +5173,8 @@ def test_qwen3_235b_moe_architecture() -> None:
 
 
 def test_qwen3_235b_qwen_license_passthrough() -> None:
-    # "qwen" not in _VAGUE_LICENSE_VALUES → stored as-is
+    # "qwen" not in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher →
+    # _canonicalize_license_id returns it unchanged
     with _patch_qwen3_235b():
         meta = read_huggingface("Qwen/Qwen3-235B-A22B")
     assert meta.license == "qwen"
@@ -5243,7 +5247,7 @@ def test_qwen35_27b_architecture() -> None:
 def test_qwen35_27b_apache_license() -> None:
     with _patch_qwen35_27b():
         meta = read_huggingface("Qwen/Qwen3.5-27B")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_qwen35_27b_gqa() -> None:
@@ -5265,7 +5269,8 @@ def test_qwen35_27b_text_generation_domain() -> None:
 # ---------------------------------------------------------------------------
 
 # Korean multimodal VLM from Kakao Bank. Custom "kanana-license" identifier is
-# NOT in _VAGUE_LICENSE_VALUES → stored as-is (passthrough).
+# NOT in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher →
+# _canonicalize_license_id returns it unchanged.
 # image-text-to-text pipeline tag.
 
 _KANANA_15V_CONFIG: dict[str, Any] = {
@@ -5310,7 +5315,9 @@ def test_kanana_15v_architecture() -> None:
 
 
 def test_kanana_15v_license_passthrough() -> None:
-    # "kanana-license" is not in _VAGUE_LICENSE_VALUES → stored as-is, no detection
+    # "kanana-license" not in _VAGUE_LICENSE_VALUES; not recognized by
+    # licenseid matcher → _canonicalize_license_id returns it unchanged;
+    # no file detection triggered
     with _patch_kanana_15v():
         meta = read_huggingface("kakaobank/kanana-1.5-v-3b-instruct")
     assert meta.license == "kanana-license"
@@ -5712,7 +5719,7 @@ def test_glm45_air_reap_architecture() -> None:
 def test_glm45_air_reap_apache_license() -> None:
     with _patch_glm45_air_reap():
         meta = read_huggingface("THUDM/GLM-4.5-Air-REAP")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_glm45_air_reap_merge_relation() -> None:
@@ -5790,7 +5797,7 @@ def test_line_distilbert_fill_mask_domain() -> None:
 def test_line_distilbert_apache_license() -> None:
     with _patch_line_distilbert():
         meta = read_huggingface("line-corporation/line-distilbert-base-japanese")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 # ---------------------------------------------------------------------------
@@ -5847,7 +5854,7 @@ def test_clip_japanese_v2_feature_extraction_domain() -> None:
 def test_clip_japanese_v2_apache_license() -> None:
     with _patch_clip_japanese_v2():
         meta = read_huggingface("line-corporation/clip-japanese-base-v2")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_clip_japanese_v2_hidden_size() -> None:
@@ -5898,7 +5905,7 @@ def test_fujitsu_llm_nemo_library_name() -> None:
 def test_fujitsu_llm_apache_license() -> None:
     with _patch_fujitsu_llm():
         meta = read_huggingface("Fujitsu/Fujitsu-LLM-KG-8x7B")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_fujitsu_llm_text_generation_domain() -> None:
@@ -5955,7 +5962,7 @@ def test_windowseat_image_to_image_domain() -> None:
 def test_windowseat_apache_license() -> None:
     with _patch_windowseat():
         meta = read_huggingface("windowseat-ai/windowseat-reflection")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 # ---------------------------------------------------------------------------
@@ -6023,7 +6030,7 @@ def test_moirai_time_series_forecasting_domain() -> None:
 def test_moirai_cc_by_nc_license() -> None:
     with _patch_moirai():
         meta = read_huggingface("Salesforce/moirai-2.0-R-small")
-    assert meta.license == "cc-by-nc-4.0"
+    assert meta.license == "CC-BY-NC-4.0"
 
 
 # ---------------------------------------------------------------------------
@@ -6091,7 +6098,7 @@ def test_llasa_3b_text_to_speech_domain() -> None:
 def test_llasa_3b_cc_by_nc_license() -> None:
     with _patch_llasa_3b():
         meta = read_huggingface("HKUSTAudio/Llasa-3B")
-    assert meta.license == "cc-by-nc-4.0"
+    assert meta.license == "CC-BY-NC-4.0"
 
 
 # ---------------------------------------------------------------------------
@@ -6310,7 +6317,7 @@ def test_tildeopen_30b_64k_seven_datasets() -> None:
 def test_tildeopen_30b_64k_cc_by_license() -> None:
     with _patch_tildeopen_30b_64k():
         meta = read_huggingface("TildeAI/TildeOpen-30b-64k")
-    assert meta.license == "cc-by-4.0"
+    assert meta.license == "CC-BY-4.0"
 
 
 # ---------------------------------------------------------------------------
@@ -6547,7 +6554,8 @@ def test_openeurollm_three_datasets() -> None:
 #  • No max_position_embeddings (uses ALiBi positional bias, not RoPE)
 #  • seq_length (added to _HYPER_KEYS) is absent in the 176B config
 #  • bigscience-bloom-rail-1.0 is a custom HF identifier NOT in
-#    _VAGUE_LICENSE_VALUES → stored as-is (passthrough)
+#    _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher →
+#    _canonicalize_license_id returns it unchanged
 #  • 59 languages (46 natural + 13 programming languages)
 
 _BLOOM_CONFIG: dict[str, Any] = {
@@ -6641,7 +6649,9 @@ def test_bloom_architecture() -> None:
 
 
 def test_bloom_custom_license_passthrough() -> None:
-    # "bigscience-bloom-rail-1.0" not in _VAGUE_LICENSE_VALUES → stored as-is
+    # "bigscience-bloom-rail-1.0" is not in _VAGUE_LICENSE_VALUES and is not
+    # recognized by the licenseid matcher → _canonicalize_license_id returns
+    # it unchanged.
     with _patch_bloom():
         meta = read_huggingface("bigscience/bloom")
     assert meta.license == "bigscience-bloom-rail-1.0"
@@ -7285,7 +7295,7 @@ def test_shap_e_text_to_3d_domain() -> None:
 def test_shap_e_mit_license() -> None:
     with _patch_shap_e():
         meta = read_huggingface("openai/shap-e")
-    assert meta.license == "mit"
+    assert meta.license == "MIT"
 
 
 def test_shap_e_no_architecture() -> None:
@@ -7468,7 +7478,8 @@ def test_hy_motion_library_name() -> None:
 
 # Apple Sharp generates 3-D from a single 2-D image. pipeline_tag=image-to-3d
 # (new _DOMAIN_TAGS entry). library_name=ml-sharp (Apple's custom library).
-# license=apple-amlr → passthrough (not in _VAGUE_LICENSE_VALUES). No config.
+# license=apple-amlr → not in _VAGUE_LICENSE_VALUES; not recognized by
+# licenseid matcher → _canonicalize_license_id returns it unchanged. No config.
 
 _APPLE_SHARP_CARD_DATA = _make_card_data(
     license="apple-amlr",
@@ -7495,7 +7506,8 @@ def test_apple_sharp_image_to_3d_domain() -> None:
 
 
 def test_apple_sharp_apple_amlr_license_passthrough() -> None:
-    # apple-amlr not in _VAGUE_LICENSE_VALUES → stored as-is
+    # apple-amlr not in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher →
+    # _canonicalize_license_id returns it unchanged
     with _patch_apple_sharp():
         meta = read_huggingface("apple/Sharp")
     assert meta.license == "apple-amlr"
@@ -7548,7 +7560,7 @@ def test_firered_vad_voice_activity_detection_domain() -> None:
 def test_firered_vad_apache_license() -> None:
     with _patch_firered_vad():
         meta = read_huggingface("FireRedTeam/FireRedVAD")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_firered_vad_no_architecture() -> None:
@@ -7610,7 +7622,7 @@ def test_gte_reranker_model_type_placeholder() -> None:
 def test_gte_reranker_apache_license() -> None:
     with _patch_gte_reranker():
         meta = read_huggingface("Alibaba-NLP/gte-multilingual-reranker-base")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 def test_gte_reranker_sentence_transformers_library() -> None:
@@ -7626,7 +7638,8 @@ def test_gte_reranker_sentence_transformers_library() -> None:
 # Apple OpenELM-270M uses a custom efficient architecture ("openelm").
 # Config has non-standard keys (activation_fn_name, ffn_dim_divisor) alongside
 # the standard head_dim (which IS in _HYPER_KEYS → captured).
-# license=apple-amlr → passthrough.
+# license=apple-amlr → not in _VAGUE_LICENSE_VALUES; not recognized by
+# licenseid matcher → _canonicalize_license_id returns it unchanged.
 
 _OPENELM_270M_CONFIG: dict[str, Any] = {
     "model_type": "openelm",
@@ -7810,7 +7823,7 @@ def test_llada2_moe_any_to_any_domain() -> None:
 def test_llada2_moe_apache_license() -> None:
     with _patch_llada2_moe():
         meta = read_huggingface("inclusionAI/LLaDA2.0-Uni")
-    assert meta.license == "apache-2.0"
+    assert meta.license == "Apache-2.0"
 
 
 # ---------------------------------------------------------------------------
diff --git a/tests/test_license.py b/tests/test_license.py
index 6976985..5df373f 100644
--- a/tests/test_license.py
+++ b/tests/test_license.py
@@ -239,11 +239,14 @@ def test_collect_candidates_empty_dir() -> None:
 
 @pytest.fixture(name="licenseid_db_path")
 def licenseid_db_path_fixture() -> Path:
-    """Return the path to the licenseid database, skipping if not built.
+    """Return the licenseid database path, skipping the test if it is not built.
 
     Build with: ``licenseid update``
     """
-    db = Path.home() / ".local" / "share" / "licenseid" / "licenses.db"
+    # pylint: disable=import-outside-toplevel
+    from licenseid.database import get_default_db_path
+
+    db = Path(get_default_db_path())
     if not db.exists():
         pytest.skip("licenseid database not built -- run 'licenseid update'")
     return db
@@ -254,31 +257,16 @@ def licenseid_db_path_fixture() -> Path:
 # ---------------------------------------------------------------------------
 
 
-def test_detect_license_from_text_db_missing(tmp_path: Path) -> None:
-    """Returns None gracefully when the licenseid database does not exist."""
+def test_detect_license_from_text_db_not_populated(tmp_path: Path) -> None:
+    """Returns None gracefully when the licenseid database is not populated."""
     with patch(
-        "pitloom.extract._license._get_licenseid_db_path",
-        return_value=tmp_path / "nonexistent.db",
+        "licenseid.matcher.get_default_db_path",
+        return_value=str(tmp_path / "empty.db"),
     ):
         result = detect_license_from_text("MIT License\n\nPermission is hereby granted")
         assert result is None
 
 
-def test_detect_license_from_text_library_not_installed(tmp_path: Path) -> None:
-    """Returns None gracefully when the licenseid library is not installed."""
-    fake_db = tmp_path / "licenses.db"
-    fake_db.touch()
-    with (
-        patch(
-            "pitloom.extract._license._get_licenseid_db_path",
-            return_value=fake_db,
-        ),
-        patch.dict("sys.modules", {"licenseid": None}),
-    ):
-        result = detect_license_from_text("MIT License")
-        assert result is None
-
-
 # ---------------------------------------------------------------------------
 # detect_license_for_project -- mocked detection
 # ---------------------------------------------------------------------------
@@ -402,14 +390,10 @@ def test_detect_project_hint_text_detection_fails_returns_hint() -> None:
 """
 
 
-def test_detect_license_from_text_returns_spdx_id(licenseid_db_path: Path) -> None:
+def test_detect_license_from_text_returns_spdx_id() -> None:
     """Detection with a real DB returns a valid SPDX License ID string
     (not None or raw text)."""
-    with patch(
-        "pitloom.extract._license._get_licenseid_db_path",
-        return_value=licenseid_db_path,
-    ):
-        result = detect_license_from_text(_MIT_TEXT)
+    result = detect_license_from_text(_MIT_TEXT)
     # Result may be None if score is below threshold; when not None it must
     # look like an SPDX License ID (no newlines, alphanumeric with dashes/dots)
     if result is not None:
@@ -418,17 +402,13 @@ def test_detect_license_from_text_returns_spdx_id(licenseid_db_path: Path) -> No
         )
 
 
-def test_detect_project_from_license_file_integration(licenseid_db_path: Path) -> None:
+def test_detect_project_from_license_file_integration() -> None:
     """End-to-end: LICENSE file text is processed;
     result is None or a valid SPDX License ID."""
     with tempfile.TemporaryDirectory() as d:
         p = Path(d)
         (p / "LICENSE").write_text(_MIT_TEXT)
-        with patch(
-            "pitloom.extract._license._get_licenseid_db_path",
-            return_value=licenseid_db_path,
-        ):
-            result_id, prov = detect_license_for_project(p)
+        result_id, prov = detect_license_for_project(p)
     if result_id is not None:
         assert _looks_like_spdx_license_id(result_id), (
             f"Expected SPDX License ID, got: {result_id!r}"