diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 6092c40..0dc6afd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,6 +32,9 @@ jobs: python -m pip install --upgrade pip pip install ".[aimodel,huggingface]" --group test + - name: Build licenseid database + run: licenseid update + - name: Run tests run: | pytest tests/ -v --tb=short diff --git a/docs/implementation/license-pipeline.md b/docs/implementation/license-pipeline.md index cf213d8..edf5ea7 100644 --- a/docs/implementation/license-pipeline.md +++ b/docs/implementation/license-pipeline.md @@ -134,7 +134,13 @@ relationships. 1. **Card YAML** -- reads `license:` from the model card frontmatter. If the value is not a vague sentinel (`other`, `custom`, `proprietary`, - `unknown`, `unlicensed`), it is accepted as-is and stored in + `unknown`, `unlicensed`), it is passed through `_canonicalize_license_id()`, + which calls `AggregatedLicenseMatcher.match(license_id=raw)` from the + `licenseid` library for a direct database lookup. Recognised SPDX + License IDs are returned in canonical casing (e.g. `"apache-2.0"` → + `"Apache-2.0"`). Values not recognised — proprietary or non-SPDX + identifiers such as `"gemma"`, `"llama3.2"`, or deprecated bare + copyleft forms — are returned verbatim. The result is stored in `AiModelMetadata.license`. 2. **File detection** -- when the card YAML value is absent or vague, `_detect_license_from_hf_files()` iterates through candidate files in @@ -148,15 +154,18 @@ relationships. ### `licenseid` dependency Text-based licence detection (`detect_license_from_text()` in -`_license.py`) relies on the optional `licenseid` package. When the -package is not installed or its database has not been built, detection -is silently skipped and the function returns `None`. To enable it: +`_license.py`) uses the `licenseid` package, which is a mandatory +pitloom dependency. 
The database must be built before detection is +possible: ```shell -pip install pitloom[license] licenseid update ``` +When the database has not been built, `detect_license_from_text()` +logs a warning and returns `None`; other licence sources (card YAML, +`CITATION.cff`, `codemeta.json`) are unaffected. + The database is stored at `~/.local/share/licenseid/licenses.db`. Detection uses cosine similarity against vectorised licence texts with a default threshold of 0.85. diff --git a/pyproject.toml b/pyproject.toml index 682a9b7..449d318 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ classifiers = [ ] dependencies = [ "hatchling>=1.28.0", + "licenseid>=0.2.3", "pyproject-metadata>=0.11.0", "rfc8785>=0.1.4", "spdx-python-model==0.0.4", @@ -63,7 +64,6 @@ aimodel = [ "safetensors[numpy]>=0.7.0", ] huggingface = ["huggingface_hub>=1.14.0"] -license = ["licenseid>=0.2.2"] fasttext = ["fasttext>=0.9.3"] # or fasttext-community>=0.11.7 gguf = ["gguf>=0.10.0"] @@ -124,6 +124,7 @@ exclude = [ "tests/fixtures/aimodels/safetensors/*.safetensors", "tests/fixtures/croissant/*.json", "tests/fixtures/fragments/*.json", + "tests/fixtures/huggingface-hub/*.txt", ] [tool.hatch.build.targets.wheel] diff --git a/src/pitloom/assemble/spdx3/deps.py b/src/pitloom/assemble/spdx3/deps.py index 2e81d0a..6230c82 100644 --- a/src/pitloom/assemble/spdx3/deps.py +++ b/src/pitloom/assemble/spdx3/deps.py @@ -111,12 +111,12 @@ def _enrich_from_installed( provenance_source = f"Source: installed metadata | Package: {dep_name}" # description - summary = pkg_meta["Summary"] or "" + summary = pkg_meta.get("Summary") or "" if summary and summary != "UNKNOWN": dep_package.description = summary # homePage -- core field first, then well-known Project-URL labels - home_page = pkg_meta["Home-page"] or "" + home_page = pkg_meta.get("Home-page") or "" if not home_page or home_page == "UNKNOWN": for label in _HOMEPAGE_LABELS: if label in project_urls: @@ -126,7 +126,7 @@ def 
def _canonicalize_license_id(raw: str) -> str:
    """Map a model-card license value onto its canonical SPDX License ID.

    This is a thin forwarding wrapper around
    :func:`pitloom.extract._license.canonicalize_license_id`; all lookup
    logic lives there.  Whatever that function returns for *raw* is
    returned here unchanged.

    Per the delegate's contract, a value recognised as an SPDX License ID
    comes back in canonical casing (e.g. ``"bsd-3-clause"`` →
    ``"BSD-3-Clause"``), while an unrecognised value -- or any value when
    the ``licenseid`` database has not been populated with
    ``licenseid update`` -- comes back verbatim.  No ``LicenseRef-``
    prefix is added here; that decision is left to ``licenseid`` or to
    downstream SBOM tooling.
    """
    # The import is deferred to call time rather than done at module level.
    # pylint: disable=import-outside-toplevel
    from pitloom.extract import _license as license_utils

    return license_utils.canonicalize_license_id(raw)
def detect_license_from_text(text: str, threshold: float = 0.85) -> str | None:
    """Detect SPDX License ID from *text* using the licenseid library.

    *text* is matched with ``AggregatedLicenseMatcher.match()``; the first
    returned result whose ``"score"`` is at least *threshold* wins and its
    ``"license_id"`` is returned as a string.

    Returns ``None`` when:

    * the database appears empty (a probe lookup of ``"MIT"`` returns
      nothing) -- a warning is logged;
    * no result reaches *threshold*;
    * the matcher raises for any reason -- the error is logged at debug
      level, so detection stays best-effort and never propagates.

    The database must be built before detection is possible::

        licenseid update
    """
    try:
        matcher = AggregatedLicenseMatcher()
        # Probe with a well-known license ID to confirm the database is populated.
        if not matcher.match(license_id="MIT"):
            _logger.warning(
                "licenseid database appears empty -- "
                "run 'licenseid update' to enable license text detection"
            )
            return None
        results = matcher.match(text)
        # Result order is preserved; presumably the matcher ranks best-first.
        filtered = [r for r in results if r["score"] >= threshold]
        return str(filtered[0]["license_id"]) if filtered else None
    except Exception as exc:  # pylint: disable=broad-exception-caught
        _logger.debug("licenseid detection failed: %s", exc)
        return None


def canonicalize_license_id(raw: str) -> str:
    """Return the canonical SPDX License ID for *raw*, or *raw* unchanged.

    Uses ``AggregatedLicenseMatcher.match(license_id=raw)`` for a direct
    database lookup.  Returns the canonical casing when *raw* is a recognised
    SPDX License ID (e.g. ``"bsd-3-clause"`` → ``"BSD-3-Clause"``).

    For unrecognised values — non-SPDX identifiers, deprecated bare
    copyleft forms (``"agpl-3.0"``, ``"gpl-3.0"``), or vendor-specific
    strings (``"gemma"``, ``"llama3.2"``) — the original string is
    returned verbatim.  pitloom records what it found and leaves further
    interpretation (e.g. deciding whether to add a ``LicenseRef-`` prefix)
    to the ``licenseid`` library or downstream SBOM tooling.

    Requires a populated database (``licenseid update``).  When the database
    is not populated, or the lookup raises, *raw* is returned unchanged; a
    lookup failure is logged at debug level rather than silently discarded.
    """
    try:
        results = AggregatedLicenseMatcher().match(license_id=raw)
        if results:
            return str(results[0]["license_id"])
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Best-effort: canonicalization must never break extraction, but the
        # failure is recorded so a broken database or matcher is diagnosable.
        _logger.debug("licenseid canonicalization failed for %r: %s", raw, exc)
    return raw
diff --git a/tests/fixtures/huggingface-hub/README.md b/tests/fixtures/huggingface-hub/README.md index 7df0fbb..21499c6 100644 --- a/tests/fixtures/huggingface-hub/README.md +++ b/tests/fixtures/huggingface-hub/README.md @@ -224,32 +224,79 @@ The `hf.language` extractor passes non-standard values through unchanged | `"code"` | Programming-language content | `huggingface/CodeBERTa-small-v1` | | `False` (bool) | ISO code `"no"` parsed by YAML 1.1 | `openai/whisper-large-v3` — filtered out | -### 8. License handling: passthrough, vague, and no-license patterns +### 8. License handling: sources, canonicalization, vague values, and no-license patterns + +pitloom's role is to **find license information** from available sources. +Once found, it delegates identification to the `licenseid` library. + +#### License sources (priority order) + +1. **Model card YAML `license` field** — the primary source; a license + identifier string supplied by the model author. +2. **License files** (`LICENSE`, `COPYING`, etc.) — full license text; + used when the card value is vague (see below). Detection performed by + `_detect_license_from_hf_files()` via `licenseid`. + +#### Canonicalization: `_canonicalize_license_id()` + +HF Hub stores license identifiers in **lowercase** (e.g. `apache-2.0`, +`bsd-3-clause`), while SPDX requires specific mixed casing. +`_canonicalize_license_id()` delegates to `canonicalize_license_id()` in +`pitloom.extract._license`, which calls `AggregatedLicenseMatcher.match()` +from the `licenseid` library. The matcher's Tier-0 short-text path performs +fuzzy name/ID matching and returns the canonical casing when it finds an +exact match (internal score > 1.0); otherwise the value is returned +**verbatim**. + +pitloom does **not** add `LicenseRef-` prefixes or otherwise interpret +unrecognized values — that is the responsibility of `licenseid` or downstream +SBOM tooling. 
+ +| HF card value | `meta.license` recorded | Source | +| :--- | :--- | :--- | +| `apache-2.0` | `Apache-2.0` | recognized by licenseid — canonical SPDX License ID | +| `mit` | `MIT` | recognized by licenseid | +| `bsd-3-clause` | `BSD-3-Clause` | recognized by licenseid | +| `cc-by-4.0` | `CC-BY-4.0` | recognized by licenseid | +| `cc-by-nc-4.0` | `CC-BY-NC-4.0` | recognized by licenseid | +| `cc-by-nc-nd-4.0` | `CC-BY-NC-ND-4.0` | recognized by licenseid | +| `cc-by-sa-4.0` | `CC-BY-SA-4.0` | recognized by licenseid | +| `gemma` | `gemma` | not recognized by licenseid — recorded verbatim | +| `llama3.2` | `llama3.2` | not recognized by licenseid — recorded verbatim | +| `llama3` | `llama3` | not recognized by licenseid — recorded verbatim | +| `apple-amlr` | `apple-amlr` | not recognized by licenseid — recorded verbatim | +| `bigscience-bloom-rail-1.0` | `bigscience-bloom-rail-1.0` | not recognized by licenseid — recorded verbatim | +| `bigcode-openrail-m` | `bigcode-openrail-m` | not recognized by licenseid — recorded verbatim | +| `openrail++` | `openrail++` | not recognized by licenseid — recorded verbatim | +| `agpl-3.0` | `agpl-3.0` | deprecated SPDX License ID, not recognized by licenseid — recorded verbatim | +| `gpl-3.0` | `gpl-3.0` | deprecated SPDX License ID, not recognized by licenseid — recorded verbatim | + +Deprecated bare copyleft forms (`agpl-3.0`, `gpl-3.0`, `lgpl-2.1`, etc.) are +not recognized by `licenseid` (SPDX replaced them with `-only` / +`-or-later` variants requiring explicit intent). They are recorded verbatim; +downstream tooling can decide whether to map them to `-only`, `-or-later`, or +add a `LicenseRef-` prefix. + +Requires `licenseid update` to populate the database. When the database is +unavailable all card values are recorded verbatim (no canonicalization). + +#### Vague license values `_VAGUE_LICENSE_VALUES` = `{"other", "custom", "proprietary", "unknown", "unlicensed"}`. 
-Anything outside this set is stored as-is in `meta.license`. This includes -non-SPDX HuggingFace custom identifiers: +When the card YAML `license` field matches one of these, the extractor: -| Custom identifier | Example model(s) | -| :--- | :--- | -| `gemma` | `google/gemma-2b`, `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF`, `lerobot/pi05_base`, `bakrianoo/arabic-legal-documents-ocr-1.0` | -| `llama3.2` | `meta-llama/Llama-3.2-1B`, `meta-llama/Llama-3.2-3B`, `meta-llama/Llama-3.2-3B-Instruct` | -| `llama3` | `NousResearch/Hermes-3-Llama-3.2-3B` | -| `apple-amlr` | `apple/DepthPro-hf`, `apple/OpenELM-270M`, `apple/Sharp` | -| `kanana-license` | `kakaobank/kanana-1.5-v-3b-instruct` | -| `bigscience-bloom-rail-1.0` | `bigscience/bloom`, `bigscience/bloomz-7b1` | -| `bigcode-openrail-m` | `bigcode/starcoder2-3b` | -| `openrail++` | `tum-nlp/Deberta_Human_Value_Detector` | -| `bsd-3-clause` | `Salesforce/blip-vqa-base` | -| `cc-by-nc-4.0` | `geolocal/StreetCLIP`, `MahmoodLab/UNI2-h`, `facebook/seamless-m4t-v2-large`, `HKUSTAudio/Llasa-3B` | -| `cc-by-4.0` | `pyannote/speaker-diarization-community-1`, `TildeAI/TildeOpen-30b-64k`, `TildeAI/TildeOpen-30b` | -| `cc-by-sa-4.0` | `sonoisa/sentence-bert-base-ja-mean-tokens`, `pythainlp/wangchanglm-7.5B-sft-enth` | -| `cc-by-nc-nd-4.0` | `MahmoodLab/UNI2-h` | -| `qwen` | `openthaigpt/openthaigpt-r1-32b-instruct` (via `license_name`) | -| `llava2` | `llava-hf/LLaVA-NeXT-Video-7B-hf` | - -**The `license_name` secondary field** appears when the primary `license` field -is vague or unrecognised. It is stored in `extra_data["hf.license_name"]`: +1. Saves the raw value in `extra_data["hf.license_raw"]` +2. Calls `_detect_license_from_hf_files()` — downloads license files and uses + `licenseid` text detection to identify the SPDX License ID + +When the `license` field is absent entirely, `meta.license` is `None` and +`hf.license_raw` is not set (distinct from the vague-value path). 
+ +#### `license_name` secondary field + +Some model cards include a `license_name` key alongside the primary `license` +field — typically a human-readable or more specific name. pitloom records it +verbatim in `extra_data["hf.license_name"]` without canonicalization. | `license_name` value | Example model | Notes | | :--- | :--- | :--- | @@ -257,14 +304,7 @@ is vague or unrecognised. It is stored in `extra_data["hf.license_name"]`: | `sai-nc-community` | `stabilityai/stable-zero123` | Stability AI non-commercial | | `tencent-hunyuan-community` | `tencent/HY-Motion-1.0` | Tencent Hunyuan community | | `open-aleph-license` | `Aleph-Alpha/Pharia-1-LLM-7B-control` | Aleph Alpha open licence | -| `bsd-3-clause` | `TencentARC/TimeLens-8B` | SPDX ID used as `license_name` when HF has no SPDX slot | - -When the card YAML contains a vague `license` value, the raw string is saved in -`extra_data["hf.license_raw"]` and `_detect_license_from_hf_files` is called -to look for a real SPDX ID in license files (`LICENSE`, `COPYING`, etc.). - -When the `license` field is absent entirely, `meta.license` is `None` and -`hf.license_raw` is not set (distinct from the vague-value path). +| `bsd-3-clause` | `TencentARC/TimeLens-8B` | SPDX License ID used as secondary name | ### 9. BLOOM architecture: non-standard config key names (known gap) @@ -618,50 +658,50 @@ come from `config.json`. 
| Model ID | Pattern | Notable | | :--- | :--- | :--- | -| `mistralai/Mistral-7B-v0.1` | Baseline transformer | Standard LLM: GQA, apache-2.0, `text-generation` pipeline | -| `Qwen/Qwen3-235B-A22B` | MoE, `qwen` license passthrough, generation config | `qwen3_moe` arch; `Qwen3MoeForCausalLM`; thinking-mode temperature+top_p | -| `Qwen/Qwen3.5-27B` | Dense Qwen3.5, GQA (8 KV heads), apache-2.0 | `qwen3` arch; `Qwen3ForCausalLM`; 40 attention heads / 8 KV heads | +| `mistralai/Mistral-7B-v0.1` | Baseline transformer | Standard LLM: GQA, `Apache-2.0`, `text-generation` pipeline | +| `Qwen/Qwen3-235B-A22B` | MoE, `qwen` license (verbatim), generation config | `qwen3_moe` arch; `Qwen3MoeForCausalLM`; thinking-mode temperature+top_p | +| `Qwen/Qwen3.5-27B` | Dense Qwen3.5, GQA (8 KV heads), `Apache-2.0` | `qwen3` arch; `Qwen3ForCausalLM`; 40 attention heads / 8 KV heads | | `openthaigpt/openthaigpt-r1-32b-instruct` | Vague license + file detection | `license=other`; `license_name=qwen` secondary field; Thai | | `hexgrad/Kokoro-82M` | No `model_type` / `architectures` | Custom config schema → `type_of_model=None`, `architecture=None` | | `moonshotai/Kimi-K2.6` | Vague license + file detection | `license=other` → `hf.license_raw`; `_detect_license_from_hf_files` triggered | -| `google/gemma-2b` | Gated config, custom license | 401 on config.json; `gemma` license in card | -| `meta-llama/Llama-3.2-1B` | Gated config, custom license, 8 languages | `llama3.2` license; config inaccessible → no arch | -| `meta-llama/Llama-3.2-3B` | Gated base, no architecture | Config gated → `type_of_model=None`; llama3.2 license | +| `google/gemma-2b` | Gated config, custom license | 401 on config.json; `gemma` recorded verbatim | +| `meta-llama/Llama-3.2-1B` | Gated config, custom license, 8 languages | `llama3.2` recorded verbatim; config inaccessible → no arch | +| `meta-llama/Llama-3.2-3B` | Gated base, no architecture | Config gated → `type_of_model=None`; `llama3.2` recorded verbatim | 
| `meta-llama/Llama-3.2-3B-Instruct` | Gated instruct, base_model finetune | Config gated; `base_model_relation=finetune` from 3B base | -| `NousResearch/Hermes-3-Llama-3.2-3B` | Not gated, llama3 license, finetune | `LlamaForCausalLM`; `llama3` license; finetune from Llama-3.2-3B | +| `NousResearch/Hermes-3-Llama-3.2-3B` | Not gated, llama3 license, finetune | `LlamaForCausalLM`; `llama3` recorded verbatim; finetune from Llama-3.2-3B | | `deepseek-ai/DeepSeek-R1` | MIT license, no pipeline_tag, MoE | Empty `usage.domains`; standard SPDX MIT | -| `bigcode/starcoder2-3b` | `"code"` tag → domain, dataset ref | `code` → `usage.domains` not `extra_lists["hf.tags"]`; training dataset | +| `bigcode/starcoder2-3b` | `"code"` tag → domain, dataset ref, custom license | `code` → `usage.domains`; `bigcode-openrail-m` recorded verbatim | | `SeaLLMs/SeaLLMs-v3-7B-Chat` | Vague license, 12 SEA/Asian languages | `license=other`; no pipeline_tag; qwen2 base | -| `typhoon-ai/typhoon-7b` | Thai-only, GQA | `["th"]`; `num_key_value_heads=8`; apache-2.0 | -| `iapp/chinda-qwen3-4b` | Base_model finetune, DOI | Thai LLM; Qwen3-4B base; `doi:10.57967/hf/5709`; apache-2.0 | +| `typhoon-ai/typhoon-7b` | Thai-only, GQA | `["th"]`; `num_key_value_heads=8`; Apache-2.0 | +| `iapp/chinda-qwen3-4b` | Base_model finetune, DOI | Thai LLM; Qwen3-4B base; `doi:10.57967/hf/5709`; Apache-2.0 | | `iapp/chinda-qwen3-4b-gguf` | GGUF-only, base_model quantized, scalar base_model | `base_model` as scalar string in card YAML; no config.json | | `talkie-lm/talkie-1930-13b-it` | No config.json, finetune, no domain | No pipeline_tag → empty `usage.domains` | -| `pythainlp/wangchanglm-7.5B-sft-enth` | Multi-dataset, tokenizer sentinel | 3 datasets; `model_max_length` sentinel filtered; cc-by-sa-4.0 | +| `pythainlp/wangchanglm-7.5B-sft-enth` | Multi-dataset, tokenizer sentinel | 3 datasets; `model_max_length` sentinel filtered; `CC-BY-SA-4.0` | | `mesolitica/mallam-1.1B-4096` | No license, Malay only | 
`license=None`; `language=["ms"]`; mistral base | -| `llm-jp/llm-jp-3-1.8b` | Large JP vocab LLaMA | 99 584-token vocab; apache-2.0; Japanese+English | +| `llm-jp/llm-jp-3-1.8b` | Large JP vocab LLaMA | 99 584-token vocab; Apache-2.0; Japanese+English | | `mistralai/Mistral-Medium-3.5-128B` | 22 languages, vague license, no pipeline_tag | `usage.domains==[]`; `license=other` | | `poolside/Laguna-XS.2` | Custom `model_type` and architecture | `model_type=laguna`; `LagunaForCausalLM`; custom tags preserved | | `abeja/gpt-neox-japanese-2.7b` | Language scalar, multi-dataset | `language: ja` scalar → `["ja"]`; cc100+wikipedia datasets | -| `ibm-granite/granite-4.1-8b` | GQA (8 KV heads), 12 languages, finetune | granite arch; finetune from granite-4.1-8b-base; apache-2.0 | +| `ibm-granite/granite-4.1-8b` | GQA (8 KV heads), 12 languages, finetune | granite arch; finetune from granite-4.1-8b-base; Apache-2.0 | | `Crownelius/Crow-9B-HERETIC-4.6` | `base_model_relation=merge`, 26 languages | Qwen3.5; merged/distilled from Claude | | `SamsungSAILMontreal/Qwen3-Coder-Next-REAP` | `base_model_relation=merge`, MoE | Qwen3-Next 80B→60B expert pruning; merge relation | | `facebook/opt-2.7b` | Vague license (other), OPT arch | `opt` arch; Meta non-commercial → `hf.license_raw=other` | | `facebook/opt-iml-max-1.3b` | Vague license, arxiv, instruction-tuned OPT | `arxiv:2212.12017`; instruction-tuned on ~2000 NLP tasks | -| `EleutherAI/gpt-neo-2.7B` | gpt_neo arch, standard SPDX license | `GPTNeoForCausalLM`; 32 layers; apache-2.0 | -| `stabilityai/stablelm-2-zephyr-1_6b` | stablelm_epoch arch, 12 languages | `StableLMEpochForCausalLM`; 100 352-token vocab; apache-2.0 | -| `TinyLlama/TinyLlama-1.1B-Chat-v1.0` | Shallow LLaMA (22 layers) | Shallower than standard 7B (32 layers); apache-2.0 | +| `EleutherAI/gpt-neo-2.7B` | gpt_neo arch, standard SPDX license | `GPTNeoForCausalLM`; 32 layers; Apache-2.0 | +| `stabilityai/stablelm-2-zephyr-1_6b` | stablelm_epoch arch, 12 languages | 
`StableLMEpochForCausalLM`; 100 352-token vocab; Apache-2.0 | +| `TinyLlama/TinyLlama-1.1B-Chat-v1.0` | Shallow LLaMA (22 layers) | Shallower than standard 7B (32 layers); Apache-2.0 | | `microsoft/phi-2` | phi arch, `"code"` tag → domain | `code` in `_DOMAIN_TAGS`; MIT; `code` not in `hf.tags` | -| `tokyotech-llm/Qwen3-Swallow-8B-SFT-v0.2` | Qwen3 finetune, Japanese+English | SFT from CPT stage; apache-2.0 | +| `tokyotech-llm/Qwen3-Swallow-8B-SFT-v0.2` | Qwen3 finetune, Japanese+English | SFT from CPT stage; Apache-2.0 | | `aisingapore/Gemma-SEA-LION-v4-27B-IT` | `image-text-to-text` in tags → extra domain | Gemma3 27B; 11 SEA languages; `gemma` license | | `FINAL-Bench/Darwin-28B-KR-Legal` | Korean legal LLM, finetune | `qwen3_5` arch; 64 layers; Korean+English | -| `Intelligent-Internet/II-Medical-8B` | Qwen3 finetune, empty card tags | Medical domain; `hidden_size=4096`; no pipeline_tag; apache-2.0 | -| `THUDM/GLM-4.5-Air-REAP` | MoE, `base_model_relation=merge`, apache-2.0 | `glm4_moe` arch; `Glm4MoeForCausalLM`; Samsung REAP merge from GLM-4.5-Air | -| `Fujitsu/Fujitsu-LLM-KG-8x7B` | Gated config, NeMo library | Config 401; `library_name=nemo` → `hf.library_name`; apache-2.0 | +| `Intelligent-Internet/II-Medical-8B` | Qwen3 finetune, empty card tags | Medical domain; `hidden_size=4096`; no pipeline_tag; Apache-2.0 | +| `THUDM/GLM-4.5-Air-REAP` | MoE, `base_model_relation=merge`, `Apache-2.0` | `glm4_moe` arch; `Glm4MoeForCausalLM`; Samsung REAP merge from GLM-4.5-Air | +| `Fujitsu/Fujitsu-LLM-KG-8x7B` | Gated config, NeMo library | Config 401; `library_name=nemo` → `hf.library_name`; Apache-2.0 | | `mistralai/Voxtral-Mini-4B-Realtime-2602` | Multimodal audio+text (ASR), `vllm` library | `voxtral_realtime` arch; audio encoder + text decoder; `library_name=vllm` | -| `TildeAI/TildeOpen-30b-64k` | YaRN RoPE context extension, 7 datasets, cc-by-4.0 | 8 192 → 65 536 tokens via YaRN; `rope_scaling` not in `_HYPER_KEYS`; `tokenizer_max_length=65536` | +| 
`TildeAI/TildeOpen-30b-64k` | YaRN RoPE context extension, 7 datasets, `CC-BY-4.0` | 8 192 → 65 536 tokens via YaRN; `rope_scaling` not in `_HYPER_KEYS`; `tokenizer_max_length=65536` | | `TildeAI/TildeOpen-30b` | Base 30B, unlimited tokenizer sentinel | Same 7 corpora; LlamaTokenizer sentinel filtered; no YaRN | | `openeurollm/datamix-9b-80-20` | Gemma-3 tokenizer (262K vocab), no GQA, no pipeline_tag | `vocab_size=262400`; `num_kv_heads=num_attn_heads=32`; empty `usage.domains` | -| `bigscience/bloom` | BLOOM 176B, ALiBi, custom key names, custom license | `n_layer`/`n_head` not in `_HYPER_KEYS` → layers skipped; `bigscience-bloom-rail-1.0` passthrough | +| `bigscience/bloom` | BLOOM 176B, ALiBi, custom key names, custom license | `n_layer`/`n_head` not in `_HYPER_KEYS` → layers skipped; `bigscience-bloom-rail-1.0` recorded verbatim | | `bigscience/bloomz-7b1` | BLOOM 7B, `seq_length` captured, xP3 finetune | `seq_length=2048` (new `_HYPER_KEYS` entry); finetune from bloom-7b1; `bigscience/xP3` dataset | | `CohereLabs/aya-23-8B` | Fully gated (card + config 401) | Same pattern as `CohereLabs/aya-vision-8b`; only `hf.author` from `model_info` | | `occiglot/occiglot-7b-eu5-instruct` | Mistral, `sliding_window` captured, 5 EU langs | `sliding_window=4096` in `_HYPER_KEYS`; finetune from occiglot-7b-eu5 | @@ -671,8 +711,8 @@ come from `config.json`. 
| `FreedomIntelligence/BlenderLLM` | text-to-3d pipeline, Qwen2 LLM | Standard Qwen2 decoder fine-tuned for Blender script generation; `pipeline_tag=text-to-3d` | | `hellork/BlenderLLM-IQ3_XXS-GGUF` | GGUF of BlenderLLM, text-to-3d, quantized | No config.json; `base_model_relation=quantized`; `text-to-3d` domain inherited | | `MiniMaxAI/MiniMax-M2.7` | minimax_m2 arch, MoE with MTP, 1M context, vague license | `MiniMaxM2ForCausalLM`; `max_position_embeddings=1_000_000`; `license=other` | -| `apple/OpenELM-270M` | openelm arch, apple-amlr passthrough, head_dim captured | Custom efficient arch; `head_dim=64` in `_HYPER_KEYS` → captured; non-standard keys skipped | -| `sail/Sailor2-20B` | Qwen2, 10 SEA languages, apache-2.0 | Covers Thai, Khmer, Lao, Malay, Burmese, Filipino; `num_key_value_heads=8` | +| `apple/OpenELM-270M` | openelm arch, `apple-amlr` license (verbatim), head_dim captured | Custom efficient arch; `head_dim=64` in `_HYPER_KEYS` → captured; non-standard keys skipped | +| `sail/Sailor2-20B` | Qwen2, 10 SEA languages, `Apache-2.0` | Covers Thai, Khmer, Lao, Malay, Burmese, Filipino; `num_key_value_heads=8` | | `huggingface/CodeBERTa-small-v1` | RoBERTa, fill-mask, no license, `language=["code"]` | Pre-trained on The Stack; `language="code"` preserved (non-ISO identifier) | | `Bencode92/tradepulse-finbert-sentiment` | BERT, text-classification, finetune from finbert | `BertForSequenceClassification`; financial sentiment; `base_model_relation=finetune` | | `OpenVINO/Mixtral-8x7B-Instruct-v0.1-int8-ov` | OpenVINO int8 quant, config accessible | Config.json present (unlike GGUF); `torch_dtype=int8` captured; `library_name=openvino` | @@ -682,7 +722,7 @@ come from `config.json`. 
| Model ID | Pattern | Notable | | :--- | :--- | :--- | -| `sonoisa/sentence-bert-base-ja-mean-tokens` | Language scalar string fix | `language: ja` → `["ja"]`; sentence-similarity; cc-by-sa-4.0 | +| `sonoisa/sentence-bert-base-ja-mean-tokens` | Language scalar string fix | `language: ja` → `["ja"]`; sentence-similarity; `CC-BY-SA-4.0` | | `cl-nagoya/ruri-v3-310m` | ModernBERT, base_model finetune, arxiv | `arxiv:2409.07737`; Japanese embedding; sentence-similarity | | `nomic-ai/nomic-embed-text-v1.5-GGUF` | GGUF-only, base_model quantized | No config.json; `base_model_relation=quantized`; nomic-embed | | `ibm-granite/granite-embedding-97m-multilingual-r2` | ModernBERT, sentence-transformers library | 200+ languages; `hf.library_name=sentence-transformers`; feature-extraction | @@ -692,11 +732,11 @@ come from `config.json`. | `airesearch/WangchanX-Legal-ThaiCCL-Retriever` | Base_model finetune, MIT, dataset ref | Fine-tuned from BAAI/bge-m3; xlm-roberta arch; Thai legal | | `jinaai/jina-embeddings-v4` | visual-document-retrieval domain, no license | 131 072 token context; `language=["multilingual"]` keyword preserved; `license=None` | | `HuggingFaceFW/fineweb-edu-classifier` | text-classification, base_model finetune | Fine-tuned from Snowflake arctic-embed; educational quality 0–5 | -| `tum-nlp/Deberta_Human_Value_Detector` | text-classification, `openrail++` passthrough | `openrail++` ∉ `_VAGUE_LICENSE_VALUES`; 20 value categories | +| `tum-nlp/Deberta_Human_Value_Detector` | text-classification, `openrail++` verbatim | `openrail++` ∉ `_VAGUE_LICENSE_VALUES`; not recognized by licenseid → recorded verbatim; 20 value categories | | `nlp-chula/aspect-finnlp-th` | text-classification, Thai financial, no license | CamemBERT-based; fine-tuned from wangchanberta; `license=None` | | `openai/privacy-filter` | token-classification, 128 K context | `hf.tokenizer_max_length=128000` captured; custom arch | -| `line-corporation/line-distilbert-base-japanese` | fill-mask, 
DistilBERT (6 layers) | Japanese BERT distilled to 6 layers; `DistilBertForMaskedLM`; apache-2.0 | -| `line-corporation/clip-japanese-base-v2` | feature-extraction, custom `clyp` model_type | Line Corp CLIP variant; `CLYPModel` arch; apache-2.0; Japanese | +| `line-corporation/line-distilbert-base-japanese` | fill-mask, DistilBERT (6 layers) | Japanese BERT distilled to 6 layers; `DistilBertForMaskedLM`; Apache-2.0 | +| `line-corporation/clip-japanese-base-v2` | feature-extraction, custom `clyp` model_type | Line Corp CLIP variant; `CLYPModel` arch; Apache-2.0; Japanese | | `Alibaba-NLP/gte-multilingual-reranker-base` | text-ranking domain, `model_type="new"` placeholder | `NewForSequenceClassification`; `model_type="new"` is a literal string, not a typo | | `Alibaba-NLP/gte-modernbert-base` | modernbert arch, sentence-similarity, multilingual | `ModernBertModel` (base encoder); `max_position_embeddings=8192` captured | | `dbmdz/bert-base-turkish-cased` | bert, no `architectures` field, Turkish, no pipeline_tag | `model_type=bert` present; `architectures` absent → `architecture=None`; minimal card (2 fields) | @@ -706,26 +746,26 @@ come from `config.json`. 
| Model ID | Pattern | Notable | | :--- | :--- | :--- | -| `apple/DepthPro-hf` | depth-estimation domain, custom license | `apple-amlr` ∉ `_VAGUE_LICENSE_VALUES` → stored as-is; DepthPro arch | +| `apple/DepthPro-hf` | depth-estimation domain, custom license | `apple-amlr` recorded verbatim; DepthPro arch | | `prs-eth/marigold-depth-v1-0` | depth-estimation, diffusers, no config | `library_name=diffusers`; no config.json → no arch | | `usyd-community/vitpose-plus-huge` | keypoint-detection domain | ViTPose arch; human pose estimation | | `laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg` | zero-shot-image-classification, no config | No config.json → no arch; `library_name=open_clip` | -| `geolocal/StreetCLIP` | zero-shot-image-classification, CLIP, cc-by-nc-4.0 | CLIP arch; geo-localisation tags in extra_lists | +| `geolocal/StreetCLIP` | zero-shot-image-classification, CLIP, `CC-BY-NC-4.0` | CLIP arch; geo-localisation tags in extra_lists | | `microsoft/swin-tiny-patch4-window7-224` | No pipeline_tag, domain from card tags | `"image-classification"` in `tags` → domain; imagenet-1k dataset | -| `microsoft/resnet-18` | image-classification from card tags | `resnet` arch; apache-2.0; same tag-domain pattern as Swin | -| `facebook/dinov2-small` | image-feature-extraction domain | DINOv2 self-supervised ViT; apache-2.0 | +| `microsoft/resnet-18` | image-classification from card tags | `resnet` arch; Apache-2.0; same tag-domain pattern as Swin | +| `facebook/dinov2-small` | image-feature-extraction domain | DINOv2 self-supervised ViT; Apache-2.0 | | `microsoft/rad-dino` | image-feature-extraction, no license | DINOv2 fine-tuned on radiology; `license=None` | -| `MahmoodLab/UNI2-h` | Gated config, `cc-by-nc-nd-4.0` | Pathology/histology ViT; restrictive NC+ND license; tags in extra_lists | +| `MahmoodLab/UNI2-h` | Gated config, `CC-BY-NC-ND-4.0` | Pathology/histology ViT; restrictive NC+ND license; tags in extra_lists | | `timm/convnext_large.dinov3_lvd1689m` | Vague 
license, timm library, no config | `license=other`; `library_name=timm`; no config.json | | `briaai/RMBG-1.4` | Vague license, image-segmentation | `license=other` → `hf.license_raw`; custom tags | | `briaai/RMBG-2.0` | Gated config, vague license | Config gated → no type_of_model; domain from card; `license=other` | | `ibm-granite/granite-geospatial-uki-flooddetection` | image-segmentation, TerraTorch, HF dataset refs | No transformers config; two `/datasets/` repos as `DatasetReference` | | `prithivMLmods/Flood-Image-Detection` | image-classification, siglip, arxiv, finetune | Fine-tuned from google/siglip2-base-patch16-512; `arxiv:2502.14786` | | `LGAI-EXAONE/EXAONE-Path-2.0-rev-EGFR` | Gated config, non-standard pipeline tag | Config 401; `pathology-image-analysis` captured as domain (pipeline_tag, not tags); `license=other` | -| `windowseat-ai/windowseat-reflection` | No config, PEFT library, image-to-image | Config 404; `library_name=peft` → `hf.library_name`; apache-2.0 | +| `windowseat-ai/windowseat-reflection` | No config, PEFT library, image-to-image | Config 404; `library_name=peft` → `hf.library_name`; Apache-2.0 | | `stabilityai/stable-zero123` | text-to-3d, diffusers, vague license + `license_name` | No config.json; `hf.license_name=sai-nc-community`; `library_name=diffusers` | | `openai/shap-e` | text-to-3d, MIT, no config | Generates 3D assets from text/images; no config.json; MIT | -| `apple/Sharp` | image-to-3d, apple-amlr passthrough, ml-sharp library | Single-image 3D generation; `library_name=ml-sharp`; no config.json | +| `apple/Sharp` | image-to-3d, `apple-amlr` license (verbatim), ml-sharp library | Single-image 3D generation; `library_name=ml-sharp`; no config.json | | `FireRedTeam/FireRedVAD` | voice-activity-detection, no config, apache-2.0 | VAD; new `_DOMAIN_TAGS` entry; no config.json → `type_of_model=None` | | `ETH-CVG/lightglue_superpoint` | keypoint-detection, lightglue arch, vague license | Feature matching; non-standard config 
keys → `hyperparameters={}`; `license=other` | | `qualcomm/HRNetPose` | keypoint-detection, pytorch library, vague license | Native PyTorch format; `library_name=pytorch`; no config.json; `license=other` | @@ -738,30 +778,30 @@ come from `config.json`. | :--- | :--- | :--- | | `dandelin/vilt-b32-finetuned-vqa` | visual-question-answering, base_model finetune, arxiv | ViLT on VQAv2; `arxiv:2102.03334`; finetune from vilt-b32 | | `google/deplot` | visual-question-answering + `image-text-to-text` in tags | pix2struct; both pipeline tag and card tag → two domains; `arxiv:2212.10505` | -| `Salesforce/blip-vqa-base` | visual-question-answering, `bsd-3-clause` passthrough | `bsd-3-clause` ∉ `_VAGUE_LICENSE_VALUES`; blip arch | +| `Salesforce/blip-vqa-base` | visual-question-answering, `BSD-3-Clause` | `bsd-3-clause` → `BSD-3-Clause` (canonical SPDX); blip arch | | `naver-clova-ix/donut-base-finetuned-docvqa` | document-question-answering, vision-encoder-decoder | `image-to-text` also captured via card tags; donut arch | | `impira/layoutlm-document-qa` | document-question-answering, language scalar | `language: en` scalar → `["en"]`; layoutlm arch; MIT | | `google/tapas-large-finetuned-wtq` | table-question-answering, language scalar, dataset ref | `language: en` scalar; dataset ref; tapas arch | -| `llava-hf/LLaVA-NeXT-Video-7B-hf` | video-text-to-text + image-text-to-text (two domains) | Both pipeline tag and card tag → two domain entries; llava2 license | -| `aisingapore/Gemma-SEA-LION-v4-4B-VL` | image-text-to-text, gemma license, SEA, finetune | Gemma3 multimodal; 9 SEA languages; finetune from google/gemma-3-4b-it | +| `llava-hf/LLaVA-NeXT-Video-7B-hf` | video-text-to-text + image-text-to-text (two domains) | Both pipeline tag and card tag → two domain entries; `llava2` recorded verbatim | +| `aisingapore/Gemma-SEA-LION-v4-4B-VL` | image-text-to-text, `gemma` license (verbatim), SEA, finetune | Gemma3 multimodal; 9 SEA languages; finetune from google/gemma-3-4b-it | 
| `openvla/openvla-7b` | robotics + image-text-to-text (two domains), MIT | VLA policy; pipeline tag and card tag → two domains | | `nvidia/Nemotron-3-Nano-Omni-30B-A3B-Reasoning-BF16` | any-to-any domain, vague license | Reasoning + audio+video+text; `license=other`; card dataset takes priority | | `briaai/Fibo-Edit-RMBG` | image-to-image, arxiv, base_model finetune | `arxiv:2511.06876`; finetune from briaai/Fibo-Edit; diffusers | -| `baidu/ERNIE-Image-Turbo` | text-to-image, diffusers, Chinese+English | Distilled DiT; `library_name=diffusers`; apache-2.0 | +| `baidu/ERNIE-Image-Turbo` | text-to-image, diffusers, Chinese+English | Distilled DiT; `library_name=diffusers`; Apache-2.0 | | `Doses-AI/boba-0.8b-food-GGUF` | image-text-to-text, GGUF, food domain | No config.json → `type_of_model=None`; finetune from Qwen3.5-0.8B | -| `bakrianoo/arabic-legal-documents-ocr-1.0` | image-text-to-text, gemma license, Arabic OCR | Gemma3; `license=gemma`; scanned Arabic legal documents | -| `kakaobank/kanana-1.5-v-3b-instruct` | image-text-to-text, `kanana-license` passthrough | `kanana-1.5-v` arch; `KananaVForConditionalGeneration`; Korean VLM | +| `bakrianoo/arabic-legal-documents-ocr-1.0` | image-text-to-text, `gemma` license (verbatim), Arabic OCR | Gemma3; `gemma` recorded verbatim; scanned Arabic legal documents | +| `kakaobank/kanana-1.5-v-3b-instruct` | image-text-to-text, `kanana-license` (verbatim) | `kanana-1.5-v` arch; `KananaVForConditionalGeneration`; Korean VLM | | `LGAI-EXAONE/EXAONE-4.5-33B` | image-text-to-text, vague license, 6 languages | `exaone4_5` arch; Korean+multilingual; `license=other` → `hf.license_raw` | | `LGAI-EXAONE/EXAONE-4.5-33B-AWQ` | AWQ quantized, config accessible (unlike GGUF) | Config present; `base_model_relation=quantized`; `license=other` | | `LGAI-EXAONE/EXAONE-4.5-33B-FP8` | FP8 quantized, `torch_dtype=float8_e4m3fn` | `torch_dtype` in `_HYPER_KEYS` → captured in hyperparameters | | `LGAI-EXAONE/EXAONE-4.5-33B-GGUF` | GGUF, no 
config.json, vague license | `type_of_model=None`; `base_model_relation=quantized`; `license=other` | -| `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF` | GGUF-only VLM, `gemma` license, SEA langs | No config.json; `image-text-to-text`; 9 SEA languages; `gemma` license passthrough | +| `aisingapore/Gemma-SEA-LION-v4-4B-VL-GGUF` | GGUF-only VLM, `gemma` license (verbatim), SEA langs | No config.json; `image-text-to-text`; 9 SEA languages; `gemma` recorded verbatim | | `Gen-Verse/MMaDA-8B-Base` | llada arch, ALiBi positional bias, any-to-any, MIT | ALiBi: no `max_position_embeddings`; `vocab_size=32000` captured; masked-token diffusion | | `mlx-community/gemma-4-e2b-it-4bit` | MLX 4-bit quant, gemma4 arch, any-to-any | `library_name=mlx`; config.json accessible; `base_model_relation=quantized` | | `onnx-community/gemma-4-E2B-it-ONNX` | ONNX export, gemma4 arch, transformers.js | `library_name=transformers.js`; config.json accessible; `base_model_relation=quantized` | | `ByteDance-Seed/BAGEL-7B-MoT` | bagel arch, any-to-any, nested config | Mixture-of-Tokens multimodal; `hyperparameters={}`; `library_name=bagel-mot` | | `sensenova/SenseNova-U1-8B-MoT` | neo_chat arch, any-to-any, nested config | `NEOChatModel`; Chinese+English; `hyperparameters={}` | -| `inclusionAI/LLaDA2.0-Uni` | llada2_moe arch, discrete diffusion, any-to-any | Masked-token diffusion model; `LLaDA2MoeModelLM`; apache-2.0 | +| `inclusionAI/LLaDA2.0-Uni` | llada2_moe arch, discrete diffusion, any-to-any | Masked-token diffusion model; `LLaDA2MoeModelLM`; Apache-2.0 | | `XiaomiMiMo/MiMo-Audio-7B-Instruct` | qwen2 base + MiMoAudioModel wrapper, any-to-any | Architecture field captures custom wrapper; `model_type=qwen2` (base) preserved | | `tencent/HY-Motion-1.0` | text-to-3d, custom config, vague license + `license_name` | `library_name=HY-Motion-1.0`; non-standard config → `type_of_model=None`; `hf.license_name=tencent-hunyuan-community` | | `TencentARC/TimeLens-8B` | qwen3_vl, nested text_config, 
video-text-to-text, `dtype` key | All LM keys inside `text_config` → `hyperparameters={}`; `dtype` (not `torch_dtype`); `license_name=bsd-3-clause` | @@ -771,8 +811,8 @@ come from `config.json`. | Model ID | Pattern | Notable | | :--- | :--- | :--- | -| `openai/whisper-large-v3` | 99-language ASR, YAML 1.1 boolean hazard | ISO code `"no"` parsed as `False` → filtered; apache-2.0 | -| `facebook/seamless-m4t-v2-large` | ASR pipeline + `audio-to-audio` + `text-to-speech` from tags | Three domains captured; cc-by-nc-4.0 | +| `openai/whisper-large-v3` | 99-language ASR, YAML 1.1 boolean hazard | ISO code `"no"` parsed as `False` → filtered; Apache-2.0 | +| `facebook/seamless-m4t-v2-large` | ASR pipeline + `audio-to-audio` + `text-to-speech` from tags | Three domains captured; `CC-BY-NC-4.0` | | `ibm-granite/granite-speech-4.1-2b` | ASR, base_model finetune, 6 languages | Conformer + Q-Former + granite LM; finetune from granite-4.0-1b-base | | `ai4bharat/indic-conformer-600m-multilingual` | Gated ASR, 22 Indian language codes | MIT; config gated; 22 ISO language codes extracted from card | | `cstr/mimo-asr-GGUF` | GGUF ASR, base_model quantized | Qwen2-based; quantized from XiaomiMiMo/MiMo-V2.5-ASR; zh+en | @@ -782,8 +822,8 @@ come from `config.json`. 
| `jonatasgrosman/wav2vec2-large-xlsr-53-japanese` | Language scalar, DOI | `language: ja` scalar; `doi:10.57967/hf/3568`; ASR domain | | `k2-fsa/OmniVoice` | text-to-speech domain, arxiv, base_model finetune | 646 languages as `["multilingual"]`; `arxiv:2604.00688`; Qwen3-0.6B base | | `drbaph/OmniVoice-bf16` | text-to-speech domain, finetune | BF16 conversion of k2-fsa/OmniVoice; same TTS domain | -| `pyannote/speaker-diarization-community-1` | speaker-diarization domain, gated, pyannote.audio | cc-by-4.0 (permissive, despite gating); no config.json; `library_name=pyannote.audio` | -| `HKUSTAudio/Llasa-3B` | text-to-speech, LLaMA arch, large vocab | `LlamaForCausalLM` repurposed for TTS; `vocab_size=193800` (speech tokens); cc-by-nc-4.0 | +| `pyannote/speaker-diarization-community-1` | speaker-diarization domain, gated, pyannote.audio | `CC-BY-4.0` (permissive, despite gating); no config.json; `library_name=pyannote.audio` | +| `HKUSTAudio/Llasa-3B` | text-to-speech, LLaMA arch, large vocab | `LlamaForCausalLM` repurposed for TTS; `vocab_size=193800` (speech tokens); `CC-BY-NC-4.0` | ### Translation, seq2seq, and domain-specific @@ -795,16 +835,16 @@ come from `config.json`. 
| `tencent/Hy-MT1.5-1.8B-2bit-GGUF` | GGUF quantized, `"multilingual"` language keyword | No config.json; `language=["multilingual"]`; `base_model_relation=quantized` | | `tencent/Hunyuan-MT-7B` | Translation from tag, no license | Same hunyuan arch as HY-MT1.5; `license=None` | | `protonx-models/protonx-legal-tc` | text2text-generation, NC license → other, Vietnamese | T5; proprietary non-commercial → `license=other` → `hf.license_raw` | -| `ReDiX/Legal-Embedding-ita-0.6B` | sentence-similarity, Italian legal, cc-by-nc-4.0 | Qwen3 base; Italian legal corpus | +| `ReDiX/Legal-Embedding-ita-0.6B` | sentence-similarity, Italian legal, `CC-BY-NC-4.0` | Qwen3 base; Italian legal corpus | | `lmg-anon/vntl-llama3-8b-v2-gguf` | GGUF, base_model quantized, llama3 license | Quantized from rinna/llama-3-youko-8b; translation domain | | `sugoitoolkit/Sugoi-14B-Ultra-GGUF` | GGUF, base_model as list | `base_model: ["sugoitoolkit/Sugoi-14B-Ultra-HF"]` → first entry extracted | | `Falconsai/medical_summarization` | T5 summarization, tokenizer max length | `model_type=t5`; `hf.tokenizer_max_length=512` captured | | `UBC-NLP/serengeti-E250` | No model card, 250 K-vocab Electra, tokenizer sentinel | Domains/languages only in `model_info.tags` → not captured; sentinel filtered | -| `CohereLabs/aya-vision-8b` | Fully gated, license not captured | Card + config 401; `cc-by-nc-4.0` only in `model_info` object | +| `CohereLabs/aya-vision-8b` | Fully gated, license not captured | Card + config 401; `CC-BY-NC-4.0` only in `model_info` object | | `lelapa/InkubaLM-0.4B` | Fully gated, dataset captured via tag fallback | Card + config 401; `dataset:lelapa/Inkuba-Mono` captured from `model_info` tags | | `nvidia/GR00T-N1.7-3B` | Robotics domain, no license | Humanoid robot foundation model; `pipeline_tag=robotics`; `license=None` | | `lerobot/pi05_base` | Robotics, lerobot library, custom license, no config | `license=gemma`; `library_name=lerobot`; no config.json; Pi0.5 policy | -| 
`Salesforce/moirai-2.0-R-small` | `time-series-forecasting` domain, custom config keys | New `_DOMAIN_TAGS` entry; config keys (`d_model`, `patch_sizes`) not in `_HYPER_KEYS` → empty hyperparameters; cc-by-nc-4.0 | +| `Salesforce/moirai-2.0-R-small` | `time-series-forecasting` domain, custom config keys | New `_DOMAIN_TAGS` entry; config keys (`d_model`, `patch_sizes`) not in `_HYPER_KEYS` → empty hyperparameters; `CC-BY-NC-4.0` | | `stanfordnlp/stanza-fi` | stanza library, no config, Finnish, empty domains | `library_name=stanza`; no pipeline_tag → empty `usage.domains`; language=`["fi"]` | | `stanfordnlp/stanza-de` | stanza library, no config, German | Same pattern as stanza-fi; language=`["de"]`; no config.json | | `SAP/sap-rpt-1-oss` | tabular-classification, gated config, self-referential `library_name` | Uses `.pt` files; `library_name=sap-rpt-1-oss` (same as model slug); `arxiv:2506.10707` | diff --git a/tests/test_extract_huggingface.py b/tests/test_extract_huggingface.py index d478f3a..071bb90 100644 --- a/tests/test_extract_huggingface.py +++ b/tests/test_extract_huggingface.py @@ -258,7 +258,7 @@ def test_read_huggingface_hyperparameters_include_vocab_size() -> None: def test_read_huggingface_license_from_card() -> None: with _patch_hf_calls(): meta = read_huggingface("mistralai/Mistral-7B-v0.1") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_read_huggingface_domain_from_pipeline_tag_via_usage() -> None: @@ -573,7 +573,7 @@ def test_license_from_file_when_card_says_other() -> None: def test_vague_license_raw_not_stored_when_card_has_real_spdx_id() -> None: - # A proper SPDX ID in the card YAML should NOT create hf.license_raw. + # A proper SPDX License ID in the card YAML should NOT create hf.license_raw. 
with _patch_hf_calls(): # uses apache-2.0 card data meta = read_huggingface("mistralai/Mistral-7B-v0.1") assert "hf.license_raw" not in meta.extra_data @@ -694,7 +694,7 @@ def test_kokoro_name() -> None: def test_kokoro_license() -> None: with _patch_kokoro(): meta = read_huggingface("hexgrad/Kokoro-82M") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_kokoro_tts_domain() -> None: @@ -960,7 +960,7 @@ def test_kimi_multimodal_domain() -> None: # --------------------------------------------------------------------------- _GEMMA_CARD_DATA = _make_card_data( - license="gemma", # Non-standard but passes SPDX ID regex -> not vague + license="gemma", # Non-standard but passes SPDX License ID regex -> not vague pipeline_tag=None, tags=None, language=None, @@ -1027,7 +1027,9 @@ def _patch_llama() -> Any: def test_llama_custom_license_used_directly() -> None: - # "llama3.2" matches SPDX ID regex - treated as a license identifier + # "llama3.2" is not in _VAGUE_LICENSE_VALUES, so it is taken from the card. + # Not recognized by licenseid matcher → _canonicalize_license_id returns + # it unchanged. 
with _patch_llama(): meta = read_huggingface("meta-llama/Llama-3.2-1B") assert meta.license == "llama3.2" @@ -1098,7 +1100,7 @@ def test_deepseek_architecture() -> None: def test_deepseek_mit_license() -> None: with _patch_deepseek(): meta = read_huggingface("deepseek-ai/DeepSeek-R1") - assert meta.license == "mit" + assert meta.license == "MIT" def test_deepseek_no_domain_when_no_pipeline_tag() -> None: @@ -1295,7 +1297,7 @@ def test_typhoon_thai_language() -> None: def test_typhoon_license() -> None: with _patch_typhoon(): meta = read_huggingface("typhoon-ai/typhoon-7b") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_typhoon_grouped_query_attention_hyperparameter() -> None: @@ -2756,7 +2758,7 @@ def _patch_uni2() -> Any: def test_uni2_nc_nd_license() -> None: with _patch_uni2(): meta = read_huggingface("MahmoodLab/UNI2-h") - assert meta.license == "cc-by-nc-nd-4.0" + assert meta.license == "CC-BY-NC-ND-4.0" def test_uni2_pathology_tags_in_extra_lists() -> None: @@ -3577,7 +3579,7 @@ def test_deplot_arxiv() -> None: # Salesforce/blip-vqa-base -# BLIP for VQA; bsd-3-clause license (non-SPDX passthrough). +# BLIP for VQA; bsd-3-clause license (normalised to BSD-3-Clause). def _patch_blip_vqa() -> Any: return _patch_hf_calls( config={ @@ -3604,11 +3606,12 @@ def test_blip_vqa_domain() -> None: assert "visual-question-answering" in meta.usage.domains -def test_blip_vqa_bsd_license_passthrough() -> None: - # bsd-3-clause not in _VAGUE_LICENSE_VALUES -- passed through as-is. +def test_blip_vqa_bsd_license_normalized() -> None: + # bsd-3-clause not in _VAGUE_LICENSE_VALUES; _canonicalize_license_id maps + # it to the canonical SPDX License ID BSD-3-Clause via licenseid matcher. 
with _patch_blip_vqa(): meta = read_huggingface("Salesforce/blip-vqa-base") - assert meta.license == "bsd-3-clause" + assert meta.license == "BSD-3-Clause" # --------------------------------------------------------------------------- @@ -3762,7 +3765,7 @@ def test_seamless_audio_to_audio_tag_in_domain() -> None: def test_seamless_nc_license() -> None: with _patch_seamless_m4t(): meta = read_huggingface("facebook/seamless-m4t-v2-large") - assert meta.license == "cc-by-nc-4.0" + assert meta.license == "CC-BY-NC-4.0" # --------------------------------------------------------------------------- @@ -4609,7 +4612,7 @@ def test_phi2_phi_architecture_mit_license() -> None: with _patch_phi2(): meta = read_huggingface("microsoft/phi-2") assert meta.type_of_model == "phi" - assert meta.license == "mit" + assert meta.license == "MIT" def test_phi2_code_tag_in_domain() -> None: @@ -4927,7 +4930,7 @@ def _patch_legal_embed_ita() -> Any: def test_legal_embed_ita_nc_license_italian() -> None: with _patch_legal_embed_ita(): meta = read_huggingface("ReDiX/Legal-Embedding-ita-0.6B") - assert meta.license == "cc-by-nc-4.0" + assert meta.license == "CC-BY-NC-4.0" assert meta.extra_lists.get("hf.language") == ["it"] assert "sentence-similarity" in meta.usage.domains @@ -5170,7 +5173,8 @@ def test_qwen3_235b_moe_architecture() -> None: def test_qwen3_235b_qwen_license_passthrough() -> None: - # "qwen" not in _VAGUE_LICENSE_VALUES → stored as-is + # "qwen" not in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher → + # _canonicalize_license_id returns it unchanged with _patch_qwen3_235b(): meta = read_huggingface("Qwen/Qwen3-235B-A22B") assert meta.license == "qwen" @@ -5243,7 +5247,7 @@ def test_qwen35_27b_architecture() -> None: def test_qwen35_27b_apache_license() -> None: with _patch_qwen35_27b(): meta = read_huggingface("Qwen/Qwen3.5-27B") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_qwen35_27b_gqa() -> None: @@ -5265,7 +5269,8 @@ def 
test_qwen35_27b_text_generation_domain() -> None: # --------------------------------------------------------------------------- # Korean multimodal VLM from Kakao Bank. Custom "kanana-license" identifier is -# NOT in _VAGUE_LICENSE_VALUES → stored as-is (passthrough). +# NOT in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher → +# _canonicalize_license_id returns it unchanged. # image-text-to-text pipeline tag. _KANANA_15V_CONFIG: dict[str, Any] = { @@ -5310,7 +5315,9 @@ def test_kanana_15v_architecture() -> None: def test_kanana_15v_license_passthrough() -> None: - # "kanana-license" is not in _VAGUE_LICENSE_VALUES → stored as-is, no detection + # "kanana-license" not in _VAGUE_LICENSE_VALUES; not recognized by + # licenseid matcher → _canonicalize_license_id returns it unchanged; + # no file detection triggered with _patch_kanana_15v(): meta = read_huggingface("kakaobank/kanana-1.5-v-3b-instruct") assert meta.license == "kanana-license" @@ -5712,7 +5719,7 @@ def test_glm45_air_reap_architecture() -> None: def test_glm45_air_reap_apache_license() -> None: with _patch_glm45_air_reap(): meta = read_huggingface("THUDM/GLM-4.5-Air-REAP") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_glm45_air_reap_merge_relation() -> None: @@ -5790,7 +5797,7 @@ def test_line_distilbert_fill_mask_domain() -> None: def test_line_distilbert_apache_license() -> None: with _patch_line_distilbert(): meta = read_huggingface("line-corporation/line-distilbert-base-japanese") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" # --------------------------------------------------------------------------- @@ -5847,7 +5854,7 @@ def test_clip_japanese_v2_feature_extraction_domain() -> None: def test_clip_japanese_v2_apache_license() -> None: with _patch_clip_japanese_v2(): meta = read_huggingface("line-corporation/clip-japanese-base-v2") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def 
test_clip_japanese_v2_hidden_size() -> None: @@ -5898,7 +5905,7 @@ def test_fujitsu_llm_nemo_library_name() -> None: def test_fujitsu_llm_apache_license() -> None: with _patch_fujitsu_llm(): meta = read_huggingface("Fujitsu/Fujitsu-LLM-KG-8x7B") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_fujitsu_llm_text_generation_domain() -> None: @@ -5955,7 +5962,7 @@ def test_windowseat_image_to_image_domain() -> None: def test_windowseat_apache_license() -> None: with _patch_windowseat(): meta = read_huggingface("windowseat-ai/windowseat-reflection") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" # --------------------------------------------------------------------------- @@ -6023,7 +6030,7 @@ def test_moirai_time_series_forecasting_domain() -> None: def test_moirai_cc_by_nc_license() -> None: with _patch_moirai(): meta = read_huggingface("Salesforce/moirai-2.0-R-small") - assert meta.license == "cc-by-nc-4.0" + assert meta.license == "CC-BY-NC-4.0" # --------------------------------------------------------------------------- @@ -6091,7 +6098,7 @@ def test_llasa_3b_text_to_speech_domain() -> None: def test_llasa_3b_cc_by_nc_license() -> None: with _patch_llasa_3b(): meta = read_huggingface("HKUSTAudio/Llasa-3B") - assert meta.license == "cc-by-nc-4.0" + assert meta.license == "CC-BY-NC-4.0" # --------------------------------------------------------------------------- @@ -6310,7 +6317,7 @@ def test_tildeopen_30b_64k_seven_datasets() -> None: def test_tildeopen_30b_64k_cc_by_license() -> None: with _patch_tildeopen_30b_64k(): meta = read_huggingface("TildeAI/TildeOpen-30b-64k") - assert meta.license == "cc-by-4.0" + assert meta.license == "CC-BY-4.0" # --------------------------------------------------------------------------- @@ -6547,7 +6554,8 @@ def test_openeurollm_three_datasets() -> None: # • No max_position_embeddings (uses ALiBi positional bias, not RoPE) # • seq_length (added to _HYPER_KEYS) is 
absent in the 176B config # • bigscience-bloom-rail-1.0 is a custom HF identifier NOT in -# _VAGUE_LICENSE_VALUES → stored as-is (passthrough) +# _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher → +# _canonicalize_license_id returns it unchanged # • 59 languages (46 natural + 13 programming languages) _BLOOM_CONFIG: dict[str, Any] = { @@ -6641,7 +6649,9 @@ def test_bloom_architecture() -> None: def test_bloom_custom_license_passthrough() -> None: - # "bigscience-bloom-rail-1.0" not in _VAGUE_LICENSE_VALUES → stored as-is + # "bigscience-bloom-rail-1.0" is not in _VAGUE_LICENSE_VALUES and is not + # recognized by licenseid matcher → _canonicalize_license_id returns + # it unchanged. with _patch_bloom(): meta = read_huggingface("bigscience/bloom") assert meta.license == "bigscience-bloom-rail-1.0" @@ -7285,7 +7295,7 @@ def test_shap_e_text_to_3d_domain() -> None: def test_shap_e_mit_license() -> None: with _patch_shap_e(): meta = read_huggingface("openai/shap-e") - assert meta.license == "mit" + assert meta.license == "MIT" def test_shap_e_no_architecture() -> None: @@ -7468,7 +7478,8 @@ def test_hy_motion_library_name() -> None: # Apple Sharp generates 3-D from a single 2-D image. pipeline_tag=image-to-3d # (new _DOMAIN_TAGS entry). library_name=ml-sharp (Apple's custom library). -# license=apple-amlr → passthrough (not in _VAGUE_LICENSE_VALUES). No config. +# license=apple-amlr → not in _VAGUE_LICENSE_VALUES; not recognized by +# licenseid matcher → _canonicalize_license_id returns it unchanged. No config. 
_APPLE_SHARP_CARD_DATA = _make_card_data( license="apple-amlr", @@ -7495,7 +7506,8 @@ def test_apple_sharp_image_to_3d_domain() -> None: def test_apple_sharp_apple_amlr_license_passthrough() -> None: - # apple-amlr not in _VAGUE_LICENSE_VALUES → stored as-is + # apple-amlr not in _VAGUE_LICENSE_VALUES; not recognized by licenseid matcher → + # _canonicalize_license_id returns it unchanged with _patch_apple_sharp(): meta = read_huggingface("apple/Sharp") assert meta.license == "apple-amlr" @@ -7548,7 +7560,7 @@ def test_firered_vad_voice_activity_detection_domain() -> None: def test_firered_vad_apache_license() -> None: with _patch_firered_vad(): meta = read_huggingface("FireRedTeam/FireRedVAD") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_firered_vad_no_architecture() -> None: @@ -7610,7 +7622,7 @@ def test_gte_reranker_model_type_placeholder() -> None: def test_gte_reranker_apache_license() -> None: with _patch_gte_reranker(): meta = read_huggingface("Alibaba-NLP/gte-multilingual-reranker-base") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" def test_gte_reranker_sentence_transformers_library() -> None: @@ -7626,7 +7638,8 @@ def test_gte_reranker_sentence_transformers_library() -> None: # Apple OpenELM-270M uses a custom efficient architecture ("openelm"). # Config has non-standard keys (activation_fn_name, ffn_dim_divisor) alongside # the standard head_dim (which IS in _HYPER_KEYS → captured). -# license=apple-amlr → passthrough. +# license=apple-amlr → not in _VAGUE_LICENSE_VALUES; not recognized by +# licenseid matcher → _canonicalize_license_id returns it unchanged. 
_OPENELM_270M_CONFIG: dict[str, Any] = { "model_type": "openelm", @@ -7810,7 +7823,7 @@ def test_llada2_moe_any_to_any_domain() -> None: def test_llada2_moe_apache_license() -> None: with _patch_llada2_moe(): meta = read_huggingface("inclusionAI/LLaDA2.0-Uni") - assert meta.license == "apache-2.0" + assert meta.license == "Apache-2.0" # --------------------------------------------------------------------------- diff --git a/tests/test_license.py b/tests/test_license.py index 6976985..5df373f 100644 --- a/tests/test_license.py +++ b/tests/test_license.py @@ -239,11 +239,14 @@ def test_collect_candidates_empty_dir() -> None: @pytest.fixture(name="licenseid_db_path") def licenseid_db_path_fixture() -> Path: - """Return the path to the licenseid database, skipping if not built. + """Skip if the licenseid database has not been built yet. Build with: ``licenseid update`` """ - db = Path.home() / ".local" / "share" / "licenseid" / "licenses.db" + # pylint: disable=import-outside-toplevel + from licenseid.database import get_default_db_path + + db = Path(get_default_db_path()) if not db.exists(): pytest.skip("licenseid database not built -- run 'licenseid update'") return db @@ -254,31 +257,16 @@ def licenseid_db_path_fixture() -> Path: # --------------------------------------------------------------------------- -def test_detect_license_from_text_db_missing(tmp_path: Path) -> None: - """Returns None gracefully when the licenseid database does not exist.""" +def test_detect_license_from_text_db_not_populated(tmp_path: Path) -> None: + """Returns None gracefully when the licenseid database is not populated.""" with patch( - "pitloom.extract._license._get_licenseid_db_path", - return_value=tmp_path / "nonexistent.db", + "licenseid.matcher.get_default_db_path", + return_value=str(tmp_path / "empty.db"), ): result = detect_license_from_text("MIT License\n\nPermission is hereby granted") assert result is None -def test_detect_license_from_text_library_not_installed(tmp_path: 
Path) -> None: - """Returns None gracefully when the licenseid library is not installed.""" - fake_db = tmp_path / "licenses.db" - fake_db.touch() - with ( - patch( - "pitloom.extract._license._get_licenseid_db_path", - return_value=fake_db, - ), - patch.dict("sys.modules", {"licenseid": None}), - ): - result = detect_license_from_text("MIT License") - assert result is None - - # --------------------------------------------------------------------------- # detect_license_for_project -- mocked detection # --------------------------------------------------------------------------- @@ -402,14 +390,10 @@ def test_detect_project_hint_text_detection_fails_returns_hint() -> None: """ -def test_detect_license_from_text_returns_spdx_id(licenseid_db_path: Path) -> None: +def test_detect_license_from_text_returns_spdx_id() -> None: """Detection with a real DB returns a valid SPDX License ID string (not None or raw text).""" - with patch( - "pitloom.extract._license._get_licenseid_db_path", - return_value=licenseid_db_path, - ): - result = detect_license_from_text(_MIT_TEXT) + result = detect_license_from_text(_MIT_TEXT) # Result may be None if score is below threshold; when not None it must # look like an SPDX License ID (no newlines, alphanumeric with dashes/dots) if result is not None: @@ -418,17 +402,13 @@ def test_detect_license_from_text_returns_spdx_id(licenseid_db_path: Path) -> No ) -def test_detect_project_from_license_file_integration(licenseid_db_path: Path) -> None: +def test_detect_project_from_license_file_integration() -> None: """End-to-end: LICENSE file text is processed; result is None or a valid SPDX License ID.""" with tempfile.TemporaryDirectory() as d: p = Path(d) (p / "LICENSE").write_text(_MIT_TEXT) - with patch( - "pitloom.extract._license._get_licenseid_db_path", - return_value=licenseid_db_path, - ): - result_id, prov = detect_license_for_project(p) + result_id, prov = detect_license_for_project(p) if result_id is not None: assert 
_looks_like_spdx_license_id(result_id), ( f"Expected SPDX License ID, got: {result_id!r}"