Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ jobs:
python -m pip install --upgrade pip
pip install ".[aimodel,huggingface]" --group test

- name: Build licenseid database
run: licenseid update

- name: Run tests
run: |
pytest tests/ -v --tb=short
Expand Down
19 changes: 14 additions & 5 deletions docs/implementation/license-pipeline.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,13 @@ relationships.

1. **Card YAML** -- reads `license:` from the model card frontmatter. If
the value is not a vague sentinel (`other`, `custom`, `proprietary`,
`unknown`, `unlicensed`), it is passed through `_canonicalize_license_id()`,
which calls `AggregatedLicenseMatcher.match(license_id=raw)` from the
`licenseid` library for a direct database lookup. Recognised SPDX
License IDs are returned in canonical casing (e.g. `"apache-2.0"` →
`"Apache-2.0"`). Values not recognised — proprietary or non-SPDX
identifiers such as `"gemma"`, `"llama3.2"`, or deprecated bare
copyleft forms — are returned verbatim. The result is stored in
`AiModelMetadata.license`.
2. **File detection** -- when the card YAML value is absent or vague,
`_detect_license_from_hf_files()` iterates through candidate files in
Expand All @@ -148,15 +154,18 @@ relationships.
### `licenseid` dependency

Text-based licence detection (`detect_license_from_text()` in
`_license.py`) uses the `licenseid` package, which is a mandatory
pitloom dependency. The database must be built before detection is
possible:

```shell
pip install pitloom[license]
licenseid update
```

When the database has not been built, `detect_license_from_text()`
logs a warning and returns `None`; other licence sources (card YAML,
`CITATION.cff`, `codemeta.json`) are unaffected.

The database is stored at
`~/.local/share/licenseid/licenses.db`. Detection uses cosine similarity
against vectorised licence texts with a default threshold of 0.85.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ classifiers = [
]
dependencies = [
"hatchling>=1.28.0",
"licenseid>=0.2.3",
"pyproject-metadata>=0.11.0",
"rfc8785>=0.1.4",
"spdx-python-model==0.0.4",
Expand All @@ -63,7 +64,6 @@ aimodel = [
"safetensors[numpy]>=0.7.0",
]
huggingface = ["huggingface_hub>=1.14.0"]
license = ["licenseid>=0.2.2"]

fasttext = ["fasttext>=0.9.3"] # or fasttext-community>=0.11.7
gguf = ["gguf>=0.10.0"]
Expand Down Expand Up @@ -124,6 +124,7 @@ exclude = [
"tests/fixtures/aimodels/safetensors/*.safetensors",
"tests/fixtures/croissant/*.json",
"tests/fixtures/fragments/*.json",
"tests/fixtures/huggingface-hub/*.txt",
]

[tool.hatch.build.targets.wheel]
Expand Down
8 changes: 4 additions & 4 deletions src/pitloom/assemble/spdx3/deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,12 @@ def _enrich_from_installed(
provenance_source = f"Source: installed metadata | Package: {dep_name}"

# description
summary = pkg_meta["Summary"] or ""
summary = pkg_meta.get("Summary") or ""
if summary and summary != "UNKNOWN":
dep_package.description = summary

# homePage -- core field first, then well-known Project-URL labels
home_page = pkg_meta["Home-page"] or ""
home_page = pkg_meta.get("Home-page") or ""
if not home_page or home_page == "UNKNOWN":
for label in _HOMEPAGE_LABELS:
if label in project_urls:
Expand All @@ -126,7 +126,7 @@ def _enrich_from_installed(
dep_package.software_homePage = home_page

# downloadLocation -- core field first, then well-known Project-URL labels
download_url = pkg_meta["Download-URL"] or ""
download_url = pkg_meta.get("Download-URL") or ""
if not download_url or download_url == "UNKNOWN":
for label in _DOWNLOAD_LABELS:
if label in project_urls:
Expand All @@ -145,7 +145,7 @@ def _enrich_from_installed(
dep_package.software_packageUrl = f"pkg:pypi/{purl_name}@{version}"

# hasDeclaredLicense -- prefer PEP 639 License-Expression over legacy License
license_id = pkg_meta["License-Expression"] or pkg_meta["License"] or ""
license_id = pkg_meta.get("License-Expression") or pkg_meta.get("License") or ""
if license_id and license_id != "UNKNOWN":
rel_declared, _ = build_license_elements(
license_id=license_id,
Expand Down
25 changes: 24 additions & 1 deletion src/pitloom/extract/_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,29 @@
{"other", "custom", "proprietary", "unknown", "unlicensed"}
)


def _canonicalize_license_id(raw: str) -> str:
    """Map *raw* to its canonical SPDX License ID, falling back to *raw*.

    This is a thin wrapper around
    :func:`~pitloom.extract._license.canonicalize_license_id`, which looks
    the value up through ``AggregatedLicenseMatcher.match()`` from the
    ``licenseid`` library. A recognised SPDX License ID comes back in its
    canonical casing (e.g. ``"bsd-3-clause"`` → ``"BSD-3-Clause"``).

    An unrecognised value is handed back verbatim: pitloom records what it
    found and leaves any further interpretation (such as adding a
    ``LicenseRef-`` prefix to a non-SPDX identifier) to the ``licenseid``
    library or to downstream SBOM tooling.

    A populated ``licenseid`` database is required (``licenseid update``);
    when the database has not been built, *raw* is returned unchanged.
    """
    # Imported at call time rather than at module import time.
    # pylint: disable=import-outside-toplevel
    from pitloom.extract._license import (
        canonicalize_license_id as _lookup_canonical_id,
    )

    canonical = _lookup_canonical_id(raw)
    return canonical


# Filenames (case-sensitive, root of repo) considered license candidates.
# Listed in priority order: no extension first, then common suffixes.
_HF_LICENSE_FILENAMES: tuple[str, ...] = (
Expand Down Expand Up @@ -436,7 +459,7 @@ def _resolve_license(
provenance["license"] = (
"Source: Hugging Face Hub | Field: model card YAML (license)"
)
return raw_license_str, None
return _canonicalize_license_id(raw_license_str), None

vague_raw = (
raw_license_str
Expand Down
69 changes: 42 additions & 27 deletions src/pitloom/extract/_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@

"""License text detection utilities using the licenseid library.

Provides best-effort SPDX license ID detection from license text found in
project files. All detection is optional and degrades gracefully when the
``licenseid`` package is not installed or its database has not been built.
Provides SPDX license ID detection from license text and metadata found in
project files. Text detection requires a populated database; other sources
(``CITATION.cff``, ``codemeta.json``) work without it.

To enable detection, install the package and build the database::
Build the database before first use::

pip install licenseid
licenseid update
"""

Expand All @@ -21,6 +20,8 @@
import re
from pathlib import Path

from licenseid import AggregatedLicenseMatcher

_logger = logging.getLogger(__name__)

# Heuristic: single-token SPDX License IDs and expressions like "GPL-3.0-or-later"
Expand Down Expand Up @@ -56,45 +57,59 @@ def _looks_like_spdx_license_expression(value: str) -> bool:
return bool(_SPDX_LICENSE_EXPR_KEYWORDS_RE.search(stripped))


def _get_licenseid_db_path() -> Path:
    """Return ``~/.local/share/licenseid/licenses.db`` for the current user."""
    home_dir = Path.home()
    return home_dir.joinpath(".local", "share", "licenseid", "licenses.db")


def detect_license_from_text(text: str, threshold: float = 0.85) -> str | None:
    """Detect the SPDX License ID of *text* using the licenseid library.

    Args:
        text: License text to classify.
        threshold: Minimum score a candidate must reach to be accepted.

    Returns:
        The license ID of the first result whose score meets *threshold*,
        or ``None`` when *text* is blank, the database is not populated,
        no result reaches the threshold, or the lookup fails.

    The database must be built before detection is possible::

        licenseid update
    """
    if not text or not text.strip():
        # Nothing to classify -- skip the database round-trip entirely.
        return None
    try:
        matcher = AggregatedLicenseMatcher()
        # Probe with a well-known license ID to confirm the database is populated.
        if not matcher.match(license_id="MIT"):
            _logger.warning(
                "licenseid database appears empty -- "
                "run 'licenseid update' to enable license text detection"
            )
            return None
        results = matcher.match(text)
        # Results are taken in the order the matcher returns them; the first
        # one at or above the threshold wins.
        best = next((r for r in results if r["score"] >= threshold), None)
        return str(best["license_id"]) if best is not None else None
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Detection is best-effort: a library or database failure must not
        # abort the caller, so it is logged and reported as "no match".
        _logger.debug("licenseid detection failed: %s", exc)
        return None


def canonicalize_license_id(raw: str) -> str:
    """Return the canonical SPDX License ID for *raw*, or *raw* unchanged.

    Uses ``AggregatedLicenseMatcher.match(license_id=raw)`` for a direct
    database lookup. Returns the canonical casing when *raw* is a recognised
    SPDX License ID (e.g. ``"bsd-3-clause"`` → ``"BSD-3-Clause"``).

    For unrecognised values — non-SPDX identifiers, deprecated bare
    copyleft forms (``"agpl-3.0"``, ``"gpl-3.0"``), or vendor-specific
    strings (``"gemma"``, ``"llama3.2"``) — the original string is
    returned verbatim. pitloom records what it found and leaves further
    interpretation (e.g. deciding whether to add a ``LicenseRef-`` prefix)
    to the ``licenseid`` library or downstream SBOM tooling.

    Args:
        raw: License identifier as found in the source metadata.

    Returns:
        The matched license ID as a ``str``, or *raw* itself when *raw* is
        blank, nothing matches, or the lookup raises.

    Requires a populated database (``licenseid update``). When the database
    is not populated *raw* is returned unchanged.
    """
    if not raw or not raw.strip():
        # A blank value cannot name a license; skip the lookup.
        return raw
    try:
        results = AggregatedLicenseMatcher().match(license_id=raw)
        if results:
            return str(results[0]["license_id"])
    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Best-effort: fall back to the raw value, but leave a trace so a
        # broken database or library error is diagnosable.
        _logger.debug("licenseid lookup failed for %r: %s", raw, exc)
    return raw


def find_license_files(project_dir: Path) -> list[Path]:
"""Return existing license files in *project_dir* in priority order.

Expand Down
Loading
Loading