From d3dfb4d11a2d173b614376d6b243d9530080b894 Mon Sep 17 00:00:00 2001 From: tdruez Date: Mon, 24 Nov 2025 16:01:39 +0400 Subject: [PATCH 1/5] Add package_content PurlDB field on Package model Signed-off-by: tdruez --- .../0013_package_package_content.py | 18 ++++++++ component_catalog/models.py | 41 ++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 component_catalog/migrations/0013_package_package_content.py diff --git a/component_catalog/migrations/0013_package_package_content.py b/component_catalog/migrations/0013_package_package_content.py new file mode 100644 index 00000000..ccf667de --- /dev/null +++ b/component_catalog/migrations/0013_package_package_content.py @@ -0,0 +1,18 @@ +# Generated by Django 5.2.8 on 2025-11-24 12:00 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('component_catalog', '0012_alter_component_children'), + ] + + operations = [ + migrations.AddField( + model_name='package', + name='package_content', + field=models.IntegerField(blank=True, choices=[(1, 'curation'), (2, 'patch'), (3, 'source_repo'), (4, 'source_archive'), (5, 'binary'), (6, 'test'), (7, 'doc')], help_text='Content of this Package as one of: curation, patch, source_repo, source_archive, binary, test, doc', null=True), + ), + ] diff --git a/component_catalog/models.py b/component_catalog/models.py index e0a5f46a..3336bc2e 100644 --- a/component_catalog/models.py +++ b/component_catalog/models.py @@ -1653,6 +1653,44 @@ def __str__(self): return self.label +class PackageContentFieldMixin(models.Model): + """ + Field extracted from the `purldb.packagedb.models.Package` model. + It needs to stay aligned with its upstream PurlDB implementation.
+ """ + + class PackageContentType(models.IntegerChoices): + CURATION = 1, "curation" + PATCH = 2, "patch" + SOURCE_REPO = 3, "source_repo" + SOURCE_ARCHIVE = 4, "source_archive" + BINARY = 5, "binary" + TEST = 6, "test" + DOC = 7, "doc" + + package_content = models.IntegerField( + null=True, + blank=True, + choices=PackageContentType.choices, + help_text=_( + "Content of this Package as one of: {}".format(", ".join(PackageContentType.labels)) + ), + ) + + class Meta: + abstract = True + + @classmethod + def get_package_content_value_from_label(cls, label): + """Convert a package_content string label to its integer value.""" + if not label: + return None + try: + return cls.PackageContentType[label.upper()].value + except (KeyError, AttributeError): + return None + + PACKAGE_URL_FIELDS = ["type", "namespace", "name", "version", "qualifiers", "subpath"] @@ -1792,6 +1830,7 @@ class Package( URLFieldsMixin, HashFieldsMixin, PackageURLMixin, + PackageContentFieldMixin, DataspacedModel, ): filename = models.CharField( @@ -2558,7 +2597,7 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10): if self.download_url: payloads.append({"download_url": self.download_url}) if package_url: - payloads.append({"purl": package_url}) + payloads.append({"purl": package_url, "sort": "package_content"}) purldb = PurlDB(user.dataspace) for index, payload in enumerate(payloads): From b1014d185e0557a838047c61c49b405053442b8a Mon Sep 17 00:00:00 2001 From: tdruez Date: Mon, 24 Nov 2025 18:56:50 +0400 Subject: [PATCH 2/5] Add package_content field in forms, admin, API Signed-off-by: tdruez --- component_catalog/admin.py | 1 + component_catalog/api.py | 1 + component_catalog/forms.py | 4 +++- component_catalog/tests/test_models.py | 1 + component_catalog/views.py | 6 ++++++ dejacode_toolkit/purldb.py | 2 ++ 6 files changed, 14 insertions(+), 1 deletion(-) diff --git a/component_catalog/admin.py b/component_catalog/admin.py index 3e96cf22..edb32988 100644 --- 
a/component_catalog/admin.py +++ b/component_catalog/admin.py @@ -884,6 +884,7 @@ class PackageAdmin( "parties", "datasource_id", "file_references", + "package_content", ) }, ), diff --git a/component_catalog/api.py b/component_catalog/api.py index b7054945..602624ea 100644 --- a/component_catalog/api.py +++ b/component_catalog/api.py @@ -687,6 +687,7 @@ class Meta: "parties", "datasource_id", "file_references", + "package_content", "external_references", "created_date", "last_modified_date", diff --git a/component_catalog/forms.py b/component_catalog/forms.py index eb824a0a..a586bdc7 100644 --- a/component_catalog/forms.py +++ b/component_catalog/forms.py @@ -339,6 +339,7 @@ class Meta: "version", "qualifiers", "subpath", + "package_content", "collect_data", ] widgets = { @@ -407,7 +408,7 @@ def helper(self): HTML("<hr>"),
Group("description", "keywords"), Group("primary_language", "cpe"), - Group("size", "release_date"), + Group("package_content", "size", "release_date"), Group("dependencies", "notes"), HTML("<hr>
"), Group("homepage_url", "code_view_url"), @@ -1183,6 +1184,7 @@ class Meta: "version", "qualifiers", "subpath", + "package_content", ] diff --git a/component_catalog/tests/test_models.py b/component_catalog/tests/test_models.py index a97afe3e..899d5d5c 100644 --- a/component_catalog/tests/test_models.py +++ b/component_catalog/tests/test_models.py @@ -1366,6 +1366,7 @@ def test_component_catalog_models_get_exclude_candidates_fields(self): "file_references", "other_license_expression", "parties", + "package_content", ], ), ) diff --git a/component_catalog/views.py b/component_catalog/views.py index 00198a08..4341e2a7 100644 --- a/component_catalog/views.py +++ b/component_catalog/views.py @@ -1930,6 +1930,12 @@ def get_initial(self): if purldb_entry := self.get_entry_from_purldb(): # Duplicate the declared_license_expression as the "concluded" license_expression purldb_entry["license_expression"] = purldb_entry.get("declared_license_expression") + + # Convert package_content string label to integer value + if content_label := purldb_entry.pop("package_content", None): + if content_value := Package.get_package_content_value_from_label(content_label): + purldb_entry["package_content"] = content_value + model_fields = [field.name for field in Package._meta.get_fields()] initial_from_purldb_entry = { field_name: value diff --git a/dejacode_toolkit/purldb.py b/dejacode_toolkit/purldb.py index 0f63d7a2..81191b49 100644 --- a/dejacode_toolkit/purldb.py +++ b/dejacode_toolkit/purldb.py @@ -61,6 +61,8 @@ def get_package_by_purl(self, package_url): def find_packages(self, payload, timeout=None): """Get Packages details using provided `payload` filters on the PurlDB package list.""" + payload.update({"sort": "package_content"}) + response = self.request_get(self.package_api_url, params=payload, timeout=timeout) if response and response.get("count") > 0: return response.get("results") From 4a7a13bb8f1eaeecdb3ced3a4234158e8027cf2c Mon Sep 17 00:00:00 2001 From: tdruez Date: 
Mon, 24 Nov 2025 18:58:32 +0400 Subject: [PATCH 3/5] Add package_content in details view Signed-off-by: tdruez --- component_catalog/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/component_catalog/views.py b/component_catalog/views.py index 4341e2a7..8e18d7be 100644 --- a/component_catalog/views.py +++ b/component_catalog/views.py @@ -1140,6 +1140,7 @@ class PackageDetailsView( "parties", "datasource_id", "file_references", + "package_content", ], }, "components": { @@ -1293,6 +1294,7 @@ def tab_others(self): TabField("parties"), TabField("datasource_id"), TabField("file_references"), + TabField("package_content"), ] fields = self.get_tab_fields(tab_fields) From 4ac11c2a1a222df01d74b2b54c9ea69ef1eb42eb Mon Sep 17 00:00:00 2001 From: tdruez Date: Mon, 24 Nov 2025 19:01:06 +0400 Subject: [PATCH 4/5] Fix unit tests Signed-off-by: tdruez --- dje/tests/testfiles/test_dataset_cc_only.json | 5 +++-- dje/tests/testfiles/test_dataset_pp_only.json | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dje/tests/testfiles/test_dataset_cc_only.json b/dje/tests/testfiles/test_dataset_cc_only.json index b80388ec..cab86eff 100644 --- a/dje/tests/testfiles/test_dataset_cc_only.json +++ b/dje/tests/testfiles/test_dataset_cc_only.json @@ -292,12 +292,13 @@ "vcs_url": "", "code_view_url": "", "bug_tracking_url": "", + "md5": "", + "sha1": "", "sha256": "", "sha512": "", + "package_content": null, "filename": "systemu-2.5.2.gem", "download_url": "https://s3.amazonaws.com/production.s3.rubygems.org/gems/systemu-2.5.2.gem", - "sha1": "", - "md5": "", "size": null, "release_date": null, "primary_language": "", diff --git a/dje/tests/testfiles/test_dataset_pp_only.json b/dje/tests/testfiles/test_dataset_pp_only.json index 988add2d..264e1bff 100644 --- a/dje/tests/testfiles/test_dataset_pp_only.json +++ b/dje/tests/testfiles/test_dataset_pp_only.json @@ -30,6 +30,7 @@ "sha1": "", "sha256": "", "sha512": "", + "package_content": null, "filename": 
"systemu-2.5.2.gem", "download_url": "https://s3.amazonaws.com/production.s3.rubygems.org/gems/systemu-2.5.2.gem", "size": null, From bf640c650f67209566b2dd53e5ddb5f8c4492438 Mon Sep 17 00:00:00 2001 From: tdruez Date: Mon, 24 Nov 2025 19:34:43 +0400 Subject: [PATCH 5/5] Pick the source package when multiple purldb_entries returned Signed-off-by: tdruez --- component_catalog/models.py | 18 ++++++++++++++++-- dejacode_toolkit/purldb.py | 14 ++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/component_catalog/models.py b/component_catalog/models.py index 3336bc2e..5f72e942 100644 --- a/component_catalog/models.py +++ b/component_catalog/models.py @@ -59,6 +59,7 @@ from dejacode_toolkit.download import collect_package_data from dejacode_toolkit.purldb import PurlDB from dejacode_toolkit.purldb import pick_purldb_entry +from dejacode_toolkit.purldb import pick_source_package from dejacode_toolkit.scancodeio import ScanCodeIO from dje import urn from dje.copier import post_copy @@ -2535,7 +2536,7 @@ def create_from_url(cls, url, user): package_for_match = cls(download_url=download_url) package_for_match.set_package_url(package_url) purldb_entries = package_for_match.get_purldb_entries(user) - # Look for one ith the same exact purl in that case + # Look for one with the same exact purl in that case if purldb_data := pick_purldb_entry(purldb_entries, purl=url): # The format from PurlDB is "2019-11-18T00:00:00Z" from DateTimeField if release_date := purldb_data.get("release_date"): @@ -2597,7 +2598,7 @@ def get_purldb_entries(self, user, max_request_call=0, timeout=10): if self.download_url: payloads.append({"download_url": self.download_url}) if package_url: - payloads.append({"purl": package_url, "sort": "package_content"}) + payloads.append({"purl": package_url}) purldb = PurlDB(user.dataspace) for index, payload in enumerate(payloads): @@ -2628,6 +2629,8 @@ def update_from_purldb(self, user): - Retrieves matching entries from PurlDB using the 
given user. - If exactly one match is found, its data is used directly. + - If multiple entries are found, leverage the package_content value when + available to select a "source" package. - If multiple entries are found, only values that are non-empty and common across all entries are merged and used to update the Package. """ @@ -2638,6 +2641,11 @@ def update_from_purldb(self, user): purldb_entries_count = len(purldb_entries) if purldb_entries_count == 1: package_data = purldb_entries[0] + elif source_package := pick_source_package(purldb_entries): + package_data = source_package + package_data["package_content"] = Package.get_package_content_value_from_label( + package_data["package_content"] + ) else: package_data = merge_common_non_empty_values(purldb_entries) @@ -2678,6 +2686,12 @@ def update_from_purldb(self, user): override=False, override_unknown=True, ) + + if updated_fields: + msg = f"Automatically updated {', '.join(updated_fields)} from PurlDB." + logger.debug(f"PurlDB: {msg}") + History.log_change(user, self, message=msg) + return updated_fields def update_from_scan(self, user, update_products=False): diff --git a/dejacode_toolkit/purldb.py b/dejacode_toolkit/purldb.py index 81191b49..83db1ab4 100644 --- a/dejacode_toolkit/purldb.py +++ b/dejacode_toolkit/purldb.py @@ -90,3 +90,17 @@ def pick_purldb_entry(purldb_entries, purl=None): matches = [entry for entry in purldb_entries if entry.get("purl") == purl] if len(matches) == 1: return matches[0] + + +def pick_source_package(purldb_entries): + """Pick a source package from a list of PurlDB entries.""" + if not purldb_entries: + return + + if len(purldb_entries) == 1: + return purldb_entries[0] + + for entry in purldb_entries: + package_content = entry.get("package_content") + if package_content and package_content.lower() == "source_archive": + return entry