Skip to content

Commit c40d31f

Browse files
committed
New content ID pipeline
Signed-off-by: Tushar Goel <tushar.goel.dav@gmail.com>
1 parent 85e6e08 commit c40d31f

12 files changed

+644
-312
lines changed

vulnerabilities/importer.py

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import dataclasses
1111
import datetime
12+
import functools
1213
import logging
1314
import os
1415
import shutil
@@ -46,7 +47,8 @@
4647
logger = logging.getLogger(__name__)
4748

4849

49-
@dataclasses.dataclass(frozen=True)
50+
@dataclasses.dataclass(eq=True)
51+
@functools.total_ordering
5052
class VulnerabilitySeverity:
5153
# FIXME: this should be named scoring_system, like in the model
5254
system: ScoringSystem
@@ -55,25 +57,23 @@ class VulnerabilitySeverity:
5557
published_at: Optional[datetime.datetime] = None
5658

5759
def to_dict(self):
58-
published_at_dict = (
59-
{"published_at": self.published_at.isoformat()} if self.published_at else {}
60-
)
61-
return {
60+
data = {
6261
"system": self.system.identifier,
6362
"value": self.value,
6463
"scoring_elements": self.scoring_elements,
65-
**published_at_dict,
6664
}
67-
68-
def __eq__(self, other):
69-
if not isinstance(other, VulnerabilitySeverity):
70-
return NotImplemented
71-
return str(self.to_dict()) == str(other.to_dict())
65+
if self.published_at:
66+
data["published_at"] = self.published_at.isoformat()
67+
return data
7268

7369
def __lt__(self, other):
7470
if not isinstance(other, VulnerabilitySeverity):
7571
return NotImplemented
76-
return str(self.to_dict()) < str(other.to_dict())
72+
return self._cmp_key() < other._cmp_key()
73+
74+
# TODO: Add cache
75+
def _cmp_key(self):
    """
    Return the tuple that ``__lt__`` compares to order two severities.

    The key is made of the scoring system rendered as a string, the value,
    the scoring elements and the publication date, in that order.

    ``published_at`` is optional, and a bare ``None`` cannot be ordered
    against a ``datetime``. Each optional field is therefore wrapped as
    ``(is_set, field)``: unset fields sort before set ones and the raw
    values are only compared with each other when both are set.
    """
    # NOTE(review): scoring_elements is guarded the same way on the
    # assumption that it may also be None — confirm against the field
    # declaration, which is outside this hunk.
    return (
        str(self.system),
        self.value,
        (self.scoring_elements is not None, self.scoring_elements),
        (self.published_at is not None, self.published_at),
    )
7777

7878
@classmethod
7979
def from_dict(cls, severity: dict):
@@ -89,7 +89,8 @@ def from_dict(cls, severity: dict):
8989
)
9090

9191

92-
@dataclasses.dataclass(frozen=True)
92+
@dataclasses.dataclass(eq=True)
93+
@functools.total_ordering
9394
class Reference:
9495
reference_id: str = ""
9596
reference_type: str = ""
@@ -100,31 +101,22 @@ def __post_init__(self):
100101
if not self.url:
101102
raise TypeError("Reference must have a url")
102103

103-
def normalized(self):
104-
severities = sorted(self.severities)
105-
return Reference(
106-
reference_id=self.reference_id,
107-
url=self.url,
108-
severities=severities,
109-
reference_type=self.reference_type,
110-
)
111-
112-
def __eq__(self, other):
113-
if not isinstance(other, Reference):
114-
return NotImplemented
115-
return str(self.to_dict()) == str(other.to_dict())
116-
117104
def __lt__(self, other):
118105
if not isinstance(other, Reference):
119106
return NotImplemented
120-
return str(self.to_dict()) < str(other.to_dict())
107+
return self._cmp_key() < other._cmp_key()
108+
109+
# TODO: Add cache
110+
def _cmp_key(self):
    """
    Return the tuple that ``__lt__`` compares to order two references.

    The severities are snapshotted as a tuple, in their current order, so
    that they take part in the element-wise tuple comparison.
    """
    severities = tuple(self.severities)
    return (self.reference_id, self.reference_type, self.url, severities)
121112

122113
def to_dict(self):
114+
"""Return a normalized dictionary representation"""
123115
return {
124116
"reference_id": self.reference_id,
125117
"reference_type": self.reference_type,
126118
"url": self.url,
127-
"severities": [severity.to_dict() for severity in self.severities],
119+
"severities": [severity.to_dict() for severity in sorted(self.severities)],
128120
}
129121

130122
@classmethod
@@ -160,7 +152,8 @@ class NoAffectedPackages(Exception):
160152
"""
161153

162154

163-
@dataclasses.dataclass(frozen=True)
155+
@functools.total_ordering
156+
@dataclasses.dataclass(eq=True)
164157
class AffectedPackage:
165158
"""
166159
Relate a Package URL with a range of affected versions and a fixed version.
@@ -190,15 +183,14 @@ def get_fixed_purl(self):
190183
raise ValueError(f"Affected Package {self.package!r} does not have a fixed version")
191184
return update_purl_version(purl=self.package, version=str(self.fixed_version))
192185

193-
def __eq__(self, other):
194-
if not isinstance(other, AffectedPackage):
195-
return NotImplemented
196-
return str(self.to_dict()) == str(other.to_dict())
197-
198186
def __lt__(self, other):
199187
if not isinstance(other, AffectedPackage):
200188
return NotImplemented
201-
return str(self.to_dict()) < str(other.to_dict())
189+
return self._cmp_key() < other._cmp_key()
190+
191+
# TODO: Add cache
192+
def _cmp_key(self):
    """
    Return the tuple that ``__lt__`` compares to order two affected packages.

    Every component is converted with ``str()`` so the comparison is always
    string against string, including when a component is ``None``.
    """
    components = (self.package, self.affected_version_range, self.fixed_version)
    return tuple(str(component) for component in components)
202194

203195
@classmethod
204196
def merge(
@@ -304,7 +296,6 @@ class AdvisoryData:
304296
date_published: Optional[datetime.datetime] = None
305297
weaknesses: List[int] = dataclasses.field(default_factory=list)
306298
url: Optional[str] = None
307-
created_by: Optional[str] = None
308299

309300
def __post_init__(self):
310301
if self.date_published and not self.date_published.tzinfo:
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Generated by Django 4.2.16 on 2025-02-06 14:27
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Alter ``Advisory.unique_content_id`` to a blank-able 64-character field."""

    # Must run after the migration that precedes it in this app.
    dependencies = [("vulnerabilities", "0088_fix_alpine_purl_type")]

    operations = [
        migrations.AlterField(
            model_name="advisory",
            name="unique_content_id",
            field=models.CharField(max_length=64, blank=True),
        ),
    ]

vulnerabilities/models.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@
4444
from aboutcode import hashid
4545
from vulnerabilities import utils
4646
from vulnerabilities.severity_systems import SCORING_SYSTEMS
47-
from vulnerabilities.utils import compute_content_id, normalize_purl
47+
from vulnerabilities.utils import compute_content_id
48+
from vulnerabilities.utils import normalize_purl
4849
from vulnerabilities.utils import purl_to_dict
4950
from vulnerablecode import __version__ as VULNERABLECODE_VERSION
5051

@@ -1177,7 +1178,7 @@ class Advisory(models.Model):
11771178
"""
11781179

11791180
unique_content_id = models.CharField(
1180-
max_length=32,
1181+
max_length=64,
11811182
blank=True,
11821183
)
11831184
aliases = models.JSONField(blank=True, default=list, help_text="A list of alias strings")
@@ -1230,10 +1231,10 @@ def save(self, *args, **kwargs):
12301231
checksum.update(value)
12311232
self.unique_content_id = checksum.hexdigest()
12321233
super().save(*args, **kwargs)
1233-
1234+
12341235
def save(self, *args, **kwargs):
    """
    Recompute ``unique_content_id`` from this advisory's data, then save.

    The content ID is derived from ``to_advisory_data()`` with
    ``include_metadata=False`` on every save. All positional and keyword
    arguments are forwarded unchanged to the parent ``save()``.
    """
    # NOTE(review): an earlier ``save`` defined in this class body appears to
    # be shadowed by this definition — confirm and remove the dead one.
    advisory_data = self.to_advisory_data()
    # Store the full content ID: the column is declared with max_length=64,
    # so slicing it shorter only discards digest characters and makes the
    # stored value differ from what compute_content_id() returns.
    self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)
    super().save(*args, **kwargs)
12381239

12391240
def to_advisory_data(self) -> "AdvisoryData":
@@ -1251,7 +1252,6 @@ def to_advisory_data(self) -> "AdvisoryData":
12511252
date_published=self.date_published,
12521253
weaknesses=self.weaknesses,
12531254
url=self.url,
1254-
created_by=self.created_by
12551255
)
12561256

12571257

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import logging
11+
from itertools import groupby
12+
from operator import attrgetter
13+
14+
from django.db.models import Count
15+
from django.db.models import Q
16+
17+
from vulnerabilities.models import Advisory
18+
from vulnerabilities.pipelines import VulnerableCodePipeline
19+
from vulnerabilities.utils import compute_content_id
20+
21+
22+
class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
    """Pipeline to remove duplicate advisories based on their content."""

    pipeline_id = "remove_duplicate_advisories"

    @classmethod
    def steps(cls):
        """Return the pipeline steps; only ``remove_duplicates`` is registered."""
        return (cls.remove_duplicates,)

    def remove_duplicates(self):
        """
        Find advisories with the same content and keep only the latest one.

        Advisories are grouped by ``unique_content_id``. Within each group
        the row with the latest ``date_imported`` is kept and every other row
        is deleted.
        """
        # A blank content ID means "not computed", not "same content":
        # grouping on it would delete unrelated advisories, so those rows are
        # excluded from duplicate detection.
        missing_content_id = Q(unique_content_id="") | Q(unique_content_id__isnull=True)
        duplicate_content_ids = list(
            Advisory.objects.exclude(missing_content_id)
            .values("unique_content_id")
            .annotate(count=Count("id"))
            .filter(count__gt=1)
            .values_list("unique_content_id", flat=True)
        )

        self.log(
            f"Found {len(duplicate_content_ids)} content IDs with duplicates", level=logging.INFO
        )

        for content_id in duplicate_content_ids:
            advisories = Advisory.objects.filter(unique_content_id=content_id)

            # NOTE(review): if date_imported can be NULL, which row counts as
            # "latest" depends on the database's NULL ordering — confirm.
            latest = advisories.latest("date_imported")

            # delete() reports per-model counts; use the Advisory entry so
            # cascaded deletions of related rows are not counted, and so the
            # number is taken from the delete itself rather than re-queried
            # after the rows are already gone.
            _, deleted_by_model = advisories.exclude(id=latest.id).delete()
            removed = deleted_by_model.get(Advisory._meta.label, 0)

            self.log(
                f"Kept advisory {latest.id} and removed "
                f"{removed} duplicates for content ID {content_id}",
                level=logging.INFO,
            )

    def update_content_ids(self):
        """
        Update content IDs for all advisories that don't have one.

        This method is not part of ``steps()`` and only runs when called
        explicitly.
        """
        advisories = Advisory.objects.filter(
            Q(unique_content_id="") | Q(unique_content_id__isnull=True)
        )

        self.log(f"Found {advisories.count()} advisories without content ID", level=logging.INFO)

        for advisory in advisories:
            # NOTE(review): Advisory.save() calls compute_content_id() on
            # to_advisory_data() rather than on the model instance, and it
            # recomputes unique_content_id itself — confirm this assignment
            # is still needed and that the helper accepts an Advisory.
            advisory.unique_content_id = compute_content_id(advisory)
            advisory.save()

            self.log(f"Updated content ID for advisory {advisory.id}", level=logging.DEBUG)

vulnerabilities/severity_systems.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ def compute(self, scoring_elements: str) -> str:
4242
def get(self, scoring_elements: str):
    """
    Placeholder for subclasses that can interpret ``scoring_elements``.

    The base implementation supports no scoring scheme; subclasses are
    presumably expected to override it (TODO confirm which ones do).

    Raises:
        NotImplementedError: always, on the base class. The exception is
            raised rather than returned so a caller can never mistake the
            exception class for a real result.
    """
    raise NotImplementedError
4444

45+
def __str__(self):
    """Return this scoring system's ``identifier`` formatted as a string."""
    return "{}".format(self.identifier)
47+
4548

4649
@dataclasses.dataclass(order=True)
4750
class Cvssv2ScoringSystem(ScoringSystem):

0 commit comments

Comments
 (0)