From d82c45b9b49369fa660b253806cc7f97e54be1e1 Mon Sep 17 00:00:00 2001
From: Shrish0098
Date: Sat, 8 Mar 2025 11:20:33 +0530
Subject: [PATCH 1/2] Removed paginated() and added iterator ()

Signed-off-by: Shrish0098
---
 vulnerabilities/improvers/default.py              | 4 ++--
 vulnerabilities/improvers/valid_versions.py       | 6 +++---
 vulnerabilities/improvers/vulnerability_status.py | 2 +-
 vulnerabilities/management/commands/export.py     | 2 +-
 vulnerabilities/pipelines/__init__.py             | 2 +-
 vulnerabilities/pipelines/add_cvss31_to_CVEs.py   | 2 +-
 vulnerabilities/pipelines/collect_commits.py      | 4 +---
 vulnerabilities/pipelines/compute_package_risk.py | 4 ++--
 vulnerabilities/pipelines/flag_ghost_packages.py  | 2 +-
 9 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/vulnerabilities/improvers/default.py b/vulnerabilities/improvers/default.py
index f2e9009e8..7e1a90fda 100644
--- a/vulnerabilities/improvers/default.py
+++ b/vulnerabilities/improvers/default.py
@@ -44,9 +44,9 @@ def interesting_advisories(self) -> QuerySet:
             return (
                 Advisory.objects.filter(Q(created_by=self.importer.qualified_name))
                 .order_by("-date_collected")
-                .paginated()
+                .iterator()
             )
-        return Advisory.objects.all().order_by("-date_collected").paginated()
+        return Advisory.objects.all().order_by("-date_collected").iterator()
 
     def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
         if not advisory_data:
diff --git a/vulnerabilities/improvers/valid_versions.py b/vulnerabilities/improvers/valid_versions.py
index 916f36f59..434034bcc 100644
--- a/vulnerabilities/improvers/valid_versions.py
+++ b/vulnerabilities/improvers/valid_versions.py
@@ -64,8 +64,8 @@ class ValidVersionImprover(Improver):
     @property
     def interesting_advisories(self) -> QuerySet:
         if issubclass(self.importer, VulnerableCodeBaseImporterPipeline):
-            return Advisory.objects.filter(Q(created_by=self.importer.pipeline_id)).paginated()
-        return Advisory.objects.filter(Q(created_by=self.importer.qualified_name)).paginated()
+            return Advisory.objects.filter(Q(created_by=self.importer.pipeline_id)).iterator()
+        return Advisory.objects.filter(Q(created_by=self.importer.qualified_name)).iterator()
 
     def get_package_versions(
         self, package_url: PackageURL, until: Optional[datetime] = None
@@ -222,7 +222,7 @@ class NginxBasicImprover(Improver):
 
     @property
     def interesting_advisories(self) -> QuerySet:
-        return Advisory.objects.filter(created_by=NginxImporterPipeline.pipeline_id).paginated()
+        return Advisory.objects.filter(created_by=NginxImporterPipeline.pipeline_id).iterator()
 
     def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
         all_versions = list(self.fetch_nginx_version_from_git_tags())
diff --git a/vulnerabilities/improvers/vulnerability_status.py b/vulnerabilities/improvers/vulnerability_status.py
index 214e6dc35..cd717da1d 100644
--- a/vulnerabilities/improvers/vulnerability_status.py
+++ b/vulnerabilities/improvers/vulnerability_status.py
@@ -40,7 +40,7 @@ def interesting_advisories(self) -> QuerySet:
         return (
             Advisory.objects.filter(Q(created_by=NVDImporterPipeline.pipeline_id))
             .distinct("aliases")
-            .paginated()
+            .iterator()
         )
 
     def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
diff --git a/vulnerabilities/management/commands/export.py b/vulnerabilities/management/commands/export.py
index 08685e33d..0da38bbdc 100644
--- a/vulnerabilities/management/commands/export.py
+++ b/vulnerabilities/management/commands/export.py
@@ -159,7 +159,7 @@ def packages_by_type_ns_name():
                 "fixing_vulnerabilities__weaknesses",
                 "fixing_vulnerabilities__severities",
             )
-            .paginated()
+            .iterator()
         )
 
         for tp_ns_name, packages in groupby(qs, key=by_purl_type_ns_name):
diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py
index d74db9f35..1b3613933 100644
--- a/vulnerabilities/pipelines/__init__.py
+++ b/vulnerabilities/pipelines/__init__.py
@@ -170,7 +170,7 @@ def import_new_advisories(self):
         imported_advisory_count = 0
         progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
 
-        for advisory in progress.iter(new_advisories.paginated()):
+        for advisory in progress.iter(new_advisories.iterator()):
             self.import_advisory(advisory=advisory)
             if advisory.date_imported:
                 imported_advisory_count += 1
diff --git a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
index acda42b52..9de6f9aae 100644
--- a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
+++ b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
@@ -47,7 +47,7 @@ def process_cve_advisory_mapping(self):
         batch_size = 1000
         results = []
 
-        for severity in progress.iter(nvd_severities.paginated(per_page=batch_size)):
+        for severity in progress.iter(nvd_severities.iterator()):
             print(severity.url)
             cve_pattern = re.compile(r"(CVE-\d{4}-\d{4,7})").search
             cve_match = cve_pattern(severity.url)
diff --git a/vulnerabilities/pipelines/collect_commits.py b/vulnerabilities/pipelines/collect_commits.py
index 92145c051..347fdb94d 100644
--- a/vulnerabilities/pipelines/collect_commits.py
+++ b/vulnerabilities/pipelines/collect_commits.py
@@ -51,9 +51,7 @@ def collect_and_store_fix_commits(self):
             total_iterations=affected_by_package_related_vulnerabilities.count(), logger=self.log
         )
 
-        for apv in progress.iter(
-            affected_by_package_related_vulnerabilities.paginated(per_page=500)
-        ):
+        for apv in progress.iter(affected_by_package_related_vulnerabilities.iterator()):
             vulnerability = apv.vulnerability
             for reference in vulnerability.references.all():
                 if not "/commit/" in reference.url:
diff --git a/vulnerabilities/pipelines/compute_package_risk.py b/vulnerabilities/pipelines/compute_package_risk.py
index 7ac4de838..2f77467df 100644
--- a/vulnerabilities/pipelines/compute_package_risk.py
+++ b/vulnerabilities/pipelines/compute_package_risk.py
@@ -54,7 +54,7 @@ def compute_and_store_vulnerability_risk_score(self):
         updated_vulnerability_count = 0
         batch_size = 5000
 
-        for vulnerability in progress.iter(affected_vulnerabilities.paginated(per_page=batch_size)):
+        for vulnerability in progress.iter(affected_vulnerabilities.iterator()):
             severities = vulnerability.severities.all()
             references = vulnerability.references.all()
             exploits = vulnerability.exploits.all()
@@ -110,7 +110,7 @@ def compute_and_store_package_risk_score(self):
         updated_package_count = 0
         batch_size = 10000
 
-        for package in progress.iter(affected_packages.paginated(per_page=batch_size)):
+        for package in progress.iter(affected_packages.iterator()):
             risk_score = compute_package_risk(package)
 
             if not risk_score:
diff --git a/vulnerabilities/pipelines/flag_ghost_packages.py b/vulnerabilities/pipelines/flag_ghost_packages.py
index 7daee4115..33e504676 100644
--- a/vulnerabilities/pipelines/flag_ghost_packages.py
+++ b/vulnerabilities/pipelines/flag_ghost_packages.py
@@ -49,7 +49,7 @@ def detect_and_flag_ghost_packages(logger=None):
     )
 
     grouped_packages = groupby(
-        interesting_packages_qs.paginated(),
+        interesting_packages_qs.iterator(),
        key=lambda pkg: (pkg.type, pkg.namespace, pkg.name),
     )
From ab2ab47fb0f3567102f0ee0e8e70e9e70bd0b10b Mon Sep 17 00:00:00 2001
From: Shrish0098
Date: Thu, 13 Mar 2025 22:43:39 +0530
Subject: [PATCH 2/2] Made the changes required

Signed-off-by: Shrish Mishra <shrish409@gmail.com>
Signed-off-by: Shrish0098
---
 vulnerabilities/models.py                         | 14 --------------
 vulnerabilities/pipelines/add_cvss31_to_CVEs.py   |  3 +--
 vulnerabilities/pipelines/collect_commits.py      |  4 +++-
 vulnerabilities/pipelines/compute_package_risk.py |  4 ++--
 4 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
index 9b6df7c13..c93a7ff5d 100644
--- a/vulnerabilities/models.py
+++ b/vulnerabilities/models.py
@@ -74,20 +74,6 @@ def get_or_none(self, *args, **kwargs):
         with suppress(self.model.DoesNotExist, ValidationError):
             return self.get(*args, **kwargs)
 
-    def paginated(self, per_page=5000):
-        """
-        Iterate over a (large) QuerySet by chunks of ``per_page`` items.
-        This technique is essential for preventing memory issues when iterating
-        See these links for inspiration:
-        https://nextlinklabs.com/resources/insights/django-big-data-iteration
-        https://stackoverflow.com/questions/4222176/why-is-iterating-through-a-large-django-queryset-consuming-massive-amounts-of-me/
-        """
-        paginator = Paginator(self, per_page=per_page)
-        for page_number in paginator.page_range:
-            page = paginator.page(page_number)
-            for obj in page.object_list:
-                yield obj
-
 
 class VulnerabilityQuerySet(BaseQuerySet):
     def affecting_vulnerabilities(self):
diff --git a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
index 9de6f9aae..5582eff8f 100644
--- a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
+++ b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
@@ -44,10 +44,9 @@ def process_cve_advisory_mapping(self):
             progress_step=5,
         )
 
-        batch_size = 1000
         results = []
 
-        for severity in progress.iter(nvd_severities.iterator()):
+        for severity in progress.iter(nvd_severities.iterator(chunk_size=2000)):
             print(severity.url)
             cve_pattern = re.compile(r"(CVE-\d{4}-\d{4,7})").search
             cve_match = cve_pattern(severity.url)
diff --git a/vulnerabilities/pipelines/collect_commits.py b/vulnerabilities/pipelines/collect_commits.py
index 347fdb94d..6d18bb831 100644
--- a/vulnerabilities/pipelines/collect_commits.py
+++ b/vulnerabilities/pipelines/collect_commits.py
@@ -51,7 +51,9 @@ def collect_and_store_fix_commits(self):
             total_iterations=affected_by_package_related_vulnerabilities.count(), logger=self.log
         )
 
-        for apv in progress.iter(affected_by_package_related_vulnerabilities.iterator()):
+        for apv in progress.iter(
+            affected_by_package_related_vulnerabilities.iterator(chunk_size=2000)
+        ):
             vulnerability = apv.vulnerability
             for reference in vulnerability.references.all():
                 if not "/commit/" in reference.url:
diff --git a/vulnerabilities/pipelines/compute_package_risk.py b/vulnerabilities/pipelines/compute_package_risk.py
index 2f77467df..d1c75f042 100644
--- a/vulnerabilities/pipelines/compute_package_risk.py
+++ b/vulnerabilities/pipelines/compute_package_risk.py
@@ -54,7 +54,7 @@ def compute_and_store_vulnerability_risk_score(self):
         updated_vulnerability_count = 0
         batch_size = 5000
 
-        for vulnerability in progress.iter(affected_vulnerabilities.iterator()):
+        for vulnerability in progress.iter(affected_vulnerabilities.iterator(chunk_size=2000)):
             severities = vulnerability.severities.all()
             references = vulnerability.references.all()
             exploits = vulnerability.exploits.all()
@@ -110,7 +110,7 @@ def compute_and_store_package_risk_score(self):
         updated_package_count = 0
         batch_size = 10000
 
-        for package in progress.iter(affected_packages.iterator()):
+        for package in progress.iter(affected_packages.iterator(chunk_size=2000)):
             risk_score = compute_package_risk(package)
 
             if not risk_score:
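Background on the chunk_size=2000 arguments added in this patch: chunk_size only
sets how many rows Django fetches from the database cursor per round trip, not how
many rows get processed overall; 2000 is also the documented default, and since
Django 4.1 passing chunk_size explicitly is what lets iterator() honour
prefetch_related() on a queryset. A small usage sketch; the Advisory model and the
"-date_collected" ordering come from this patch, while the loop bodies are
placeholders:

    from vulnerabilities.models import Advisory

    qs = Advisory.objects.order_by("-date_collected")

    # Implicit default: roughly 2000 rows fetched per driver round trip.
    for advisory in qs.iterator():
        ...

    # Explicit chunk size: larger chunks mean fewer round trips but more rows
    # held in memory at once; smaller chunks trade the other way.
    for advisory in qs.iterator(chunk_size=2000):
        ...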