From 0922e447c852471e0295eea5da6cd8f75cba869a Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 14:51:07 +0530 Subject: [PATCH 01/13] Migrate Xen importer Signed-off-by: Tushar Goel --- vulnerabilities/importer.py | 2 + vulnerabilities/importers/__init__.py | 4 +- vulnerabilities/importers/curl.py | 2 +- vulnerabilities/importers/osv.py | 4 +- vulnerabilities/improvers/__init__.py | 4 - .../0099_advisoryv2_original_advisory_text.py | 22 ++++ vulnerabilities/models.py | 6 ++ vulnerabilities/pipelines/__init__.py | 21 ++-- .../v2_importers/apache_httpd_importer.py | 7 ++ .../v2_importers/elixir_security_importer.py | 5 + .../pipelines/v2_importers/github_importer.py | 2 + .../pipelines/v2_importers/gitlab_importer.py | 2 + .../pipelines/v2_importers/npm_importer.py | 5 + .../pipelines/v2_importers/nvd_importer.py | 1 + .../v2_importers/postgresql_importer.py | 7 +- .../pipelines/v2_importers/pypa_importer.py | 4 +- .../pipelines/v2_importers/pysec_importer.py | 2 + .../v2_importers/vulnrichment_importer.py | 1 + .../pipelines/v2_importers/xen_importer.py | 101 ++++++++++++++++++ vulnerabilities/pipes/advisory.py | 1 + 20 files changed, 184 insertions(+), 19 deletions(-) create mode 100644 vulnerabilities/migrations/0099_advisoryv2_original_advisory_text.py create mode 100644 vulnerabilities/pipelines/v2_importers/xen_importer.py diff --git a/vulnerabilities/importer.py b/vulnerabilities/importer.py index 4e0f015da..52f9d62dd 100644 --- a/vulnerabilities/importer.py +++ b/vulnerabilities/importer.py @@ -22,6 +22,7 @@ from typing import Optional from typing import Set from typing import Tuple +from typing import Union import pytz from dateutil import parser as dateparser @@ -361,6 +362,7 @@ class AdvisoryData: weaknesses: List[int] = dataclasses.field(default_factory=list) severities: List[VulnerabilitySeverity] = dataclasses.field(default_factory=list) url: Optional[str] = None + original_advisory_text: Optional[str] = None def __post_init__(self): if 
self.date_published and not self.date_published.tzinfo: diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index a72ef981e..77277262f 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -45,26 +45,26 @@ from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) -from vulnerabilities.pipelines.v2_importers import github_importer as github_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import npm_importer as npm_importer_v2 from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2 from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2 from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 +from vulnerabilities.pipelines.v2_importers import xen_importer as xen_importer_v2 from vulnerabilities.utils import create_registry IMPORTERS_REGISTRY = create_registry( [ nvd_importer_v2.NVDImporterPipeline, elixir_security_importer_v2.ElixirSecurityImporterPipeline, - github_importer_v2.GitHubAPIImporterPipeline, npm_importer_v2.NpmImporterPipeline, vulnrichment_importer_v2.VulnrichImporterPipeline, apache_httpd_v2.ApacheHTTPDImporterPipeline, pypa_importer_v2.PyPaImporterPipeline, gitlab_importer_v2.GitLabImporterPipeline, pysec_importer_v2.PyPIImporterPipeline, + xen_importer_v2.XenImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/importers/curl.py b/vulnerabilities/importers/curl.py index 7cbc3208e..30fa9f6c5 100644 --- a/vulnerabilities/importers/curl.py +++ b/vulnerabilities/importers/curl.py @@ -97,7 +97,7 @@ def parse_advisory_data(raw_data) 
-> AdvisoryData: ... ] ... } >>> parse_advisory_data(raw_data) - AdvisoryData(advisory_id='', aliases=['CVE-2024-2379'], summary='QUIC certificate check bypass with wolfSSL', affected_packages=[AffectedPackage(package=PackageURL(type='generic', namespace='curl.se', name='curl', version=None, qualifiers={}, subpath=None), affected_version_range=GenericVersionRange(constraints=(VersionConstraint(comparator='=', version=SemverVersion(string='8.6.0')),)), fixed_version=SemverVersion(string='8.7.0'))], references=[Reference(reference_id='', reference_type='', url='https://curl.se/docs/CVE-2024-2379.html', severities=[VulnerabilitySeverity(system=Cvssv3ScoringSystem(identifier='cvssv3.1', name='CVSSv3.1 Base Score', url='https://www.first.org/cvss/v3-1/', notes='CVSSv3.1 base score and vector'), value='Low', scoring_elements='', published_at=None, url=None)]), Reference(reference_id='', reference_type='', url='https://hackerone.com/reports/2410774', severities=[])], references_v2=[], date_published=datetime.datetime(2024, 3, 27, 8, 0, tzinfo=datetime.timezone.utc), weaknesses=[297], severities=[], url='https://curl.se/docs/CVE-2024-2379.json') + AdvisoryData(advisory_id='', aliases=['CVE-2024-2379'], summary='QUIC certificate check bypass with wolfSSL', affected_packages=[AffectedPackage(package=PackageURL(type='generic', namespace='curl.se', name='curl', version=None, qualifiers={}, subpath=None), affected_version_range=GenericVersionRange(constraints=(VersionConstraint(comparator='=', version=SemverVersion(string='8.6.0')),)), fixed_version=SemverVersion(string='8.7.0'))], references=[Reference(reference_id='', reference_type='', url='https://curl.se/docs/CVE-2024-2379.html', severities=[VulnerabilitySeverity(system=Cvssv3ScoringSystem(identifier='cvssv3.1', name='CVSSv3.1 Base Score', url='https://www.first.org/cvss/v3-1/', notes='CVSSv3.1 base score and vector'), value='Low', scoring_elements='', published_at=None, url=None)]), Reference(reference_id='', 
reference_type='', url='https://hackerone.com/reports/2410774', severities=[])], references_v2=[], date_published=datetime.datetime(2024, 3, 27, 8, 0, tzinfo=datetime.timezone.utc), weaknesses=[297], severities=[], url='https://curl.se/docs/CVE-2024-2379.json', original_advisory_text=None) """ affected = get_item(raw_data, "affected")[0] if len(get_item(raw_data, "affected")) > 0 else [] diff --git a/vulnerabilities/importers/osv.py b/vulnerabilities/importers/osv.py index 76be8ef0f..be27492d7 100644 --- a/vulnerabilities/importers/osv.py +++ b/vulnerabilities/importers/osv.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import json import logging from typing import Iterable from typing import List @@ -109,7 +110,7 @@ def parse_advisory_data( def parse_advisory_data_v2( - raw_data: dict, supported_ecosystems, advisory_url: str + raw_data: dict, supported_ecosystems, advisory_url: str, advisory_text: str ) -> Optional[AdvisoryData]: """ Return an AdvisoryData build from a ``raw_data`` mapping of OSV advisory and @@ -173,6 +174,7 @@ def parse_advisory_data_v2( date_published=date_published, weaknesses=weaknesses, url=advisory_url, + original_advisory_text=advisory_text or json.dumps(raw_data, indent=2, ensure_ascii=False), ) diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py index be6f73cb9..af8de5dbd 100644 --- a/vulnerabilities/improvers/__init__.py +++ b/vulnerabilities/improvers/__init__.py @@ -10,7 +10,6 @@ from vulnerabilities.improvers import valid_versions from vulnerabilities.improvers import vulnerability_status from vulnerabilities.pipelines import add_cvss31_to_CVEs -from vulnerabilities.pipelines import collect_commits from vulnerabilities.pipelines import compute_advisory_todo from vulnerabilities.pipelines import compute_package_risk from vulnerabilities.pipelines import compute_package_version_rank @@ -20,7 +19,6 @@ from vulnerabilities.pipelines import 
flag_ghost_packages from vulnerabilities.pipelines import populate_vulnerability_summary_pipeline from vulnerabilities.pipelines import remove_duplicate_advisories -from vulnerabilities.pipelines.v2_improvers import collect_commits as collect_commits_v2 from vulnerabilities.pipelines.v2_improvers import compute_package_risk as compute_package_risk_v2 from vulnerabilities.pipelines.v2_improvers import ( computer_package_version_rank as compute_version_rank_v2, @@ -58,7 +56,6 @@ enhance_with_exploitdb.ExploitDBImproverPipeline, compute_package_risk.ComputePackageRiskPipeline, compute_package_version_rank.ComputeVersionRankPipeline, - collect_commits.CollectFixCommitsPipeline, add_cvss31_to_CVEs.CVEAdvisoryMappingPipeline, remove_duplicate_advisories.RemoveDuplicateAdvisoriesPipeline, populate_vulnerability_summary_pipeline.PopulateVulnerabilitySummariesPipeline, @@ -68,7 +65,6 @@ enhance_with_metasploit_v2.MetasploitImproverPipeline, compute_package_risk_v2.ComputePackageRiskPipeline, compute_version_rank_v2.ComputeVersionRankPipeline, - collect_commits_v2.CollectFixCommitsPipeline, compute_advisory_todo.ComputeToDo, ] ) diff --git a/vulnerabilities/migrations/0099_advisoryv2_original_advisory_text.py b/vulnerabilities/migrations/0099_advisoryv2_original_advisory_text.py new file mode 100644 index 000000000..703aa81f6 --- /dev/null +++ b/vulnerabilities/migrations/0099_advisoryv2_original_advisory_text.py @@ -0,0 +1,22 @@ +# Generated by Django 4.2.22 on 2025-07-16 08:39 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("vulnerabilities", "0098_alter_advisory_options_alter_advisoryalias_options_and_more"), + ] + + operations = [ + migrations.AddField( + model_name="advisoryv2", + name="original_advisory_text", + field=models.TextField( + blank=True, + help_text="Raw advisory data as collected from the upstream datasource.", + null=True, + ), + ), + ] diff --git a/vulnerabilities/models.py 
b/vulnerabilities/models.py index 54a00d032..c4a302536 100644 --- a/vulnerabilities/models.py +++ b/vulnerabilities/models.py @@ -2744,6 +2744,12 @@ class AdvisoryV2(models.Model): blank=True, null=True, help_text="UTC Date on which the advisory was imported" ) + original_advisory_text = models.TextField( + blank=True, + null=True, + help_text="Raw advisory data as collected from the upstream datasource.", + ) + affecting_packages = models.ManyToManyField( "PackageV2", related_name="affected_by_advisories", diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py index 0a49e080c..6232a294d 100644 --- a/vulnerabilities/pipelines/__init__.py +++ b/vulnerabilities/pipelines/__init__.py @@ -307,13 +307,20 @@ def collect_and_store_advisories(self): if advisory is None: self.log("Advisory is None, skipping") continue - if _obj := insert_advisory_v2( - advisory=advisory, - pipeline_id=self.pipeline_id, - get_advisory_packages=self.get_advisory_packages, - logger=self.log, - ): - collected_advisory_count += 1 + try: + if _obj := insert_advisory_v2( + advisory=advisory, + pipeline_id=self.pipeline_id, + get_advisory_packages=self.get_advisory_packages, + logger=self.log, + ): + collected_advisory_count += 1 + except Exception as e: + self.log( + f"Failed to import advisory: {advisory!r} with error {e!r}:\n{traceback_format_exc()}", + level=logging.ERROR, + ) + continue self.log(f"Successfully collected {collected_advisory_count:,d} advisories") diff --git a/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py b/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py index 77b55f7dc..c90af00c4 100644 --- a/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py +++ b/vulnerabilities/pipelines/v2_importers/apache_httpd_importer.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# +import json import logging import re import urllib.parse @@ -14,6 +15,7 @@ import requests from bs4 import BeautifulSoup +from dateutil import parser as date_parser from packageurl import PackageURL from univers.version_constraint import VersionConstraint from univers.version_range import ApacheVersionRange @@ -272,8 +274,11 @@ def to_advisory(self, data): versions_data.append(version_data) fixed_versions = [] + date_published = None for timeline_object in data.get("timeline") or []: timeline_value = timeline_object.get("value") + if timeline_value == "public": + date_published = timeline_object.get("time") if "release" in timeline_value: split_timeline_value = timeline_value.split(" ") if "never" in timeline_value: @@ -307,6 +312,8 @@ def to_advisory(self, data): weaknesses=weaknesses, url=reference.url, severities=severities, + original_advisory_text=json.dumps(data, indent=2, ensure_ascii=False), + date_published=date_parser.parse(date_published) if date_published else None, ) def to_version_ranges(self, versions_data, fixed_versions): diff --git a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py index 384a2dafb..64c31cb45 100644 --- a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py +++ b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py @@ -73,6 +73,10 @@ def process_file(self, file, base_path) -> Iterable[AdvisoryData]: advisory_url = ( f"https://github.com/dependabot/elixir-security-advisories/blob/master/{relative_path}" ) + advisory_text = None + with open(str(file)) as f: + advisory_text = f.read() + yaml_file = load_yaml(str(file)) summary = yaml_file.get("description") or "" @@ -129,4 +133,5 @@ def process_file(self, file, base_path) -> Iterable[AdvisoryData]: affected_packages=affected_packages, url=advisory_url, date_published=date_published, + original_advisory_text=advisory_text or str(yaml_file), ) diff --git 
a/vulnerabilities/pipelines/v2_importers/github_importer.py b/vulnerabilities/pipelines/v2_importers/github_importer.py index 55ac93716..3101679e6 100644 --- a/vulnerabilities/pipelines/v2_importers/github_importer.py +++ b/vulnerabilities/pipelines/v2_importers/github_importer.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import json import logging from traceback import format_exc as traceback_format_exc from typing import Callable @@ -368,6 +369,7 @@ def process_response( date_published=date_published, weaknesses=weaknesses, url=f"https://github.com/advisories/{ghsa_id}", + original_advisory_text=json.dumps(github_advisory, indent=2, ensure_ascii=False), ) diff --git a/vulnerabilities/pipelines/v2_importers/gitlab_importer.py b/vulnerabilities/pipelines/v2_importers/gitlab_importer.py index 13f61bd75..52d9eb147 100644 --- a/vulnerabilities/pipelines/v2_importers/gitlab_importer.py +++ b/vulnerabilities/pipelines/v2_importers/gitlab_importer.py @@ -7,6 +7,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# +import json import logging import traceback from pathlib import Path @@ -325,4 +326,5 @@ def parse_gitlab_advisory( affected_packages=affected_packages, weaknesses=cwe_list, url=advisory_url, + original_advisory_text=json.dumps(gitlab_advisory, indent=2, ensure_ascii=False), ) diff --git a/vulnerabilities/pipelines/v2_importers/npm_importer.py b/vulnerabilities/pipelines/v2_importers/npm_importer.py index 67e2a4355..5f0306413 100644 --- a/vulnerabilities/pipelines/v2_importers/npm_importer.py +++ b/vulnerabilities/pipelines/v2_importers/npm_importer.py @@ -9,6 +9,7 @@ # Author: Navonil Das (@NavonilDas) +import json from pathlib import Path from typing import Iterable @@ -69,6 +70,9 @@ def to_advisory_data(self, file: Path) -> Iterable[AdvisoryData]: self.log(f"Skipping {file.name} file") return data = load_json(file) + advisory_text = None + with open(file) as f: + advisory_text = f.read() id = data.get("id") description = data.get("overview") or "" summary = data.get("title") or "" @@ -130,6 +134,7 @@ def to_advisory_data(self, file: Path) -> Iterable[AdvisoryData]: references_v2=references, severities=severities, url=f"https://github.com/nodejs/security-wg/blob/main/vuln/npm/{id}.json", + original_advisory_text=advisory_text or json.dumps(data, indent=2, ensure_ascii=False), ) def get_affected_package(self, data, package_name): diff --git a/vulnerabilities/pipelines/v2_importers/nvd_importer.py b/vulnerabilities/pipelines/v2_importers/nvd_importer.py index a6654d6b1..2b9de3bd8 100644 --- a/vulnerabilities/pipelines/v2_importers/nvd_importer.py +++ b/vulnerabilities/pipelines/v2_importers/nvd_importer.py @@ -326,6 +326,7 @@ def to_advisory(self): weaknesses=self.weaknesses, severities=self.severities, url=f"https://nvd.nist.gov/vuln/detail/{self.cve_id}", + original_advisory_text=json.dumps(self.cve_item, indent=2, ensure_ascii=False), ) diff --git a/vulnerabilities/pipelines/v2_importers/postgresql_importer.py b/vulnerabilities/pipelines/v2_importers/postgresql_importer.py 
index 2f98a7ce4..b7580b8c3 100644 --- a/vulnerabilities/pipelines/v2_importers/postgresql_importer.py +++ b/vulnerabilities/pipelines/v2_importers/postgresql_importer.py @@ -53,7 +53,7 @@ def collect_advisories(self) -> Iterable[AdvisoryData]: for url in self.links: data = requests.get(url).content - yield from self.to_advisories(data) + yield from self.to_advisories(data, url) def collect_links(self): known_urls = {self.base_url} @@ -69,7 +69,7 @@ def collect_links(self): break self.links = known_urls - def to_advisories(self, data): + def to_advisories(self, data, url): advisories = [] soup = BeautifulSoup(data, features="lxml") tables = soup.select("table") @@ -150,7 +150,8 @@ def to_advisories(self, data): references_v2=references, severities=severities, affected_packages=affected_packages, - url=f"https://www.postgresql.org/support/security/{cve_id}", + url=url, + original_advisory_text=str(row), ) ) diff --git a/vulnerabilities/pipelines/v2_importers/pypa_importer.py b/vulnerabilities/pipelines/v2_importers/pypa_importer.py index 7463cc4bd..8f9f57ddf 100644 --- a/vulnerabilities/pipelines/v2_importers/pypa_importer.py +++ b/vulnerabilities/pipelines/v2_importers/pypa_importer.py @@ -58,11 +58,13 @@ def collect_advisories(self) -> Iterable[AdvisoryData]: base_path=base_directory, url="https://github.com/pypa/advisory-database/blob/main/", ) - advisory_dict = saneyaml.load(advisory.read_text()) + advisory_text = advisory.read_text() + advisory_dict = saneyaml.load(advisory_text) yield parse_advisory_data_v2( raw_data=advisory_dict, supported_ecosystems=["pypi"], advisory_url=advisory_url, + advisory_text=advisory_text, ) def clean_downloads(self): diff --git a/vulnerabilities/pipelines/v2_importers/pysec_importer.py b/vulnerabilities/pipelines/v2_importers/pysec_importer.py index e67f41a28..ed41fdc87 100644 --- a/vulnerabilities/pipelines/v2_importers/pysec_importer.py +++ b/vulnerabilities/pipelines/v2_importers/pysec_importer.py @@ -60,8 +60,10 @@ def 
collect_advisories(self) -> Iterable[AdvisoryData]: continue with zip_file.open(file_name) as f: - vul_info = json.load(f) + advisory_text = f.read() + vul_info = json.loads(advisory_text) yield parse_advisory_data_v2( raw_data=vul_info, supported_ecosystems=["pypi"], advisory_url=self.url, + advisory_text=advisory_text.decode("utf-8"), ) diff --git a/vulnerabilities/pipelines/v2_importers/vulnrichment_importer.py b/vulnerabilities/pipelines/v2_importers/vulnrichment_importer.py index 85b51e4ec..d8590b498 100644 --- a/vulnerabilities/pipelines/v2_importers/vulnrichment_importer.py +++ b/vulnerabilities/pipelines/v2_importers/vulnrichment_importer.py @@ -193,6 +193,7 @@ def parse_cve_advisory(self, raw_data, advisory_url): weaknesses=sorted(weaknesses), url=advisory_url, severities=severities, + original_advisory_text=json.dumps(raw_data, indent=2, ensure_ascii=False), ) def clean_downloads(self): diff --git a/vulnerabilities/pipelines/v2_importers/xen_importer.py b/vulnerabilities/pipelines/v2_importers/xen_importer.py new file mode 100644 index 000000000..d0b7ff3cd --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/xen_importer.py @@ -0,0 +1,101 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +from typing import Iterable + +from dateutil import parser + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.references import XsaReference +from vulnerabilities.utils import fetch_response + + +class XenImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Importer for Xen Security Advisories from xsa.json. 
+ """ + + pipeline_id = "xen_importer_v2" + url = "https://xenbits.xen.org/xsa/xsa.json" + spdx_license_expression = "LicenseRef-scancode-other-permissive" + license_url = "https://xenbits.xen.org/xsa/" + notice = """ + From: George Dunlap + Date: Wed, Jan 25, 2023 at 4:57 PM + Subject: Re: Usage of Xen Security Data in VulnerableCode + To: Tushar Goel + Cc: Andrew Cooper , xen-devel@lists.xenproject.org , Xen Security , Philippe Ombredanne , + + On Thu, Jan 19, 2023 at 1:10 PM Tushar Goel wrote: + > + > Hi Andrew, + > + > > Maybe we want to make it CC-BY-4 to require people to reference back to + > > the canonical upstream ? + > Thanks for your response, can we have a more declarative statement on + > the license from your end + > and also can you please provide your acknowledgement over the usage of + > Xen security data in vulnerablecode. + + + Hey Tushar, + Informally, the Xen Project Security Team is happy for you to include the data from xsa.json in your open-source vulnerability database. As a courtesy we'd request that it be documented where the information came from. (I think if the data includes links to then advisories on our website, that will suffice.) + Formally, we're not copyright lawyers; but we don't think there's anything copyright-able in the xsa.json: There is no editorial or creative control in the generation of that file; it's just a collection of facts which you could re-generate by scanning all the advisories. (In fact that's exactly how the file is created; i.e., the collection of advisory texts is our "source of truth".) + We do have "Officially license all advisory text as CC-BY-4" on our to-do list; if you'd be more comfortable with an official license for xsa.json as well, we can add that to the list. 
+ + -George + """ + + _cached_data = None # Per-instance cache of xsa.json (this class attribute is only the unset default) + + @classmethod + def steps(cls): + return (cls.collect_and_store_advisories,) + + def get_xsa_data(self): + if self._cached_data is None: + self._cached_data = fetch_response(self.url).json() + return self._cached_data + + def advisories_count(self) -> int: + data = self.get_xsa_data() + return len(data[0].get("xsas", [])) if data else 0 + + def collect_advisories(self) -> Iterable[AdvisoryData]: + data = self.get_xsa_data() + if not data: + return + + for xsa in data[0].get("xsas", []): + yield from self.to_advisories(xsa) + + def to_advisories(self, xsa) -> Iterable[AdvisoryData]: + xsa_id = xsa.get("xsa") + references = [] + + if xsa_id: + xsa_reference = XsaReference.from_number(number=xsa_id) + references.append(ReferenceV2(url=xsa_reference.url)) + + title = xsa.get("title", "") + date_published = xsa.get("public_time") + cve = xsa.get("cve", []) + + yield AdvisoryData( + advisory_id=f"XSA-{xsa_id}", + aliases=cve, + url="https://xenbits.xen.org/xsa/", + summary=title, + references_v2=references, + date_published=parser.parse(date_published) if date_published else None, + original_advisory_text=json.dumps(xsa, indent=2, ensure_ascii=False), + ) diff --git a/vulnerabilities/pipes/advisory.py b/vulnerabilities/pipes/advisory.py index d5d88fbfd..2736d8874 100644 --- a/vulnerabilities/pipes/advisory.py +++ b/vulnerabilities/pipes/advisory.py @@ -159,6 +159,7 @@ def insert_advisory_v2( "summary": advisory.summary, "date_published": advisory.date_published, "date_collected": datetime.now(timezone.utc), + "original_advisory_text": advisory.original_advisory_text, } advisory_obj, _ = AdvisoryV2.objects.get_or_create( From e954b603cf229071026e60415e8a117804ec608c Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 15:03:28 +0530 Subject: [PATCH 02/13] Add tests for Xen importer Signed-off-by: Tushar Goel --- .../tests/pipelines/test_xen_importer_v2.py | 102 ++++++++++++++++++ 1 file changed, 102 insertions(+) create 
mode 100644 vulnerabilities/tests/pipelines/test_xen_importer_v2.py diff --git a/vulnerabilities/tests/pipelines/test_xen_importer_v2.py b/vulnerabilities/tests/pipelines/test_xen_importer_v2.py new file mode 100644 index 000000000..f5813896e --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_xen_importer_v2.py @@ -0,0 +1,102 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from unittest.mock import patch + +import pytest +from dateutil.parser import parse as date_parse + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines.v2_importers.xen_importer import XenImporterPipeline + +SAMPLE_XSA_JSON = [ + { + "xsas": [ + { + "xsa": 123, + "title": "Sample Xen Advisory", + "public_time": "2022-09-15T00:00:00Z", + "cve": ["CVE-2022-12345"], + }, + { + "xsa": 456, + "title": "Another Advisory", + "public_time": "2023-01-01T00:00:00Z", + "cve": [], + }, + ] + } +] + + +@pytest.fixture +def pipeline(): + return XenImporterPipeline() + + +@patch("vulnerabilities.pipelines.v2_importers.xen_importer.fetch_response") +def test_get_xsa_data(mock_fetch, pipeline): + mock_fetch.return_value.json.return_value = SAMPLE_XSA_JSON + data = pipeline.get_xsa_data() + assert isinstance(data, list) + assert "xsas" in data[0] + + +@patch("vulnerabilities.pipelines.v2_importers.xen_importer.fetch_response") +def test_advisories_count(mock_fetch, pipeline): + mock_fetch.return_value.json.return_value = SAMPLE_XSA_JSON + count = pipeline.advisories_count() + assert count == 2 + + +@patch("vulnerabilities.pipelines.v2_importers.xen_importer.fetch_response") +def 
test_collect_advisories(mock_fetch, pipeline): + mock_fetch.return_value.json.return_value = SAMPLE_XSA_JSON + advisories = list(pipeline.collect_advisories()) + + assert len(advisories) == 2 + + first = advisories[0] + assert isinstance(first, AdvisoryData) + assert first.advisory_id == "XSA-123" + assert first.aliases == ["CVE-2022-12345"] + assert first.summary == "Sample Xen Advisory" + assert isinstance(first.references_v2[0], ReferenceV2) + assert first.date_published == date_parse("2022-09-15T00:00:00Z") + + +def test_to_advisories_single(pipeline): + xsa_sample = { + "xsa": 999, + "title": "Test Advisory", + "public_time": "2021-07-01T00:00:00Z", + "cve": ["CVE-2021-9999"], + } + + results = list(pipeline.to_advisories(xsa_sample)) + assert len(results) == 1 + + advisory = results[0] + assert advisory.advisory_id == "XSA-999" + assert advisory.aliases == ["CVE-2021-9999"] + assert advisory.summary == "Test Advisory" + assert advisory.date_published == date_parse("2021-07-01T00:00:00Z") + assert advisory.original_advisory_text.startswith('{\n "xsa"') + + +def test_to_advisories_missing_fields(pipeline): + xsa_sample = {"xsa": None, "title": None, "public_time": "2020-01-01T00:00:00Z", "cve": []} + + results = list(pipeline.to_advisories(xsa_sample)) + advisory = results[0] + + assert advisory.advisory_id == "XSA-None" + assert advisory.aliases == [] + assert advisory.summary == None From 0b5f4abb8aed896acc777cec62e072174a271bb2 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 15:45:30 +0530 Subject: [PATCH 03/13] Migrate CURL importer Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 2 + .../pipelines/v2_importers/curl_importer.py | 155 ++++++++++++++++++ .../tests/pipelines/test_curl_importer_v2.py | 119 ++++++++++++++ 3 files changed, 276 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/curl_importer.py create mode 100644 vulnerabilities/tests/pipelines/test_curl_importer_v2.py diff --git 
a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 77277262f..39245d276 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -42,6 +42,7 @@ from vulnerabilities.pipelines import pypa_importer from vulnerabilities.pipelines import pysec_importer from vulnerabilities.pipelines.v2_importers import apache_httpd_importer as apache_httpd_v2 +from vulnerabilities.pipelines.v2_importers import curl_importer as curl_importer_v2 from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) @@ -65,6 +66,7 @@ gitlab_importer_v2.GitLabImporterPipeline, pysec_importer_v2.PyPIImporterPipeline, xen_importer_v2.XenImporterPipeline, + curl_importer_v2.CurlImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/curl_importer.py b/vulnerabilities/pipelines/v2_importers/curl_importer.py new file mode 100644 index 000000000..52715c22a --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/curl_importer.py @@ -0,0 +1,155 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# + +import json +import logging +from datetime import datetime +from datetime import timezone +from typing import Iterable + +from cwe2.database import Database +from packageurl import PackageURL +from univers.version_range import GenericVersionRange +from univers.versions import SemverVersion + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackage +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.importer import VulnerabilitySeverity +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.severity_systems import SCORING_SYSTEMS +from vulnerabilities.utils import fetch_response +from vulnerabilities.utils import get_cwe_id +from vulnerabilities.utils import get_item + +logger = logging.getLogger(__name__) + + +class CurlImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Pipeline-based importer for curl advisories from curl.se. + """ + + pipeline_id = "curl_importer_v2" + spdx_license_expression = "curl" + license_url = "https://curl.se/docs/copyright.html" + repo_url = "https://github.com/curl/curl-www/" + url = "https://curl.se/docs/vuln.json" + + @classmethod + def steps(cls): + return (cls.collect_and_store_advisories,) + + def fetch_data(self): + return fetch_response(self.url).json() + + def advisories_count(self) -> int: + return len(self.fetch_data()) + + def collect_advisories(self) -> Iterable[AdvisoryData]: + for entry in self.fetch_data(): + cve_id = entry.get("aliases") or [] + cve_id = cve_id[0] if cve_id else None + if not cve_id or not cve_id.startswith("CVE"): + package = get_item(entry, "database_specific", "package") + logger.error(f"Invalid CVE ID: {cve_id} in package {package}") + continue + yield parse_curl_advisory(entry) + + +def parse_curl_advisory(raw_data) -> AdvisoryData: + """ + Parse advisory data from raw JSON data and return an AdvisoryData object. 
+ + Args: + raw_data (dict): Raw JSON data containing advisory information. + + Returns: + AdvisoryData: Parsed advisory data as an AdvisoryData object. + """ + affected = get_item(raw_data, "affected")[0] if len(get_item(raw_data, "affected")) > 0 else [] + + ranges = get_item(affected, "ranges")[0] if len(get_item(affected, "ranges")) > 0 else [] + events = get_item(ranges, "events")[1] if len(get_item(ranges, "events")) > 1 else {} + version_type = get_item(ranges, "type") if get_item(ranges, "type") else "" + fixed_version = events.get("fixed") + if version_type == "SEMVER" and fixed_version: + fixed_version = SemverVersion(fixed_version) + + purl = PackageURL(type="generic", namespace="curl.se", name="curl") + versions = affected.get("versions") or [] + affected_version_range = GenericVersionRange.from_versions(versions) + + affected_package = AffectedPackage( + package=purl, + affected_version_range=affected_version_range, + fixed_version=fixed_version, + ) + + database_specific = raw_data.get("database_specific") or {} + + references = [] + www_url = database_specific.get("www") + issue_url = database_specific.get("issue") + json_url = database_specific.get("URL") + + if www_url: + references.append(ReferenceV2(url=www_url)) + if issue_url: + references.append(ReferenceV2(url=issue_url)) + severity = VulnerabilitySeverity( + system=SCORING_SYSTEMS["cvssv3.1"], value=database_specific.get("severity", ""), url=www_url + ) + + published = raw_data.get("published", "") + date_published = ( + datetime.strptime(published, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=timezone.utc) + if published + else None + ) + + weaknesses = get_cwe_from_curl_advisory(raw_data) + + aliases = list(raw_data.get("aliases") or []) + advisory_id = raw_data.get("id") or "" + + if advisory_id in aliases: + aliases.remove(advisory_id) + + return AdvisoryData( + advisory_id=advisory_id, + aliases=aliases, + summary=raw_data.get("summary") or "", + affected_packages=[affected_package], + 
references_v2=references, + date_published=date_published, + weaknesses=weaknesses, + url=json_url, + severities=[severity], + original_advisory_text=json.dumps(raw_data, indent=2, ensure_ascii=False), + ) + + +def get_cwe_from_curl_advisory(raw_data): + """ + Extracts CWE IDs from the given raw_data and returns a list of CWE IDs. + + >>> get_cwe_from_curl_advisory({"database_specific": {"CWE": {"id": "CWE-333"}}}) + [333] + >>> get_cwe_from_curl_advisory({"database_specific": {"CWE": {"id": ""}}}) + [] + """ + weaknesses = [] + db = Database() + cwe_string = get_item(raw_data, "database_specific", "CWE", "id") or "" + + if cwe_string: + try: + cwe_id = get_cwe_id(cwe_string) + db.get(cwe_id) # validate CWE exists + weaknesses.append(cwe_id) + except Exception: + logger.error(f"Invalid CWE id: {cwe_string}") + return weaknesses diff --git a/vulnerabilities/tests/pipelines/test_curl_importer_v2.py b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py new file mode 100644 index 000000000..39c786d3a --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py @@ -0,0 +1,119 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# + +from datetime import datetime +from datetime import timezone +from unittest.mock import patch + +import pytest +from packageurl import PackageURL +from univers.versions import SemverVersion + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackage +from vulnerabilities.pipelines.v2_importers.curl_importer import CurlImporterPipeline +from vulnerabilities.pipelines.v2_importers.curl_importer import get_cwe_from_curl_advisory +from vulnerabilities.pipelines.v2_importers.curl_importer import parse_curl_advisory + +SAMPLE_CURL_ADVISORY = { + "aliases": ["CVE-2024-12345"], + "id": "CVE-2024-12345", + "summary": "Sample vulnerability in curl", + "published": "2024-06-30T08:00:00.00Z", + "affected": [ + { + "ranges": [{"type": "SEMVER", "events": [{"introduced": "8.6.0"}, {"fixed": "8.7.0"}]}], + "versions": ["8.6.0"], + } + ], + "database_specific": { + "package": "curl", + "URL": "https://curl.se/docs/CVE-2024-12345.json", + "www": "https://curl.se/docs/CVE-2024-12345.html", + "issue": "https://hackerone.com/reports/1111111", + "severity": "High", + "CWE": { + "id": "CWE-119", + "desc": "Improper restriction of operations within bounds of a memory buffer", + }, + }, +} + + +@pytest.fixture +def pipeline(): + return CurlImporterPipeline() + + +@patch("vulnerabilities.importers.curl_importer.fetch_response") +def test_advisories_count(mock_fetch, pipeline): + mock_fetch.return_value.json.return_value = [SAMPLE_CURL_ADVISORY] + assert pipeline.advisories_count() == 1 + + +@patch("vulnerabilities.importers.curl_importer.fetch_response") +def test_collect_advisories(mock_fetch, pipeline): + mock_fetch.return_value.json.return_value = [SAMPLE_CURL_ADVISORY] + advisories = list(pipeline.collect_advisories()) + assert len(advisories) == 1 + + advisory = advisories[0] + assert isinstance(advisory, AdvisoryData) + assert advisory.advisory_id == "CVE-2024-12345" + assert advisory.aliases 
== [] + assert advisory.summary == "Sample vulnerability in curl" + assert advisory.date_published == datetime(2024, 6, 30, 8, 0, tzinfo=timezone.utc) + assert advisory.url == "https://curl.se/docs/CVE-2024-12345.json" + assert advisory.weaknesses == [119] + + # Affected package check + pkg = advisory.affected_packages[0] + assert isinstance(pkg, AffectedPackage) + assert pkg.package == PackageURL(type="generic", namespace="curl.se", name="curl") + assert pkg.fixed_version == SemverVersion("8.7.0") + assert "8.6.0" in str(pkg.affected_version_range) + + # References + urls = [ref.url for ref in advisory.references_v2] + assert "https://curl.se/docs/CVE-2024-12345.html" in urls + assert "https://hackerone.com/reports/1111111" in urls + + # Severity + severity = advisory.severities[0] + assert severity.value == "High" + assert severity.system.identifier == "cvssv3.1" + + +def test_parse_curl_advisory_minimal(): + data = dict(SAMPLE_CURL_ADVISORY) + data.pop("database_specific") + data["aliases"] = ["CVE-2024-99999"] + data["id"] = "CVE-2024-99999" + data["database_specific"] = {} + + parsed = parse_curl_advisory(data) + + assert parsed.advisory_id == "CVE-2024-99999" + assert parsed.aliases == [] + assert parsed.references_v2 == [] + assert parsed.severities[0].value == "" + + +def test_get_cwe_from_valid(): + cwe_data = {"database_specific": {"CWE": {"id": "CWE-79", "desc": "Cross-site scripting"}}} + result = get_cwe_from_curl_advisory(cwe_data) + assert result == [79] + + +def test_get_cwe_from_invalid(): + bad_cwe_data = {"database_specific": {"CWE": {"id": "CWE-999999"}}} + result = get_cwe_from_curl_advisory(bad_cwe_data) + assert result == [] + + +def test_get_cwe_from_empty(): + empty_cwe_data = {"database_specific": {"CWE": {"id": ""}}} + result = get_cwe_from_curl_advisory(empty_cwe_data) + assert result == [] From 655c58351bfc75cb07ce7555f4ec53106ad13583 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 15:48:38 +0530 Subject: [PATCH 04/13] 
Fix tests Signed-off-by: Tushar Goel --- vulnerabilities/tests/pipelines/test_curl_importer_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vulnerabilities/tests/pipelines/test_curl_importer_v2.py b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py index 39c786d3a..0d9a25358 100644 --- a/vulnerabilities/tests/pipelines/test_curl_importer_v2.py +++ b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py @@ -47,13 +47,13 @@ def pipeline(): return CurlImporterPipeline() -@patch("vulnerabilities.importers.curl_importer.fetch_response") +@patch("vulnerabilities.pipelines.v2_importers.curl_importer.fetch_response") def test_advisories_count(mock_fetch, pipeline): mock_fetch.return_value.json.return_value = [SAMPLE_CURL_ADVISORY] assert pipeline.advisories_count() == 1 -@patch("vulnerabilities.importers.curl_importer.fetch_response") +@patch("vulnerabilities.pipelines.v2_importers.curl_importer.fetch_response") def test_collect_advisories(mock_fetch, pipeline): mock_fetch.return_value.json.return_value = [SAMPLE_CURL_ADVISORY] advisories = list(pipeline.collect_advisories()) From 729c86a7e31f3a02316a5f7fcf57dfd1de28e2fe Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 18:14:30 +0530 Subject: [PATCH 05/13] Add OSS Fuzz importer Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 2 + .../pipelines/v2_importers/oss_fuzz.py | 73 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/oss_fuzz.py diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 39245d276..f6232046d 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -53,6 +53,7 @@ from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 from 
vulnerabilities.pipelines.v2_importers import xen_importer as xen_importer_v2 +from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2 from vulnerabilities.utils import create_registry IMPORTERS_REGISTRY = create_registry( @@ -67,6 +68,7 @@ pysec_importer_v2.PyPIImporterPipeline, xen_importer_v2.XenImporterPipeline, curl_importer_v2.CurlImporterPipeline, + oss_fuzz_v2.OSSFuzzImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/oss_fuzz.py b/vulnerabilities/pipelines/v2_importers/oss_fuzz.py new file mode 100644 index 000000000..c4afad4c0 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/oss_fuzz.py @@ -0,0 +1,73 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# +import logging +from pathlib import Path +from typing import Iterable + +import saneyaml + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.utils import get_advisory_url +from fetchcode.vcs import fetch_via_vcs + +logger = logging.getLogger(__name__) + + +class OSSFuzzImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + pipeline_id = "oss_fuzz_importer_v2" + spdx_license_expression = "CC-BY-4.0" + license_url = "https://github.com/google/oss-fuzz-vulns/blob/main/LICENSE" + repo_url = "git+https://github.com/google/oss-fuzz-vulns" + unfurl_version_ranges = True + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_and_store_advisories, + cls.clean_downloads, + ) + + def clone(self): + self.log(f"Cloning `{self.repo_url}`") + self.vcs_response = fetch_via_vcs(self.repo_url) + + def advisories_count(self): + vulns_directory = Path(self.vcs_response.dest_dir) / "vulns" + return sum(1 for _ in vulns_directory.rglob("*.yaml")) + + def collect_advisories(self) -> Iterable[AdvisoryData]: + from vulnerabilities.importers.osv import parse_advisory_data_v2 + + base_directory = Path(self.vcs_response.dest_dir) + vulns_directory = base_directory / "vulns" + + for advisory in vulns_directory.rglob("*.yaml"): + advisory_url = get_advisory_url( + file=advisory, + base_path=base_directory, + url="https://github.com/google/oss-fuzz-vulns/blob/main/", + ) + advisory_text = advisory.read_text() + advisory_dict = saneyaml.load(advisory_text) + yield parse_advisory_data_v2( + raw_data=advisory_dict, + supported_ecosystems=["generic"], + advisory_url=advisory_url, + advisory_text=advisory_text, + ) + + def clean_downloads(self): + if self.vcs_response: + self.log(f"Removing cloned repository") + self.vcs_response.delete() + + def on_failure(self): + self.clean_downloads() From a08bde06f8f330eaeb6282c47e5300ea5ab2ee15 Mon Sep 17 00:00:00 2001 From: Tushar Goel 
Date: Wed, 16 Jul 2025 18:14:50 +0530 Subject: [PATCH 06/13] Add OSS Fuzz importer Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 2 +- vulnerabilities/pipelines/v2_importers/oss_fuzz.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index f6232046d..f7dbf6b94 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -49,11 +49,11 @@ from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import npm_importer as npm_importer_v2 from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2 +from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2 from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2 from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 from vulnerabilities.pipelines.v2_importers import xen_importer as xen_importer_v2 -from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2 from vulnerabilities.utils import create_registry IMPORTERS_REGISTRY = create_registry( diff --git a/vulnerabilities/pipelines/v2_importers/oss_fuzz.py b/vulnerabilities/pipelines/v2_importers/oss_fuzz.py index c4afad4c0..9c9d6bed8 100644 --- a/vulnerabilities/pipelines/v2_importers/oss_fuzz.py +++ b/vulnerabilities/pipelines/v2_importers/oss_fuzz.py @@ -11,11 +11,11 @@ from typing import Iterable import saneyaml +from fetchcode.vcs import fetch_via_vcs from vulnerabilities.importer import AdvisoryData from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 from vulnerabilities.utils import get_advisory_url -from fetchcode.vcs import fetch_via_vcs logger = logging.getLogger(__name__) @@ -37,8 +37,8 @@ 
def steps(cls): def clone(self): self.log(f"Cloning `{self.repo_url}`") - self.vcs_response = fetch_via_vcs(self.repo_url) - + self.vcs_response = fetch_via_vcs(self.repo_url) + def advisories_count(self): vulns_directory = Path(self.vcs_response.dest_dir) / "vulns" return sum(1 for _ in vulns_directory.rglob("*.yaml")) @@ -62,7 +62,7 @@ def collect_advisories(self) -> Iterable[AdvisoryData]: supported_ecosystems=["generic"], advisory_url=advisory_url, advisory_text=advisory_text, - ) + ) def clean_downloads(self): if self.vcs_response: From b63464e3c42395fc9dfdc06bdcb5a73183f12750 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 19:09:03 +0530 Subject: [PATCH 07/13] Migrate Istio importer Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 2 + .../pipelines/v2_importers/curl_importer.py | 1 + .../pipelines/v2_importers/istio_importer.py | 170 ++++++++++++++++++ .../tests/pipelines/test_istio_importer_v2.py | 91 ++++++++++ 4 files changed, 264 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/istio_importer.py create mode 100644 vulnerabilities/tests/pipelines/test_istio_importer_v2.py diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index f7dbf6b94..1bb4c43f1 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -47,6 +47,7 @@ elixir_security_importer as elixir_security_importer_v2, ) from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 +from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 from vulnerabilities.pipelines.v2_importers import npm_importer as npm_importer_v2 from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2 from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2 @@ -69,6 +70,7 @@ xen_importer_v2.XenImporterPipeline, curl_importer_v2.CurlImporterPipeline, oss_fuzz_v2.OSSFuzzImporterPipeline, 
+ istio_importer_v2.IstioImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/curl_importer.py b/vulnerabilities/pipelines/v2_importers/curl_importer.py index 52715c22a..e3253b4b4 100644 --- a/vulnerabilities/pipelines/v2_importers/curl_importer.py +++ b/vulnerabilities/pipelines/v2_importers/curl_importer.py @@ -37,6 +37,7 @@ class CurlImporterPipeline(VulnerableCodeBaseImporterPipelineV2): license_url = "https://curl.se/docs/copyright.html" repo_url = "https://github.com/curl/curl-www/" url = "https://curl.se/docs/vuln.json" + unfurl_version_ranges = True @classmethod def steps(cls): diff --git a/vulnerabilities/pipelines/v2_importers/istio_importer.py b/vulnerabilities/pipelines/v2_importers/istio_importer.py new file mode 100644 index 000000000..bc544f7f8 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/istio_importer.py @@ -0,0 +1,170 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import re +from pathlib import Path +from typing import Iterable +from typing import List + +import pytz +import saneyaml +from dateutil import parser +from fetchcode.vcs import fetch_via_vcs +from packageurl import PackageURL +from univers.version_constraint import VersionConstraint +from univers.version_range import GitHubVersionRange +from univers.version_range import GolangVersionRange +from univers.versions import SemverVersion + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackage +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.utils import get_advisory_url +from vulnerabilities.utils import split_markdown_front_matter + +is_release = re.compile(r"^[\d.]+$", re.IGNORECASE).match + + +class IstioImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Importer for Istio.io security advisories. + """ + + pipeline_id = "istio_importer_v2" + spdx_license_expression = "Apache-2.0" + license_url = "https://github.com/istio/istio.io/blob/master/LICENSE" + repo_url = "git+https://github.com/istio/istio.io" + unfurl_version_ranges = True + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_and_store_advisories, + cls.clean_downloads, + ) + + def advisories_count(self) -> int: + base_path = Path(self.vcs_response.dest_dir) + advisories_dir = base_path / "content/en/news/security" + return sum( + 1 for file in advisories_dir.rglob("*.md") if not file.name.endswith("_index.md") + ) + + def clone(self): + self.log(f"Cloning `{self.repo_url}`") + self.vcs_response = fetch_via_vcs(self.repo_url) + + def collect_advisories(self) -> Iterable[AdvisoryData]: + base_path = Path(self.vcs_response.dest_dir) + advisories_dir = base_path / "content/en/news/security" + + for md_file in advisories_dir.rglob("*.md"): + if md_file.name.endswith("_index.md"): + continue + + data = 
self.parse_markdown(md_file) + advisory_url = get_advisory_url( + file=md_file, + base_path=base_path, + url="https://github.com/istio/istio.io/blob/master/", + ) + published_date = data.get("publishdate") + release_date = ( + parser.parse(published_date).replace(tzinfo=pytz.UTC) if published_date else None + ) + constraints = self.get_version_constraints(data.get("releases", [])) + + cves = data.get("cves", []) + + affected_packages = [] + if constraints: + affected_packages.extend( + [ + AffectedPackage( + package=PackageURL(type="golang", namespace="istio.io", name="istio"), + affected_version_range=GolangVersionRange(constraints=constraints), + ), + AffectedPackage( + package=PackageURL(type="github", namespace="istio", name="istio"), + affected_version_range=GitHubVersionRange(constraints=constraints), + ), + ] + ) + + title = data.get("title") or "" + summary = data.get("description") or "" + references = [] + if title: + references.append( + ReferenceV2( + reference_id=title, + url=f"https://istio.io/latest/news/security/{title}/", + ) + ) + + yield AdvisoryData( + advisory_id=title, + aliases=cves, + summary=summary, + affected_packages=affected_packages, + references_v2=references, + date_published=release_date, + url=advisory_url, + original_advisory_text=md_file.read_text(encoding="utf-8"), + ) + + def parse_markdown(self, path: Path) -> dict: + """Return a mapping of vulnerability data extracted from an advisory.""" + text = path.read_text(encoding="utf-8") + front_matter, _ = split_markdown_front_matter(text) + return saneyaml.load(front_matter) + + def get_version_constraints(self, releases: List[str]) -> List[VersionConstraint]: + constraints = [] + for release in releases: + release = release.strip() + + if "All releases prior" in release: + _, _, version = release.rpartition(" ") + constraints.append( + VersionConstraint(version=SemverVersion(version), comparator="<") + ) + + elif "All releases" in release and "and later" in release: + version = 
release.replace("All releases", "").replace("and later", "").strip() + if is_release(version): + constraints.append( + VersionConstraint(version=SemverVersion(version), comparator=">=") + ) + + elif "to" in release: + lower, _, upper = release.partition("to") + constraints.append( + VersionConstraint(version=SemverVersion(lower.strip()), comparator=">=") + ) + constraints.append( + VersionConstraint(version=SemverVersion(upper.strip()), comparator="<=") + ) + + elif is_release(release): + constraints.append( + VersionConstraint(version=SemverVersion(release), comparator="=") + ) + + return constraints + + def clean_downloads(self): + if self.vcs_response: + self.log("Removing cloned repository") + self.vcs_response.delete() + + def on_failure(self): + self.clean_downloads() diff --git a/vulnerabilities/tests/pipelines/test_istio_importer_v2.py b/vulnerabilities/tests/pipelines/test_istio_importer_v2.py new file mode 100644 index 000000000..b64dd532f --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_istio_importer_v2.py @@ -0,0 +1,91 @@ +import tempfile +from pathlib import Path +from textwrap import dedent + +import pytest +from packageurl import PackageURL +from univers.version_constraint import VersionConstraint +from univers.version_range import GitHubVersionRange +from univers.version_range import GolangVersionRange +from univers.versions import SemverVersion + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackage +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.pipelines.v2_importers.istio_importer import IstioImporterPipeline + + +@pytest.mark.django_db +def test_istio_advisory_parsing(): + sample_md = dedent( + """\ + --- + title: ISTIO-SECURITY-2019-002 + subtitle: Security Bulletin + description: Denial of service affecting JWT access token parsing. 
+ cves: [CVE-2019-12995] + cvss: "7.5" + vector: "AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H/E:F/RL:O/RC:C" + cvss_version: "3.0" + releases: ["1.0 to 1.0.8", "1.1 to 1.1.9", "1.2 to 1.2.1"] + publishdate: 2019-06-28 + keywords: [CVE] + skip_seealso: true + aliases: + - /blog/2019/cve-2019-12995 + - /news/2019/cve-2019-12995 + --- + + A bug in Istio’s JWT validation filter causes Envoy to crash... + """ + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + base_path = Path(tmp_dir) + advisory_dir = base_path / "content/en/news/security" + advisory_dir.mkdir(parents=True) + advisory_file = advisory_dir / "ISTIO-SECURITY-2019-002.md" + advisory_file.write_text(sample_md, encoding="utf-8") + + importer = IstioImporterPipeline() + importer.vcs_response = type( + "FakeVCS", (), {"dest_dir": tmp_dir, "delete": lambda x: None} + )() + + advisories = list(importer.collect_advisories()) + + assert len(advisories) == 1 + advisory = advisories[0] + + assert isinstance(advisory, AdvisoryData) + assert advisory.advisory_id == "ISTIO-SECURITY-2019-002" + assert advisory.aliases == ["CVE-2019-12995"] + assert advisory.summary.startswith("Denial of service affecting JWT access token") + assert advisory.date_published.isoformat() == "2019-06-28T00:00:00+00:00" + assert advisory.url.endswith("ISTIO-SECURITY-2019-002.md") + assert advisory.references_v2[0] == ReferenceV2( + reference_id="ISTIO-SECURITY-2019-002", + url="https://istio.io/latest/news/security/ISTIO-SECURITY-2019-002/", + ) + + expected_versions = [ + VersionConstraint(version=SemverVersion("1.0"), comparator=">="), + VersionConstraint(version=SemverVersion("1.0.8"), comparator="<="), + VersionConstraint(version=SemverVersion("1.1"), comparator=">="), + VersionConstraint(version=SemverVersion("1.1.9"), comparator="<="), + VersionConstraint(version=SemverVersion("1.2"), comparator=">="), + VersionConstraint(version=SemverVersion("1.2.1"), comparator="<="), + ] + + expected_packages = [ + AffectedPackage( + 
package=PackageURL(type="golang", namespace="istio.io", name="istio"), + affected_version_range=GolangVersionRange(constraints=expected_versions), + ), + AffectedPackage( + package=PackageURL(type="github", namespace="istio", name="istio"), + affected_version_range=GitHubVersionRange(constraints=expected_versions), + ), + ] + + assert advisory.affected_packages == expected_packages From da0bf371e5279ba766a59f0c7877e600178d4e1f Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 19:20:57 +0530 Subject: [PATCH 08/13] Add tests for OSS-FUZZ Signed-off-by: Tushar Goel --- .../tests/pipelines/test_curl_importer_v2.py | 4 ++ .../tests/pipelines/test_istio_importer_v2.py | 9 +++ .../tests/pipelines/test_oss_fuzz_v2.py | 55 +++++++++++++++++++ 3 files changed, 68 insertions(+) create mode 100644 vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py diff --git a/vulnerabilities/tests/pipelines/test_curl_importer_v2.py b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py index 0d9a25358..7833d1397 100644 --- a/vulnerabilities/tests/pipelines/test_curl_importer_v2.py +++ b/vulnerabilities/tests/pipelines/test_curl_importer_v2.py @@ -1,6 +1,10 @@ # # Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. # SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # from datetime import datetime diff --git a/vulnerabilities/tests/pipelines/test_istio_importer_v2.py b/vulnerabilities/tests/pipelines/test_istio_importer_v2.py index b64dd532f..162eddbbe 100644 --- a/vulnerabilities/tests/pipelines/test_istio_importer_v2.py +++ b/vulnerabilities/tests/pipelines/test_istio_importer_v2.py @@ -1,3 +1,12 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. 
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + import tempfile from pathlib import Path from textwrap import dedent diff --git a/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py b/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py new file mode 100644 index 000000000..3ad08d014 --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py @@ -0,0 +1,55 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# +import pytest +from unittest import mock + +import yaml +from vulnerabilities.pipelines.v2_importers.oss_fuzz import OSSFuzzImporterPipeline +from vulnerabilities.importer import AdvisoryData + + +@pytest.mark.django_db +def test_collect_advisories_parses_yaml_correctly(tmp_path): + advisory_path = tmp_path / "vulns" / "dummy_project" + advisory_path.mkdir(parents=True) + yaml_file = advisory_path / "CVE-2024-1234.yaml" + + advisory_dict = { + "id": "CVE-2024-1234", + "summary": "Some summary here", + "affected": [ + { + "package": {"name": "some-lib"}, + "versions": ["1.0.0"] + } + ] + } + yaml_file.write_text(yaml.dump(advisory_dict), encoding="utf-8") + + pipeline = OSSFuzzImporterPipeline() + pipeline.vcs_response = mock.Mock() + pipeline.vcs_response.dest_dir = tmp_path + + advisories = list(pipeline.collect_advisories()) + assert len(advisories) == 1 + assert advisories[0].advisory_id == "CVE-2024-1234" + assert advisories[0].summary == "Some summary here" + + +@pytest.mark.django_db +def 
test_advisories_count(tmp_path): + (tmp_path / "vulns" / "project").mkdir(parents=True) + (tmp_path / "vulns" / "project" / "CVE-2023-0001.yaml").write_text("id: CVE-2023-0001") + (tmp_path / "vulns" / "project" / "CVE-2023-0002.yaml").write_text("id: CVE-2023-0002") + + pipeline = OSSFuzzImporterPipeline() + pipeline.vcs_response = mock.Mock() + pipeline.vcs_response.dest_dir = tmp_path + + assert pipeline.advisories_count() == 2 From 2e10f7eca231b3ea6e30749178dad8e1f44ad097 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 16 Jul 2025 19:24:42 +0530 Subject: [PATCH 09/13] Fix tests Signed-off-by: Tushar Goel --- vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py b/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py index 3ad08d014..40cf420d8 100644 --- a/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py +++ b/vulnerabilities/tests/pipelines/test_oss_fuzz_v2.py @@ -6,12 +6,13 @@ # See https://github.com/aboutcode-org/vulnerablecode for support or download. # See https://aboutcode.org for more information about nexB OSS projects. 
# -import pytest from unittest import mock +import pytest import yaml -from vulnerabilities.pipelines.v2_importers.oss_fuzz import OSSFuzzImporterPipeline + from vulnerabilities.importer import AdvisoryData +from vulnerabilities.pipelines.v2_importers.oss_fuzz import OSSFuzzImporterPipeline @pytest.mark.django_db @@ -23,12 +24,7 @@ def test_collect_advisories_parses_yaml_correctly(tmp_path): advisory_dict = { "id": "CVE-2024-1234", "summary": "Some summary here", - "affected": [ - { - "package": {"name": "some-lib"}, - "versions": ["1.0.0"] - } - ] + "affected": [{"package": {"name": "some-lib"}, "versions": ["1.0.0"]}], } yaml_file.write_text(yaml.dump(advisory_dict), encoding="utf-8") From 3e09dc9f35f5ffa1415e50148558bd2b8ad4d1f3 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 17 Jul 2025 12:09:37 +0530 Subject: [PATCH 10/13] Add postgresql importer Signed-off-by: Tushar Goel --- .../v2_importers/postgresql_importer.py | 12 +- .../pipelines/test_postgresql_v2_importer.py | 105 +++++++----------- 2 files changed, 43 insertions(+), 74 deletions(-) diff --git a/vulnerabilities/pipelines/v2_importers/postgresql_importer.py b/vulnerabilities/pipelines/v2_importers/postgresql_importer.py index b7580b8c3..60b099ff6 100644 --- a/vulnerabilities/pipelines/v2_importers/postgresql_importer.py +++ b/vulnerabilities/pipelines/v2_importers/postgresql_importer.py @@ -43,17 +43,13 @@ def steps(cls): return (cls.collect_and_store_advisories,) def advisories_count(self) -> int: - if not self.links: - self.collect_links() - return len(self.links) + return 30 def collect_advisories(self) -> Iterable[AdvisoryData]: - if not self.links: - self.collect_links() + url = "https://www.postgresql.org/support/security/" - for url in self.links: - data = requests.get(url).content - yield from self.to_advisories(data, url) + data = requests.get(url).content + yield from self.to_advisories(data, url) def collect_links(self): known_urls = {self.base_url} diff --git 
a/vulnerabilities/tests/pipelines/test_postgresql_v2_importer.py b/vulnerabilities/tests/pipelines/test_postgresql_v2_importer.py index 154ad1fc4..c138c0746 100644 --- a/vulnerabilities/tests/pipelines/test_postgresql_v2_importer.py +++ b/vulnerabilities/tests/pipelines/test_postgresql_v2_importer.py @@ -34,9 +34,9 @@ - CVE-2022-1234
- Announcement
- + CVE-2022-1234
+ Announcement
+ 10.0, 10.1 10.2 9.8 @@ -48,43 +48,39 @@ """ +HTML_NO_FIX_ADVISORY = """ + + + + + + + + + + + + +
+ CVE-2023-5678
+ Announcement
+
9.5, 9.6Unpatched issue
+ + +""" + @pytest.fixture def importer(): return PostgreSQLImporterPipeline() -@patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") -def test_collect_links(mock_get, importer): - mock_get.return_value.content = HTML_PAGE_WITH_LINKS.encode("utf-8") - - importer.collect_links() - - assert len(importer.links) == 3 # base + 2 new - assert any("advisory1.html" in link for link in importer.links) - assert any("advisory2.html" in link for link in importer.links) - - -@patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") -def test_advisories_count(mock_get, importer): - mock_get.return_value.content = HTML_PAGE_WITH_LINKS.encode("utf-8") - - count = importer.advisories_count() - assert count >= 3 - - @patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") def test_collect_advisories(mock_get, importer): - importer.links = { - "https://www.postgresql.org/support/security/advisory1.html", - "https://www.postgresql.org/support/security/advisory2.html", - } - mock_get.return_value.content = HTML_ADVISORY.encode("utf-8") - advisories = list(importer.collect_advisories()) - assert len(advisories) == 2 + assert len(advisories) == 1 advisory = advisories[0] assert isinstance(advisory, AdvisoryData) assert advisory.advisory_id == "CVE-2022-1234" @@ -98,57 +94,34 @@ def test_collect_advisories(mock_get, importer): @patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") def test_collect_advisories_with_no_fixed_version(mock_get, importer): - no_fix_html = """ - - - - - - - - - - - - -
- CVE-2023-5678
- Announcement
-
9.5, 9.6Unpatched issue
- - - """ - - def side_effect(url, *args, **kwargs): - if "advisory" not in url: - return MagicMock(content=HTML_PAGE_WITH_LINKS.encode("utf-8")) - return MagicMock(content=no_fix_html.encode("utf-8")) - - mock_get.side_effect = side_effect - + mock_get.return_value.content = HTML_NO_FIX_ADVISORY.encode("utf-8") advisories = list(importer.collect_advisories()) - assert len(advisories) == 2 + assert len(advisories) == 1 advisory = advisories[0] assert advisory.advisory_id == "CVE-2023-5678" assert advisory.affected_packages[0].fixed_version is None assert advisory.affected_packages[0].affected_version_range.contains(SemverVersion("9.5")) + assert advisory.affected_packages[0].affected_version_range.contains(SemverVersion("9.6")) @patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") def test_cvss_parsing(mock_get, importer): - mock_get.side_effect = lambda url, *args, **kwargs: MagicMock( - content=HTML_ADVISORY.encode("utf-8") - ) - - importer.links = {"https://www.postgresql.org/support/security/advisory1.html"} - + mock_get.return_value.content = HTML_ADVISORY.encode("utf-8") advisories = list(importer.collect_advisories()) assert len(advisories) == 1 - reference = advisories[0].references_v2[0] - severity = advisories[0].severities[0] assert severity.system.identifier == "cvssv3" assert severity.value == "9.8" assert "AV:N/AC:L/PR:N/UI:N" in severity.scoring_elements + + +@patch("vulnerabilities.pipelines.v2_importers.postgresql_importer.requests.get") +def test_collect_links(mock_get, importer): + mock_get.return_value.content = HTML_PAGE_WITH_LINKS.encode("utf-8") + importer.collect_links() + + assert len(importer.links) == 3 + assert any("advisory1.html" in link for link in importer.links) + assert any("advisory2.html" in link for link in importer.links) From edbbc677c132c4fb9706be78f59ea9cf62a87b28 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 17 Jul 2025 12:53:20 +0530 Subject: [PATCH 11/13] Add mozilla importer 
Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 4 + .../v2_importers/mozilla_importer.py | 229 ++++++++++++++++++ .../pipelines/test_mozilla_importer_v2.py | 86 +++++++ 3 files changed, 319 insertions(+) create mode 100644 vulnerabilities/pipelines/v2_importers/mozilla_importer.py create mode 100644 vulnerabilities/tests/pipelines/test_mozilla_importer_v2.py diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 1bb4c43f1..6441f623a 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -48,9 +48,11 @@ ) from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 +from vulnerabilities.pipelines.v2_importers import mozilla_importer as mozilla_importer_v2 from vulnerabilities.pipelines.v2_importers import npm_importer as npm_importer_v2 from vulnerabilities.pipelines.v2_importers import nvd_importer as nvd_importer_v2 from vulnerabilities.pipelines.v2_importers import oss_fuzz as oss_fuzz_v2 +from vulnerabilities.pipelines.v2_importers import postgresql_importer as postgresql_importer_v2 from vulnerabilities.pipelines.v2_importers import pypa_importer as pypa_importer_v2 from vulnerabilities.pipelines.v2_importers import pysec_importer as pysec_importer_v2 from vulnerabilities.pipelines.v2_importers import vulnrichment_importer as vulnrichment_importer_v2 @@ -71,6 +73,8 @@ curl_importer_v2.CurlImporterPipeline, oss_fuzz_v2.OSSFuzzImporterPipeline, istio_importer_v2.IstioImporterPipeline, + postgresql_importer_v2.PostgreSQLImporterPipeline, + mozilla_importer_v2.MozillaImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/mozilla_importer.py b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py new file mode 
100644 index 000000000..e3e10538b --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py @@ -0,0 +1,229 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# +import json +import logging +import re +from pathlib import Path +from typing import Iterable + +import yaml +from bs4 import BeautifulSoup +from dateutil import parser as date_parser +from fetchcode.vcs import fetch_via_vcs +from markdown import markdown +from packageurl import PackageURL +from univers.versions import SemverVersion + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.importer import AffectedPackage +from vulnerabilities.importer import ReferenceV2 +from vulnerabilities.importer import VulnerabilitySeverity +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.severity_systems import GENERIC +from vulnerabilities.utils import get_advisory_url +from vulnerabilities.utils import is_cve +from vulnerabilities.utils import split_markdown_front_matter + +logger = logging.getLogger(__name__) + +MFSA_FILENAME_RE = re.compile(r"mfsa(\d{4}-\d{2,3})\.(md|yml)$") + + +class MozillaImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + Pipeline-based importer for Mozilla Foundation Security Advisories. 
+ """ + + pipeline_id = "mozilla_importer_v2" + repo_url = "git+https://github.com/mozilla/foundation-security-advisories" + spdx_license_expression = "MPL-2.0" + license_url = "https://github.com/mozilla/foundation-security-advisories/blob/master/LICENSE" + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_and_store_advisories, + ) + + def clone(self): + self.log(f"Cloning `{self.repo_url}`") + self.vcs_response = fetch_via_vcs(self.repo_url) + + def advisories_count(self) -> int: + base_path = Path(self.vcs_response.dest_dir) + yml = list((base_path / "announce").glob("**/*.yml")) + md = list((base_path / "announce").glob("**/*.md")) + return len(yml) + len(md) + + def collect_advisories(self) -> Iterable[AdvisoryData]: + base_path = Path(self.vcs_response.dest_dir) + advisory_dir = base_path / "announce" + + for file_path in advisory_dir.glob("**/*"): + if file_path.suffix not in [".yml", ".md"]: + continue + yield from parse_advisory(file_path, base_path) + + +def parse_advisory(file_path: Path, base_path: Path) -> Iterable[AdvisoryData]: + advisory_url = get_advisory_url( + file=file_path, + base_path=base_path, + url="https://github.com/mozilla/foundation-security-advisories/blob/master/", + ) + + mfsa_id = mfsa_id_from_filename(file_path.name) + if not mfsa_id: + return [] + + with open(file_path) as lines: + if file_path.suffix == ".md": + yield from parse_md_advisory(mfsa_id, lines, advisory_url) + elif file_path.suffix == ".yml": + yield from parse_yml_advisory(mfsa_id, lines, advisory_url) + + +def parse_yml_advisory(mfsa_id, lines, advisory_url) -> Iterable[AdvisoryData]: + data = yaml.safe_load(lines) + + affected_packages = list(parse_affected_packages(data.get("fixed_in") or [])) + reference = ReferenceV2( + url=f"https://www.mozilla.org/en-US/security/advisories/{mfsa_id}", + ) + severity = get_severity_from_impact(data.get("impact"), url=reference.url) + date_published = data.get("announced") + mfsa_summary = 
data.get("description", "") + mfsa_summary = BeautifulSoup(mfsa_summary, features="lxml").get_text() + + advisories = data.get("advisories", {}) + + if not advisories: + yield AdvisoryData( + advisory_id=mfsa_id, + aliases=[], + summary=mfsa_summary, + affected_packages=affected_packages, + references_v2=[reference], + severities=[severity], + url=advisory_url, + date_published=date_parser.parse(date_published) if date_published else None, + original_advisory_text=json.dumps(data, indent=2, ensure_ascii=False), + ) + + for cve, advisory in advisories.items(): + if not is_cve(cve): + continue + + advisory_summary = BeautifulSoup( + advisory.get("description", ""), features="lxml" + ).get_text() + impact = advisory.get("impact", "") + advisory_severity = get_severity_from_impact(impact, url=reference.url) + + yield AdvisoryData( + advisory_id=f"{mfsa_id}/{cve}", + aliases=[cve], + summary=mfsa_summary + "\n" + advisory_summary, + affected_packages=affected_packages, + references_v2=[reference], + url=advisory_url, + severities=[advisory_severity], + date_published=date_parser.parse(date_published) if date_published else None, + original_advisory_text=json.dumps(advisory, indent=2, ensure_ascii=False), + ) + + +def parse_md_advisory(mfsa_id, lines, advisory_url) -> Iterable[AdvisoryData]: + yamltext, mdtext = split_markdown_front_matter(lines.read()) + data = yaml.safe_load(yamltext) + + affected_packages = list(parse_affected_packages(data.get("fixed_in") or [])) + reference = ReferenceV2( + url=f"https://www.mozilla.org/en-US/security/advisories/{mfsa_id}", + ) + severity = get_severity_from_impact(data.get("impact"), url=reference.url) + description = extract_description_from_html(mdtext) + + yield AdvisoryData( + advisory_id=mfsa_id, + aliases=[], + summary=description, + affected_packages=affected_packages, + references_v2=[reference], + severities=[severity], + url=advisory_url, + date_published=date_parser.parse(data.get("announced")) if data.get("announced") 
else None, + original_advisory_text=json.dumps(data, indent=2, ensure_ascii=False), + ) + + +def extract_description_from_html(md_text: str) -> str: + html = markdown(md_text) + soup = BeautifulSoup(html, features="lxml") + h3tag = soup.find("h3", string=lambda s: s and s.lower() == "description") + if not h3tag: + return "" + + description_parts = [] + for sibling in h3tag.find_next_siblings(): + if sibling.name != "p": + break + description_parts.append(sibling.get_text()) + + return "\n".join(description_parts).strip() + + +def parse_affected_packages(pkgs: list) -> Iterable[AffectedPackage]: + for pkg in pkgs: + if not pkg: + continue + + name, _, version = pkg.rpartition(" ") + if version.count(".") == 3: + continue # invalid SemVer + try: + fixed_version = SemverVersion(version) + except Exception: + logger.debug(f"Invalid version '{version}' for package '{name}'") + continue + + yield AffectedPackage( + package=PackageURL(type="mozilla", name=name), + fixed_version=fixed_version, + ) + + +def get_reference_and_severity(mfsa_id: str, impact: str) -> ReferenceV2: + return ReferenceV2( + url=f"https://www.mozilla.org/en-US/security/advisories/{mfsa_id}", + ) + + +def mfsa_id_from_filename(filename: str) -> str | None: + match = MFSA_FILENAME_RE.search(filename) + return f"mfsa{match.group(1)}" if match else None + + +def get_severity_from_impact(impact: str, url=None) -> VulnerabilitySeverity: + """ + Extracts the severity from the impact string. 
+ """ + impact = (impact or "").lower() + if impact == "moderate": + impact = "medium" + severities = ["critical", "high", "medium", "low", "none"] + severity_value = "none" + + for level in severities: + if level in impact: + severity_value = level + break + + return VulnerabilitySeverity(system=GENERIC, value=severity_value, url=url) diff --git a/vulnerabilities/tests/pipelines/test_mozilla_importer_v2.py b/vulnerabilities/tests/pipelines/test_mozilla_importer_v2.py new file mode 100644 index 000000000..556a609ac --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_mozilla_importer_v2.py @@ -0,0 +1,86 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# +import json +from pathlib import Path +from textwrap import dedent + +from vulnerabilities.pipelines.v2_importers.mozilla_importer import extract_description_from_html +from vulnerabilities.pipelines.v2_importers.mozilla_importer import get_severity_from_impact +from vulnerabilities.pipelines.v2_importers.mozilla_importer import mfsa_id_from_filename +from vulnerabilities.pipelines.v2_importers.mozilla_importer import parse_affected_packages +from vulnerabilities.pipelines.v2_importers.mozilla_importer import parse_md_advisory +from vulnerabilities.pipelines.v2_importers.mozilla_importer import parse_yml_advisory + + +def test_mfsa_id_from_filename(): + assert mfsa_id_from_filename("mfsa2022-01.md") == "mfsa2022-01" + assert mfsa_id_from_filename("mfsa2022-099.yml") == "mfsa2022-099" + assert mfsa_id_from_filename("notmfsa.txt") is None + + +def test_get_severity_from_impact(): + assert get_severity_from_impact("Critical").value == "critical" + assert 
get_severity_from_impact("Moderate").value == "medium" + assert get_severity_from_impact("Low").value == "low" + assert get_severity_from_impact("Random Text").value == "none" + assert get_severity_from_impact(None).value == "none" + + +def test_extract_description_from_html(): + md_text = dedent( + """ + ### Description + + This vulnerability affects Firefox. + + It could allow attackers to execute arbitrary code. + + ### Impact + + Critical + """ + ) + expected = ( + "This vulnerability affects Firefox.\nIt could allow attackers to execute arbitrary code." + ) + assert extract_description_from_html(md_text) == expected + + +def test_parse_affected_packages_valid(): + packages = ["firefox 89.0", "thunderbird 78.10"] + result = list(parse_affected_packages(packages)) + assert len(result) == 2 + assert result[0].package.name == "firefox" + assert str(result[0].fixed_version) == "89.0.0" + + +def test_parse_affected_packages_invalid(): + packages = ["firefox 89.0.0.1", "invalidpackage"] + result = list(parse_affected_packages(packages)) + assert len(result) == 0 # invalid SemVer or malformed + + +def test_parse_yml_advisory(tmp_path: Path): + advisory = { + "announced": "2022-01-01", + "description": "

This is a test

", + "impact": "High", + "fixed_in": ["firefox 89.0"], + "advisories": { + "CVE-2022-1234": {"description": "

Memory safety issue

", "impact": "Critical"} + }, + } + file = tmp_path / "mfsa2022-01.yml" + file.write_text(json.dumps(advisory)) + + results = list( + parse_yml_advisory("mfsa2022-01", file.open(), advisory_url="https://example.com") + ) + assert len(results) == 1 or len(results) == 2 + assert all(isinstance(r.summary, str) for r in results) From 4c2070908f6b642ad19c38c42873839f931c45f1 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 17 Jul 2025 17:20:55 +0530 Subject: [PATCH 12/13] Fix tests Signed-off-by: Tushar Goel --- vulnerabilities/importers/__init__.py | 2 + .../v2_importers/elixir_security_importer.py | 3 + .../pipelines/v2_importers/github_importer.py | 395 ------------------ .../v2_importers/github_osv_importer.py | 89 ++++ .../v2_importers/mozilla_importer.py | 9 + .../pipelines/test_github_importer_v2.py | 168 -------- .../pipelines/test_github_osv_importer_v2.py | 69 +++ 7 files changed, 172 insertions(+), 563 deletions(-) delete mode 100644 vulnerabilities/pipelines/v2_importers/github_importer.py create mode 100644 vulnerabilities/pipelines/v2_importers/github_osv_importer.py delete mode 100644 vulnerabilities/tests/pipelines/test_github_importer_v2.py create mode 100644 vulnerabilities/tests/pipelines/test_github_osv_importer_v2.py diff --git a/vulnerabilities/importers/__init__.py b/vulnerabilities/importers/__init__.py index 6441f623a..706ca3c07 100644 --- a/vulnerabilities/importers/__init__.py +++ b/vulnerabilities/importers/__init__.py @@ -46,6 +46,7 @@ from vulnerabilities.pipelines.v2_importers import ( elixir_security_importer as elixir_security_importer_v2, ) +from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2 from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2 from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2 from vulnerabilities.pipelines.v2_importers import mozilla_importer as mozilla_importer_v2 @@ -75,6 +76,7 @@ 
istio_importer_v2.IstioImporterPipeline, postgresql_importer_v2.PostgreSQLImporterPipeline, mozilla_importer_v2.MozillaImporterPipeline, + github_osv_importer_v2.GithubOSVImporterPipeline, nvd_importer.NVDImporterPipeline, github_importer.GitHubAPIImporterPipeline, gitlab_importer.GitLabImporterPipeline, diff --git a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py index 64c31cb45..0c47b9c1a 100644 --- a/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py +++ b/vulnerabilities/pipelines/v2_importers/elixir_security_importer.py @@ -65,6 +65,9 @@ def collect_advisories(self) -> Iterable[AdvisoryData]: if self.vcs_response: self.vcs_response.delete() + def on_failure(self): + self.clean_downloads() + def process_file(self, file, base_path) -> Iterable[AdvisoryData]: relative_path = str(file.relative_to(base_path)).strip("/") path_segments = str(file).split("/") diff --git a/vulnerabilities/pipelines/v2_importers/github_importer.py b/vulnerabilities/pipelines/v2_importers/github_importer.py deleted file mode 100644 index 3101679e6..000000000 --- a/vulnerabilities/pipelines/v2_importers/github_importer.py +++ /dev/null @@ -1,395 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# VulnerableCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import json -import logging -from traceback import format_exc as traceback_format_exc -from typing import Callable -from typing import Iterable -from typing import List -from typing import Optional - -from cwe2.database import Database -from dateutil import parser as dateparser -from packageurl import PackageURL -from univers.version_range import RANGE_CLASS_BY_SCHEMES -from univers.version_range import build_range_from_github_advisory_constraint - -from vulnerabilities import severity_systems -from vulnerabilities import utils -from vulnerabilities.importer import AdvisoryData -from vulnerabilities.importer import AffectedPackage -from vulnerabilities.importer import ReferenceV2 -from vulnerabilities.importer import VulnerabilitySeverity -from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 -from vulnerabilities.utils import dedupe -from vulnerabilities.utils import get_cwe_id -from vulnerabilities.utils import get_item - - -class GitHubAPIImporterPipeline(VulnerableCodeBaseImporterPipelineV2): - """ - GitHub Importer Pipeline - - This pipeline imports security advisories from GitHub Security Advisories. 
- """ - - pipeline_id = "github_importer_v2" - spdx_license_expression = "CC-BY-4.0" - license_url = "https://github.com/github/advisory-database/blob/main/LICENSE.md" - unfurl_version_ranges = True - - ignorable_versions = frozenset( - [ - "0.1-bulbasaur", - "0.1-charmander", - "0.3m1", - "0.3m2", - "0.3m3", - "0.3m4", - "0.3m5", - "0.4m1", - "0.4m2", - "0.4m3", - "0.4m4", - "0.4m5", - "0.5m1", - "0.5m2", - "0.5m3", - "0.5m4", - "0.5m5", - "0.6m1", - "0.6m2", - "0.6m3", - "0.6m4", - "0.6m5", - "0.6m6", - "0.7.10p1", - "0.7.11p1", - "0.7.11p2", - "0.7.11p3", - "0.8.1p1", - "0.8.3p1", - "0.8.4p1", - "0.8.4p2", - "0.8.6p1", - "0.8.7p1", - "0.9-doduo", - "0.9-eevee", - "0.9-fearow", - "0.9-gyarados", - "0.9-horsea", - "0.9-ivysaur", - "2013-01-21T20:33:09+0100", - "2013-01-23T17:11:52+0100", - "2013-02-01T20:50:46+0100", - "2013-02-02T19:59:03+0100", - "2013-02-02T20:23:17+0100", - "2013-02-08T17:40:57+0000", - "2013-03-27T16:32:26+0100", - "2013-05-09T12:47:53+0200", - "2013-05-10T17:55:56+0200", - "2013-05-14T20:16:05+0200", - "2013-06-01T10:32:51+0200", - "2013-07-19T09:11:08+0000", - "2013-08-12T21:48:56+0200", - "2013-09-11T19-27-10", - "2013-12-23T17-51-15", - "2014-01-12T15-52-10", - "2.0.1rc2-git", - "3.0.0b3-", - "3.0b6dev-r41684", - "-class.-jw.util.version.Version-", - "vulnerabilities", - ] - ) - - @classmethod - def steps(cls): - return (cls.collect_and_store_advisories,) - - package_type_by_github_ecosystem = { - "MAVEN": "maven", - "NUGET": "nuget", - "COMPOSER": "composer", - "PIP": "pypi", - "RUBYGEMS": "gem", - "NPM": "npm", - "RUST": "cargo", - # "GO": "golang", - } - - def advisories_count(self): - advisory_query = """ - query{ - securityVulnerabilities(first: 0, ecosystem: %s) { - totalCount - } - } - """ - advisory_counts = 0 - for ecosystem in self.package_type_by_github_ecosystem.keys(): - graphql_query = {"query": advisory_query % (ecosystem)} - response = utils.fetch_github_graphql_query(graphql_query) - advisory_counts += get_item(response, 
"data", "securityVulnerabilities", "totalCount") - return advisory_counts - - def collect_advisories(self) -> Iterable[AdvisoryData]: - - # TODO: We will try to gather more info from GH API - # Check https://github.com/nexB/vulnerablecode/issues/1039#issuecomment-1366458885 - # Check https://github.com/nexB/vulnerablecode/issues/645 - # set of all possible values of first '%s' = {'MAVEN','COMPOSER', 'NUGET', 'RUBYGEMS', 'PYPI', 'NPM', 'RUST'} - # second '%s' is interesting, it will have the value '' for the first request, - advisory_query = """ - query{ - securityVulnerabilities(first: 100, ecosystem: %s, %s) { - edges { - node { - advisory { - identifiers { - type - value - } - summary - references { - url - } - severity - cwes(first: 10){ - nodes { - cweId - } - } - publishedAt - } - firstPatchedVersion{ - identifier - } - package { - name - } - vulnerableVersionRange - } - } - pageInfo { - hasNextPage - endCursor - } - } - } - """ - for ecosystem, package_type in self.package_type_by_github_ecosystem.items(): - end_cursor_exp = "" - while True: - graphql_query = {"query": advisory_query % (ecosystem, end_cursor_exp)} - response = utils.fetch_github_graphql_query(graphql_query) - - page_info = get_item(response, "data", "securityVulnerabilities", "pageInfo") - end_cursor = get_item(page_info, "endCursor") - if end_cursor: - end_cursor = f'"{end_cursor}"' - end_cursor_exp = f"after: {end_cursor}" - - yield from process_response(response, package_type=package_type) - - if not get_item(page_info, "hasNextPage"): - break - - -def get_purl(pkg_type: str, github_name: str, logger: Callable = None) -> Optional[PackageURL]: - """ - Return a PackageURL by splitting the `github_name` using the `pkg_type` - convention. Return None and log an error if we can not split or it is an - unknown package type. 
- - For example:: - >>> expected = PackageURL(type='maven', namespace='org.apache.commons', name='commons-lang3') - >>> assert get_purl("maven", "org.apache.commons:commons-lang3") == expected - - >>> expected = PackageURL(type="composer", namespace="foo", name="bar") - >>> assert get_purl("composer", "foo/bar") == expected - """ - if pkg_type == "maven": - if ":" not in github_name: - if logger: - logger(f"get_purl: Invalid maven package name {github_name}", level=logging.ERROR) - return - ns, _, name = github_name.partition(":") - return PackageURL(type=pkg_type, namespace=ns, name=name) - - if pkg_type in ("composer", "npm"): - if "/" not in github_name: - return PackageURL(type=pkg_type, name=github_name) - vendor, _, name = github_name.partition("/") - return PackageURL(type=pkg_type, namespace=vendor, name=name) - - if pkg_type in ("nuget", "pypi", "gem", "golang", "npm", "cargo"): - return PackageURL(type=pkg_type, name=github_name) - - if logger: - logger(f"get_purl: Unknown package type {pkg_type}", level=logging.ERROR) - - -def process_response( - resp: dict, package_type: str, logger: Callable = None -) -> Iterable[AdvisoryData]: - """ - Yield `AdvisoryData` by taking `resp` and `ecosystem` as input - """ - vulnerabilities = get_item(resp, "data", "securityVulnerabilities", "edges") or [] - if not vulnerabilities: - if logger: - logger( - f"No vulnerabilities found for package_type: {package_type!r} in response: {resp!r}", - level=logging.ERROR, - ) - return - - for vulnerability in vulnerabilities: - aliases = [] - affected_packages = [] - github_advisory = get_item(vulnerability, "node") - if not github_advisory: - if logger: - logger(f"No node found in {vulnerability!r}", level=logging.ERROR) - continue - - advisory = get_item(github_advisory, "advisory") - if not advisory: - if logger: - logger(f"No advisory found in {github_advisory!r}", level=logging.ERROR) - continue - - summary = get_item(advisory, "summary") or "" - - references = 
get_item(advisory, "references") or [] - if references: - urls = (ref["url"] for ref in references) - references = [ReferenceV2.from_url(u) for u in urls] - - date_published = get_item(advisory, "publishedAt") - if date_published: - date_published = dateparser.parse(date_published) - - name = get_item(github_advisory, "package", "name") - if name: - purl = get_purl(pkg_type=package_type, github_name=name, logger=logger) - if purl: - affected_range = get_item(github_advisory, "vulnerableVersionRange") - fixed_version = get_item(github_advisory, "firstPatchedVersion", "identifier") - if affected_range: - try: - affected_range = build_range_from_github_advisory_constraint( - package_type, affected_range - ) - except Exception as e: - if logger: - logger( - f"Could not parse affected range {affected_range!r} {e!r} \n {traceback_format_exc()}", - level=logging.ERROR, - ) - affected_range = None - if fixed_version: - try: - fixed_version = RANGE_CLASS_BY_SCHEMES[package_type].version_class( - fixed_version - ) - except Exception as e: - if logger: - logger( - f"Invalid fixed version {fixed_version!r} {e!r} \n {traceback_format_exc()}", - level=logging.ERROR, - ) - fixed_version = None - if affected_range or fixed_version: - affected_packages.append( - AffectedPackage( - package=purl, - affected_version_range=affected_range, - fixed_version=fixed_version, - ) - ) - identifiers = get_item(advisory, "identifiers") or [] - ghsa_id = "" - severities = [] - for identifier in identifiers: - value = identifier["value"] - identifier_type = identifier["type"] - aliases.append(value) - # attach the GHSA with severity score - if identifier_type == "GHSA": - # Each Node has only one GHSA, hence exit after attaching - # score to this GHSA - ghsa_id = value - for ref in references: - if ref.reference_id == value: - severity = get_item(advisory, "severity") - if severity: - severities = [ - VulnerabilitySeverity( - system=severity_systems.CVSS31_QUALITY, - value=severity, - url=ref.url, 
- ) - ] - - elif identifier_type == "CVE": - pass - else: - if logger: - logger( - f"Unknown identifier type {identifier_type!r} and value {value!r}", - level=logging.ERROR, - ) - - weaknesses = get_cwes_from_github_advisory(advisory, logger) - - advisory_id = None - - aliases = sorted(dedupe(aliases)) - - advisory_id = ghsa_id or aliases[0] - - aliases.remove(advisory_id) - - yield AdvisoryData( - advisory_id=ghsa_id, - aliases=aliases, - summary=summary, - references_v2=references, - severities=severities, - affected_packages=affected_packages, - date_published=date_published, - weaknesses=weaknesses, - url=f"https://github.com/advisories/{ghsa_id}", - original_advisory_text=json.dumps(github_advisory, indent=2, ensure_ascii=False), - ) - - -def get_cwes_from_github_advisory(advisory, logger=None) -> List[int]: - """ - Return the cwe-id list from advisory ex: [ 522 ] - by extracting the cwe_list from advisory ex: [{'cweId': 'CWE-522'}] - then remove the CWE- from string and convert it to integer 522 and Check if the CWE in CWE-Database - """ - weaknesses = [] - db = Database() - cwe_list = get_item(advisory, "cwes", "nodes") or [] - for cwe_item in cwe_list: - cwe_string = get_item(cwe_item, "cweId") - if cwe_string: - cwe_id = get_cwe_id(cwe_string) - try: - db.get(cwe_id) - weaknesses.append(cwe_id) - except Exception as e: - if logger: - logger(f"Invalid CWE id {e!r} \n {traceback_format_exc()}", level=logging.ERROR) - return weaknesses diff --git a/vulnerabilities/pipelines/v2_importers/github_osv_importer.py b/vulnerabilities/pipelines/v2_importers/github_osv_importer.py new file mode 100644 index 000000000..ef96e4a84 --- /dev/null +++ b/vulnerabilities/pipelines/v2_importers/github_osv_importer.py @@ -0,0 +1,89 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. 
+# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +from pathlib import Path +from typing import Iterable + +from fetchcode.vcs import fetch_via_vcs + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 +from vulnerabilities.utils import get_advisory_url + + +class GithubOSVImporterPipeline(VulnerableCodeBaseImporterPipelineV2): + """ + GithubOSV Importer Pipeline + + Collect advisories from the GitHub Advisory Database repository. + """ + + pipeline_id = "github_osv_importer_v2" + spdx_license_expression = "CC-BY-4.0" + license_url = "https://github.com/github/advisory-database/blob/main/LICENSE.md" + repo_url = "git+https://github.com/github/advisory-database/" + unfurl_version_ranges = True + + @classmethod + def steps(cls): + return ( + cls.clone, + cls.collect_and_store_advisories, + cls.clean_downloads, + ) + + def clone(self): + self.log(f"Cloning `{self.repo_url}`") + self.vcs_response = fetch_via_vcs(self.repo_url) + + def advisories_count(self): + advisory_dir = Path(self.vcs_response.dest_dir) / "advisories/github-reviewed" + return sum(1 for _ in advisory_dir.rglob("*.json")) + + def collect_advisories(self) -> Iterable[AdvisoryData]: + from vulnerabilities.importers.osv import parse_advisory_data_v2 + + supported_ecosystems = [ + "pypi", + "npm", + "maven", + # "golang", + "composer", + "hex", + "gem", + "nuget", + "cargo", + ] + base_path = Path(self.vcs_response.dest_dir) + advisory_dir = base_path / "advisories/github-reviewed" + + for file in advisory_dir.rglob("*.json"): + advisory_url = get_advisory_url( + file=file, + base_path=base_path, + url="https://github.com/github/advisory-database/blob/main/", + ) + with open(file) as f: + raw_data = json.load(f) + advisory_text = file.read_text() + yield parse_advisory_data_v2( + raw_data=raw_data, + 
supported_ecosystems=supported_ecosystems, + advisory_url=advisory_url, + advisory_text=advisory_text, + ) + + def clean_downloads(self): + if self.vcs_response: + self.log("Removing cloned repository") + self.vcs_response.delete() + + def on_failure(self): + self.clean_downloads() diff --git a/vulnerabilities/pipelines/v2_importers/mozilla_importer.py b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py index e3e10538b..5baecd62c 100644 --- a/vulnerabilities/pipelines/v2_importers/mozilla_importer.py +++ b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py @@ -50,12 +50,21 @@ def steps(cls): return ( cls.clone, cls.collect_and_store_advisories, + cls.clean_downloads, ) def clone(self): self.log(f"Cloning `{self.repo_url}`") self.vcs_response = fetch_via_vcs(self.repo_url) + def clean_downloads(self): + if self.vcs_response: + self.log(f"Removing cloned repository") + self.vcs_response.delete() + + def on_failure(self): + self.clean_downloads() + def advisories_count(self) -> int: base_path = Path(self.vcs_response.dest_dir) yml = list((base_path / "announce").glob("**/*.yml")) diff --git a/vulnerabilities/tests/pipelines/test_github_importer_v2.py b/vulnerabilities/tests/pipelines/test_github_importer_v2.py deleted file mode 100644 index 4459f58cd..000000000 --- a/vulnerabilities/tests/pipelines/test_github_importer_v2.py +++ /dev/null @@ -1,168 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# VulnerableCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/vulnerablecode for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -from unittest.mock import patch - -import pytest -from packageurl import PackageURL - -from vulnerabilities.pipelines.v2_importers.github_importer import GitHubAPIImporterPipeline -from vulnerabilities.pipelines.v2_importers.github_importer import get_cwes_from_github_advisory -from vulnerabilities.pipelines.v2_importers.github_importer import get_purl -from vulnerabilities.utils import get_item - - -@pytest.fixture -def mock_fetch(): - with patch( - "vulnerabilities.pipelines.v2_importers.github_importer.utils.fetch_github_graphql_query" - ) as mock: - yield mock - - -def test_advisories_count(mock_fetch): - # Mock the GraphQL query response for advisory count - mock_fetch.return_value = {"data": {"securityVulnerabilities": {"totalCount": 10}}} - - pipeline = GitHubAPIImporterPipeline() - - count = pipeline.advisories_count() - - # Assert that the count is correct - assert count == 70 - - -def test_collect_advisories(mock_fetch): - # Mock advisory data for GitHub - advisory_data = { - "data": { - "securityVulnerabilities": { - "edges": [ - { - "node": { - "advisory": { - "identifiers": [{"type": "GHSA", "value": "GHSA-1234-ABCD"}], - "summary": "Sample advisory description", - "references": [ - {"url": "https://github.com/advisories/GHSA-1234-ABCD"} - ], - "severity": "HIGH", - "cwes": {"nodes": [{"cweId": "CWE-123"}]}, - "publishedAt": "2023-01-01T00:00:00Z", - }, - "firstPatchedVersion": {"identifier": "1.2.3"}, - "package": {"name": "example-package"}, - "vulnerableVersionRange": ">=1.0.0,<=1.2.0", - } - } - ], - "pageInfo": {"hasNextPage": False, "endCursor": None}, - } - } - } - - # Mock the response from GitHub GraphQL query - mock_fetch.return_value = advisory_data - - # Instantiate the pipeline - pipeline = GitHubAPIImporterPipeline() - - # Collect advisories - advisories = list(pipeline.collect_advisories()) - - # Check if advisories were correctly parsed - assert len(advisories) == 7 - advisory = advisories[0] - - # Validate advisory fields - 
assert advisory.advisory_id == "GHSA-1234-ABCD" - assert advisory.summary == "Sample advisory description" - assert advisory.url == "https://github.com/advisories/GHSA-1234-ABCD" - assert len(advisory.references_v2) == 1 - assert advisory.references_v2[0].reference_id == "GHSA-1234-ABCD" - assert advisory.severities[0].value == "HIGH" - # Check CWE extraction - assert advisory.weaknesses == [123] - - -def test_get_purl(mock_fetch): - # Test for package URL generation - result = get_purl("cargo", "example/package-name") - - # Validate that the correct PackageURL is generated - assert isinstance(result, PackageURL) - assert result.type == "cargo" - assert result.namespace == None - assert result.name == "example/package-name" - - -def test_process_response(mock_fetch): - # Mock advisory data as input for the process_response function - advisory_data = { - "data": { - "securityVulnerabilities": { - "edges": [ - { - "node": { - "advisory": { - "identifiers": [{"type": "GHSA", "value": "GHSA-5678-EFGH"}], - "summary": "Another advisory", - "references": [ - {"url": "https://github.com/advisories/GHSA-5678-EFGH"} - ], - "severity": "MEDIUM", - "cwes": {"nodes": [{"cweId": "CWE-200"}]}, - "publishedAt": "2023-02-01T00:00:00Z", - }, - "firstPatchedVersion": {"identifier": "2.0.0"}, - "package": {"name": "another-package"}, - "vulnerableVersionRange": ">=2.0.0,<=3.0.0", - } - } - ], - "pageInfo": {"hasNextPage": False, "endCursor": None}, - } - } - } - - # Mock the response from GitHub GraphQL query - mock_fetch.return_value = advisory_data - - # Process the mock response - result = list(GitHubAPIImporterPipeline().collect_advisories()) - - # Check the results - assert len(result) == 7 - advisory = result[0] - - # Validate the advisory data - assert advisory.advisory_id == "GHSA-5678-EFGH" - assert advisory.summary == "Another advisory" - assert advisory.url == "https://github.com/advisories/GHSA-5678-EFGH" - - # Check CWE extraction - assert advisory.weaknesses == [200] - 
- -def test_get_cwes_from_github_advisory(mock_fetch): - # Mock CWEs extraction from GitHub advisory - advisory_data = {"cwes": {"nodes": [{"cweId": "CWE-522"}]}} - - cwes = get_cwes_from_github_advisory(advisory_data) - - # Validate the CWE ID extraction - assert cwes == [522] - - -def test_invalid_package_type_in_get_purl(mock_fetch): - # Test for invalid package type - result = get_purl("invalidpkg", "example/package-name") - - # Assert that None is returned for an invalid package type - assert result is None diff --git a/vulnerabilities/tests/pipelines/test_github_osv_importer_v2.py b/vulnerabilities/tests/pipelines/test_github_osv_importer_v2.py new file mode 100644 index 000000000..422d26a14 --- /dev/null +++ b/vulnerabilities/tests/pipelines/test_github_osv_importer_v2.py @@ -0,0 +1,69 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# VulnerableCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/vulnerablecode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +from pathlib import Path + +import pytest + +from vulnerabilities.importer import AdvisoryData +from vulnerabilities.pipelines.v2_importers.github_osv_importer import GithubOSVImporterPipeline + + +@pytest.fixture +def sample_osv_advisory(tmp_path: Path): + advisory_data = { + "id": "GHSA-xxxx-yyyy-zzzz", + "summary": "Sample summary", + "details": "Sample details", + "aliases": ["CVE-2021-99999"], + "affected": [ + { + "package": {"name": "sample", "ecosystem": "pypi"}, + "ranges": [ + {"type": "ECOSYSTEM", "events": [{"introduced": "1.0.0"}, {"fixed": "1.2.0"}]} + ], + "versions": ["1.0.0", "1.1.0"], + } + ], + "database_specific": {"cwe_ids": ["CWE-79"]}, + } + + advisory_dir = tmp_path / "advisories/github-reviewed/sample_project" + advisory_dir.mkdir(parents=True) + + advisory_file = advisory_dir / "GHSA-xxxx-yyyy-zzzz.json" + advisory_file.write_text(json.dumps(advisory_data, indent=2)) + + return tmp_path, advisory_file.read_text(), advisory_data + + +def test_collect_advisories_from_github_osv(monkeypatch, sample_osv_advisory): + tmp_path, advisory_text, advisory_json = sample_osv_advisory + + class DummyVCSResponse: + dest_dir = str(tmp_path) + + def delete(self): + pass + + importer = GithubOSVImporterPipeline() + importer.vcs_response = DummyVCSResponse() + + advisories = list(importer.collect_advisories()) + assert len(advisories) == 1 + + advisory = advisories[0] + assert isinstance(advisory, AdvisoryData) + assert advisory.advisory_id == "GHSA-xxxx-yyyy-zzzz" + assert "CVE-2021-99999" in advisory.aliases + assert advisory.summary.startswith("Sample") + assert advisory.original_advisory_text.strip().startswith("{") + assert advisory.affected_packages + assert advisory.affected_packages[0].package.type == "pypi" From 3479b49c9d0a2ccfaff36237ae1430993f8259e6 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 17 Jul 2025 17:29:14 +0530 Subject: [PATCH 13/13] Fix linting errors Signed-off-by: Tushar Goel --- 
vulnerabilities/pipelines/v2_importers/mozilla_importer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vulnerabilities/pipelines/v2_importers/mozilla_importer.py b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py index 5baecd62c..668b6a498 100644 --- a/vulnerabilities/pipelines/v2_importers/mozilla_importer.py +++ b/vulnerabilities/pipelines/v2_importers/mozilla_importer.py @@ -215,7 +215,7 @@ def get_reference_and_severity(mfsa_id: str, impact: str) -> ReferenceV2: ) -def mfsa_id_from_filename(filename: str) -> str | None: +def mfsa_id_from_filename(filename: str): match = MFSA_FILENAME_RE.search(filename) return f"mfsa{match.group(1)}" if match else None