import re
import shutil
import tempfile
from collections import defaultdict

from git import Repo

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import AffectedPackageV2
from vulnerabilities.importer import PackageCommitPatchData
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2

SECURITY_PATTERNS = [
    r"\bCVE-\d{4}-\d{4,19}\b",
    r"\bGHSA-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}\b",
    r"\bPYSEC-\d{4}-\d{1,6}\b",
    r"\bXSA-\d{1,4}\b",
]

# Compiled once at import time: the patterns are applied to every commit of a
# repository, so the compilation must not sit inside the per-commit loop.
_COMPILED_SECURITY_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE) for pattern in SECURITY_PATTERNS
)


def _normalize_vulnerability_id(vuln_id):
    """
    Return ``vuln_id`` in one canonical casing.

    Matching is case-insensitive, so without this step "cve-2021-0001" and
    "CVE-2021-0001" would be grouped as two different vulnerabilities.
    The prefix is upper-cased; the GHSA suffix is lower-cased (the usual
    published GHSA form), every other suffix is upper-cased.
    """
    prefix, _, suffix = vuln_id.partition("-")
    prefix = prefix.upper()
    suffix = suffix.lower() if prefix == "GHSA" else suffix.upper()
    return f"{prefix}-{suffix}"


def _commit_message_text(commit):
    """
    Return the message of ``commit`` as ``str``.

    GitPython types ``Commit.message`` as ``str | bytes``; bytes are decoded
    leniently so that one badly encoded commit cannot abort the whole run.
    """
    message = commit.message
    if isinstance(message, bytes):
        return message.decode("utf-8", errors="replace")
    return message


class CollectRepoFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    The repository is cloned into a temporary directory, every commit message
    is scanned for known vulnerability identifiers (see ``SECURITY_PATTERNS``)
    and one advisory is produced per identifier, listing the commits that
    mention it.
    """

    pipeline_id = "collect_fix_commit"

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """
        Clone the repository given by the ``repo_url`` input.

        Sets ``self.repo_url``, ``self.purl`` and ``self.repo``.
        Raise ValueError when ``repo_url`` is empty.
        """
        self.repo_url = self.inputs["repo_url"]
        if not self.repo_url:
            raise ValueError("Repo is required for CollectRepoFixCommitPipeline")

        self.purl = self.inputs["purl"]

        clone_dir = tempfile.mkdtemp()
        try:
            self.repo = Repo.clone_from(
                url=self.repo_url,
                to_path=clone_dir,
                bare=True,
                no_checkout=True,
                multi_options=["--filter=blob:none"],
            )
        except Exception:
            # ``self.repo`` is never assigned when the clone fails, so
            # ``clean_downloads`` could not find this directory: remove it
            # here instead of leaking it.
            shutil.rmtree(clone_dir, ignore_errors=True)
            raise

    def advisories_count(self) -> int:
        """Return 0: this pipeline does not compute an advisory count upfront."""
        return 0

    def extract_vulnerability_id(self, commit) -> list[str]:
        """
        Extract vulnerability ids from a commit message.

        Return the list of unique vulnerability IDs found in ``commit.message``
        in canonical casing. IDs are ordered by pattern (in the order of
        ``SECURITY_PATTERNS``), then by position in the message.
        """
        message = _commit_message_text(commit)

        # A dict is used as an insertion-ordered set: an ID mentioned several
        # times in one message must be reported only once.
        unique_ids = {}
        for pattern in _COMPILED_SECURITY_PATTERNS:
            for match in pattern.findall(message):
                unique_ids[_normalize_vulnerability_id(match)] = None
        return list(unique_ids)

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Return a mapping of ``vuln_id`` to a list of
        ``(commit_id, commit_message)`` tuples.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.extract_vulnerability_id(commit)
            if not matched_ids:
                continue

            entry = (commit.hexsha, _commit_message_text(commit).strip())
            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append(entry)

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Yield one AdvisoryData per vulnerability ID grouped with its related commits.

        Entries with an empty ID or without commits are skipped.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        for vuln_id, commits_data in grouped_commits.items():
            if not commits_data or not vuln_id:
                continue

            summary_lines = [f"{commit_hash}: {message}" for commit_hash, message in commits_data]
            summary = f"Commits fixing {vuln_id}:\n" + "\n".join(summary_lines)

            # De-duplicate while keeping the first-seen order: iterating a
            # plain set would make the order of the patches non-deterministic.
            unique_hashes = dict.fromkeys(commit_hash for commit_hash, _ in commits_data)
            affected_packages = [
                AffectedPackageV2(
                    package=self.purl,
                    fixed_by_commit_patches=[
                        PackageCommitPatchData(vcs_url=self.repo_url, commit_hash=commit_hash)
                        for commit_hash in unique_hashes
                    ],
                )
            ]

            yield AdvisoryData(
                advisory_id=vuln_id,
                summary=summary,
                affected_packages=affected_packages,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """
        Cleanup any temporary repository data.

        Safe to call several times: this runs both as the last step and from
        ``on_failure``, so a second call must be a no-op rather than an error.
        """
        self.log("Cleaning up local repository resources.")
        repo = getattr(self, "repo", None)
        if repo is None:
            return

        # Forget the repo first so that a repeated call returns early.
        self.repo = None
        working_dir = repo.working_dir
        # Release git resources held by GitPython before deleting the files.
        repo.close()
        if not working_dir:
            return

        try:
            shutil.rmtree(path=working_dir)
        except OSError as e:
            # Best-effort cleanup: report the problem, but do not let it mask
            # the error that triggered ``on_failure``.
            self.log(f"Failed to remove cloned repository at {working_dir}: {e}")

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
from pathlib import Path
from unittest import TestCase
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from packageurl import PackageURL

from vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits import (
    CollectRepoFixCommitPipeline,
)
from vulnerabilities.tests import util_tests


@pytest.fixture
def pipeline():
    """Return a pipeline with a fake repo URL and a mocked logger (no clone)."""
    pipeline = CollectRepoFixCommitPipeline()
    pipeline.repo_url = "https://github.com/test/repo"
    pipeline.log = MagicMock()
    return pipeline


def test_extract_vulnerability_id_extracts_ids(pipeline):
    """Ensure several IDs of different kinds are extracted from one message."""

    class DummyCommit:
        message = "Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq"

    result = pipeline.extract_vulnerability_id(DummyCommit)
    assert result == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"]


@patch("vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits.Repo")
def test_collect_fix_commits_groups_by_vuln(mock_repo, pipeline):
    """Ensure commits are grouped by the IDs found in their messages."""
    commit1 = MagicMock(message="Fix CVE-2021-0001", hexsha="abc123")
    commit2 = MagicMock(message="Patch GHSA-dead-beef-baad", hexsha="def456")
    commit3 = MagicMock(message="Unrelated change", hexsha="ghi789")

    pipeline.repo = MagicMock()
    pipeline.repo.iter_commits.return_value = [commit1, commit2, commit3]

    # The real ``extract_vulnerability_id`` is exercised on purpose: the
    # pipeline has no ``classify_commit_type`` method, so mocking that name
    # had no effect on the code under test.
    grouped = pipeline.collect_fix_commits()

    expected = {
        "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")],
        "GHSA-dead-beef-baad": [("def456", "Patch GHSA-dead-beef-baad")],
    }

    assert grouped == expected


TEST_DATA = Path(__file__).parent.parent.parent / "test_data" / "fix_commits"


class TestRepoFixCommitPipeline(TestCase):
    def test_collect_advisories_from_json(self):
        """Ensure advisories built from grouped commits match the expected JSON."""
        input_file = TEST_DATA / "grouped_commits_input.json"
        expected_file = TEST_DATA / "expected_linux_advisory_output.json"

        grouped_commits = json.loads(input_file.read_text(encoding="utf-8"))

        pipeline = CollectRepoFixCommitPipeline()
        pipeline.repo_url = "https://github.com/test/repo"
        pipeline.purl = PackageURL.from_string("pkg:generic/test")
        pipeline.log = MagicMock()
        pipeline.collect_fix_commits = MagicMock(return_value=grouped_commits)

        result = [adv.to_dict() for adv in pipeline.collect_advisories()]

        # NOTE(review): the third positional argument of
        # ``check_results_against_json`` is presumably the "regenerate the
        # fixture" flag; passing True would overwrite the expected file on
        # every run and make this assertion unable to fail. Rely on the
        # helper's default instead -- confirm against util_tests.
        util_tests.check_results_against_json(result, expected_file)


@pytest.mark.parametrize(
    "commit_message, expected_ids",
    [
        ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]),
        ("Address GHSA-abcd-1234-efgh report", ["GHSA-abcd-1234-efgh"]),
        ("Python security PYSEC-2021-12345 fix", ["PYSEC-2021-12345"]),
        ("Xen XSA-43 security update", ["XSA-43"]),
        (
            "Fix CVE-2023-1111 and GHSA-aaaa-bbbb-cccc in kernel",
            ["CVE-2023-1111", "GHSA-aaaa-bbbb-cccc"],
        ),
        ("Refactor logging system with no security ID", []),
    ],
)
def test_extract_vulnerability_id_detects_vuln_ids(pipeline, commit_message, expected_ids):
    """Ensure extract_vulnerability_id correctly extracts vulnerability IDs."""

    class DummyCommit:
        def __init__(self, message):
            self.message = message

    commit = DummyCommit(commit_message)
    result = pipeline.extract_vulnerability_id(commit)

    assert result == expected_ids, f"Unexpected result for message: {commit_message}"


def test_extract_vulnerability_id_case_insensitive(pipeline):
    """Ensure pattern matching is case-insensitive."""

    class DummyCommit:
        message = "fix cVe-2022-9999 and ghSa-dead-beef-baad"

    result = pipeline.extract_vulnerability_id(DummyCommit)
    # Compared upper-cased so the test does not depend on the casing in which
    # the matched IDs are returned.
    assert any("CVE-2022-9999" in r.upper() for r in result)
    assert any("GHSA-DEAD-BEEF-BAAD" in r.upper() for r in result)
b/vulnerabilities/tests/test_data/fix_commits/expected_linux_advisory_output.json @@ -0,0 +1,70 @@ +[ + { + "advisory_id": "CVE-2021-0001", + "aliases": [], + "summary": "Commits fixing CVE-2021-0001:\n41b43c74bda19753c757036673ea9db74acf494a: Fixed CVE-2025-59681 -- Protected QuerySet.annotate(), alias(), aggregate(), and extra() against SQL injection in column aliases on MySQL/MariaDB.", + "affected_packages": [ + { + "package": { + "type": "generic", + "namespace": "", + "name": "test", + "version": "", + "qualifiers": "", + "subpath": "" + }, + "affected_version_range": null, + "fixed_version_range": null, + "introduced_by_commit_patches": [], + "fixed_by_commit_patches": [ + { + "vcs_url": "https://github.com/test/repo", + "commit_hash": "41b43c74bda19753c757036673ea9db74acf494a", + "patch_text": null, + "patch_checksum": null + } + ] + } + ], + "references_v2": [], + "patches": [], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + }, + { + "advisory_id": "GHSA-dead-beef-baad", + "aliases": [], + "summary": "Commits fixing GHSA-dead-beef-baad:\n49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba: Fixed CVE-2024-53907 -- Mitigated potential DoS in strip_tags().", + "affected_packages": [ + { + "package": { + "type": "generic", + "namespace": "", + "name": "test", + "version": "", + "qualifiers": "", + "subpath": "" + }, + "affected_version_range": null, + "fixed_version_range": null, + "introduced_by_commit_patches": [], + "fixed_by_commit_patches": [ + { + "vcs_url": "https://github.com/test/repo", + "commit_hash": "49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba", + "patch_text": null, + "patch_checksum": null + } + ] + } + ], + "references_v2": [], + "patches": [], + "severities": [], + "date_published": null, + "weaknesses": [], + "url": "https://github.com/test/repo" + } +] \ No newline at end of file diff --git a/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json 
b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json new file mode 100644 index 000000000..f905c9710 --- /dev/null +++ b/vulnerabilities/tests/test_data/fix_commits/grouped_commits_input.json @@ -0,0 +1,8 @@ +{ + "CVE-2021-0001": [ + ["41b43c74bda19753c757036673ea9db74acf494a", "Fixed CVE-2025-59681 -- Protected QuerySet.annotate(), alias(), aggregate(), and extra() against SQL injection in column aliases on MySQL/MariaDB."] + ], + "GHSA-dead-beef-baad": [ + ["49ff1042aa66bb25eda87e9a8ef82f3b0ad4eeba", "Fixed CVE-2024-53907 -- Mitigated potential DoS in strip_tags()."] + ] +} \ No newline at end of file