Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions vulnerabilities/importers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
from vulnerabilities.pipelines.v2_importers import github_osv_importer as github_osv_importer_v2
from vulnerabilities.pipelines.v2_importers import gitlab_importer as gitlab_importer_v2
from vulnerabilities.pipelines.v2_importers import istio_importer as istio_importer_v2
from vulnerabilities.pipelines.v2_importers import libreoffice_importer as libreoffice_importer_v2
from vulnerabilities.pipelines.v2_importers import mattermost_importer as mattermost_importer_v2
from vulnerabilities.pipelines.v2_importers import mozilla_importer as mozilla_importer_v2
from vulnerabilities.pipelines.v2_importers import nginx_importer as nginx_importer_v2
Expand Down Expand Up @@ -118,6 +119,7 @@
retiredotnet_importer_v2.RetireDotnetImporterPipeline,
ubuntu_osv_importer_v2.UbuntuOSVImporterPipeline,
alpine_linux_importer_v2.AlpineLinuxImporterPipeline,
libreoffice_importer_v2.LibreOfficeImporterPipeline,
nvd_importer.NVDImporterPipeline,
github_importer.GitHubAPIImporterPipeline,
gitlab_importer.GitLabImporterPipeline,
Expand Down
154 changes: 154 additions & 0 deletions vulnerabilities/pipelines/v2_importers/libreoffice_importer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
import logging
import re
from typing import Iterable

import dateparser
import requests

from vulnerabilities.importer import AdvisoryDataV2
from vulnerabilities.importer import ReferenceV2
from vulnerabilities.importer import VulnerabilitySeverity
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2
from vulnerabilities.severity_systems import SCORING_SYSTEMS
from vulnerabilities.utils import get_cwe_id

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Listing page that the pipeline's fetch step downloads and scans for CVE IDs.
ADVISORIES_URL = "https://www.libreoffice.org/about-us/security/advisories/"
# Per-CVE endpoint template; `{cve_id}` is filled in for each ID before the request.
CVE_API_URL = "https://cveawg.mitre.org/api/cve/{cve_id}"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@NucleiAv This is incorrect. You are using two data sources: https://cveawg.mitre.org and https://www.libreoffice.org/about-us/security/advisories/. We should only use https://www.libreoffice.org/about-us/security/advisories/.

If https://www.libreoffice.org/about-us/security/advisories/ does not provide an API (feel free to do a deep search to confirm this), you should parse the HTML instead. Please take a look at other importers, such as the nginx importer: nginx_importer_v2.NginxImporterPipeline.

Copy link
Copy Markdown
Author

@NucleiAv NucleiAv Mar 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ziadhany Sure! I will incorporate the changes, but my concern was that HTML parsing won't work if the website layout changes. Moreover, the LibreOffice website does not provide details such as CVSS score, CVSS version (2.0, 3.x, 4.0), severity, or CWEs. To populate those details, I chose to use the API approach. I will check again whether LibreOffice provides an API and, if not, will modify the code to parse the HTML.

(As shown below, no details regarding CVSS, CWE, etc. are mentioned on the website.)
image


# Metric keys looked up in each CVE record metric entry, mapped to the scoring
# system the resulting severity is recorded under. The mapping is iterated in
# insertion order, so newer CVSS versions are checked first.
CVSS_KEY_MAP = {
    "cvssV4_0": SCORING_SYSTEMS["cvssv4"],
    "cvssV3_1": SCORING_SYSTEMS["cvssv3.1"],
    "cvssV3_0": SCORING_SYSTEMS["cvssv3"],
    "cvssV2_0": SCORING_SYSTEMS["cvssv2"],
}


class LibreOfficeImporterPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Collect LibreOffice security advisories via the CVE API.

    The CVE IDs are extracted from the LibreOffice advisories listing page
    (``ADVISORIES_URL``); the details of each CVE are then fetched from
    ``CVE_API_URL`` and converted with ``parse_cve_advisory``.
    """

    pipeline_id = "libreoffice_importer"
    spdx_license_expression = "LicenseRef-scancode-proprietary-license"
    license_url = "https://www.libreoffice.org/about-us/security/"
    precedence = 200

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        return (
            cls.fetch,
            cls.collect_and_store_advisories,
        )

    def fetch(self):
        """
        Download the advisories listing page and store the CVE IDs found in it
        as ``self.cve_ids``.

        Raises whatever ``requests`` raises on a connection failure or an HTTP
        error status: without the listing there is nothing to import.
        """
        self.log(f"Fetch `{ADVISORIES_URL}`")
        resp = requests.get(ADVISORIES_URL, timeout=30)
        resp.raise_for_status()
        self.cve_ids = parse_cve_ids(resp.text)

    def advisories_count(self):
        """Return the number of CVE IDs collected by ``fetch``."""
        return len(self.cve_ids)

    def collect_advisories(self) -> Iterable[AdvisoryDataV2]:
        """
        Yield one advisory per CVE ID collected by ``fetch``.

        Each CVE is handled on a best-effort basis: a failed request, an HTTP
        error status, or a response body that is not a JSON object is logged
        and skipped so that a single bad record does not abort the whole run.
        """
        for cve_id in self.cve_ids:
            url = CVE_API_URL.format(cve_id=cve_id)
            try:
                resp = requests.get(url, timeout=30)
                resp.raise_for_status()
                # Decode inside the ``try``: an invalid JSON body raises here and
                # must be treated like any other per-CVE fetch failure.
                data = resp.json()
            except Exception as e:
                # Deliberately broad: this is the per-CVE best-effort boundary.
                logger.error("Failed to fetch CVE API for %s: %s", cve_id, e)
                continue

            if not isinstance(data, dict):
                # The parser expects a JSON object; anything else is unusable.
                logger.error("Unexpected CVE API payload for %s: %r", cve_id, type(data))
                continue

            advisory = parse_cve_advisory(data, cve_id)
            if advisory:
                yield advisory


def parse_cve_ids(html: str) -> list:
    """
    Return the CVE IDs found in the LibreOffice advisories listing page.

    IDs are returned in order of first appearance, each one only once.
    """
    already_seen = set()
    unique_ids = []
    for match in re.finditer(r"CVE-\d{4}-\d+", html):
        cve_id = match.group(0)
        if cve_id in already_seen:
            continue
        already_seen.add(cve_id)
        unique_ids.append(cve_id)
    return unique_ids


def _parse_date_published(raw_date, advisory_id):
    """Return ``raw_date`` parsed with UTC settings, or None when empty or unparsable."""
    if not raw_date:
        return None
    parsed = dateparser.parse(
        raw_date,
        settings={"TIMEZONE": "UTC", "RETURN_AS_TIMEZONE_AWARE": True, "TO_TIMEZONE": "UTC"},
    )
    if parsed is None:
        logger.warning("Could not parse date %r for %s", raw_date, advisory_id)
    return parsed


def _extract_summary(cna):
    """Return the text of the first English description in ``cna``, or "" if there is none."""
    for desc in cna.get("descriptions") or []:
        lang = str(desc.get("lang") or "").lower()
        # Accept plain "en" and any regional variant ("en-US", "en-GB", "en-us", ...).
        if lang == "en" or lang.startswith("en-"):
            return desc.get("value") or ""
    return ""


def _extract_severities(cna):
    """Return one VulnerabilitySeverity per metric entry of ``cna`` that has a usable CVSS block."""
    severities = []
    for metric in cna.get("metrics") or []:
        for key, system in CVSS_KEY_MAP.items():
            cvss = metric.get(key)
            if not cvss:
                continue
            vector = cvss.get("vectorString") or ""
            score = cvss.get("baseScore")
            if vector and score is not None:
                severities.append(
                    VulnerabilitySeverity(
                        system=system,
                        value=str(score),
                        scoring_elements=vector,
                    )
                )
                # Only the first complete CVSS block of a metric entry is kept.
                break
    return severities


def _extract_weaknesses(cna, advisory_id):
    """Return the distinct CWE ids listed in the problem types of ``cna``, in order."""
    weaknesses = []
    for problem_type in cna.get("problemTypes") or []:
        for desc in problem_type.get("descriptions") or []:
            cwe_str = desc.get("cweId") or ""
            if not cwe_str.upper().startswith("CWE-"):
                continue
            try:
                cwe_id = get_cwe_id(cwe_str)
            except Exception as e:
                # Best effort: a malformed CWE must not discard the whole advisory,
                # but it is reported instead of being silently dropped.
                logger.warning("Skipping malformed CWE %r for %s: %s", cwe_str, advisory_id, e)
                continue
            if cwe_id not in weaknesses:
                weaknesses.append(cwe_id)
    return weaknesses


def _extract_references(cna):
    """Return a ReferenceV2 for every reference entry of ``cna`` that has a URL."""
    return [
        ReferenceV2(url=ref["url"]) for ref in cna.get("references") or [] if ref.get("url")
    ]


def parse_cve_advisory(data: dict, cve_id: str):
    """
    Parse a CVE 5.0 JSON record from cveawg.mitre.org into an AdvisoryDataV2.

    ``data`` is the decoded JSON record and ``cve_id`` is the ID that was
    requested; it is used as a fallback when the record carries no ``cveId``.
    Return None if no CVE ID is available from either source.

    Only the ``cna`` container is read. Missing or empty sections yield an
    empty summary, no severities, no weaknesses and no references.
    """
    cve_metadata = data.get("cveMetadata") or {}
    advisory_id = cve_metadata.get("cveId") or cve_id
    if not advisory_id:
        return None

    cna = (data.get("containers") or {}).get("cna") or {}

    advisory_url = (
        f"https://www.libreoffice.org/about-us/security/advisories/{advisory_id.lower()}/"
    )

    return AdvisoryDataV2(
        advisory_id=advisory_id,
        aliases=[],
        summary=_extract_summary(cna),
        affected_packages=[],
        references=_extract_references(cna),
        date_published=_parse_date_published(
            cve_metadata.get("datePublished") or "", advisory_id
        ),
        weaknesses=_extract_weaknesses(cna, advisory_id),
        severities=_extract_severities(cna),
        url=advisory_url,
        original_advisory_text=json.dumps(data, indent=2, ensure_ascii=False),
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

import json
import os
from unittest import TestCase
from unittest.mock import MagicMock
from unittest.mock import patch

from vulnerabilities.pipelines.v2_importers.libreoffice_importer import LibreOfficeImporterPipeline
from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_cve_advisory
from vulnerabilities.pipelines.v2_importers.libreoffice_importer import parse_cve_ids

# Directory holding the LibreOffice fixtures, resolved relative to this test
# module: two levels up, then test_data/libreoffice.
TEST_DATA = os.path.join(os.path.dirname(__file__), "..", "..", "test_data", "libreoffice")


def load_json(filename):
    """Return the decoded content of the JSON fixture ``filename`` under TEST_DATA."""
    fixture_path = os.path.join(TEST_DATA, filename)
    with open(fixture_path, encoding="utf-8") as fixture:
        content = json.load(fixture)
    return content


def load_html(filename):
    """Return the text of the HTML fixture ``filename`` under TEST_DATA."""
    fixture_path = os.path.join(TEST_DATA, filename)
    with open(fixture_path, encoding="utf-8") as fixture:
        markup = fixture.read()
    return markup


class TestParseCveIds(TestCase):
    """Tests for parse_cve_ids() on advisories listing markup."""

    def test_extracts_cve_ids_from_html(self):
        found = parse_cve_ids(load_html("advisories.html"))
        for expected in ("CVE-2025-1080", "CVE-2023-2255", "CVE-2023-4863"):
            self.assertIn(expected, found)

    def test_deduplicates_repeated_ids(self):
        markup = "<a>CVE-2025-1080</a> ... <a>CVE-2025-1080</a>"
        found = parse_cve_ids(markup)
        self.assertEqual(found, ["CVE-2025-1080"])

    def test_empty_html_returns_empty_list(self):
        found = parse_cve_ids("<html></html>")
        self.assertEqual(found, [])


class TestParseCveAdvisory(TestCase):
    """Tests for parse_cve_advisory() against stored CVE JSON records."""

    def _parse_fixture(self, filename, cve_id):
        """Load the JSON fixture ``filename`` and parse it as ``cve_id``."""
        return parse_cve_advisory(load_json(filename), cve_id)

    def test_cvss4_and_cwe(self):
        advisory = self._parse_fixture("cve_2025_1080.json", "CVE-2025-1080")
        self.assertIsNotNone(advisory)
        self.assertEqual(advisory.advisory_id, "CVE-2025-1080")
        self.assertEqual(advisory.aliases, [])
        self.assertIn("macro", advisory.summary.lower())
        self.assertEqual(len(advisory.severities), 1)
        severity = advisory.severities[0]
        self.assertEqual(severity.value, "7.2")
        self.assertIn("CVSS:4.0/", severity.scoring_elements)
        self.assertEqual(advisory.weaknesses, [20])
        self.assertIsNotNone(advisory.date_published)
        self.assertIn("cve-2025-1080", advisory.url)

    def test_no_cvss_has_empty_severities(self):
        advisory = self._parse_fixture("cve_2023_2255.json", "CVE-2023-2255")
        self.assertIsNotNone(advisory)
        self.assertEqual(advisory.severities, [])

    def test_cwe_264_extracted(self):
        advisory = self._parse_fixture("cve_2023_2255.json", "CVE-2023-2255")
        self.assertEqual(advisory.weaknesses, [264])

    def test_references_from_cna(self):
        advisory = self._parse_fixture("cve_2023_2255.json", "CVE-2023-2255")
        urls = [reference.url for reference in advisory.references]
        for expected in (
            "https://www.debian.org/security/2023/dsa-5415",
            "https://security.gentoo.org/glsa/202311-15",
        ):
            self.assertIn(expected, urls)

    def test_missing_cve_id_returns_none(self):
        record = {"cveMetadata": {"cveId": ""}, "containers": {}}
        self.assertIsNone(parse_cve_advisory(record, ""))

    def test_original_advisory_text_is_json(self):
        advisory = self._parse_fixture("cve_2025_1080.json", "CVE-2025-1080")
        round_tripped = json.loads(advisory.original_advisory_text)
        self.assertEqual(round_tripped["cveMetadata"]["cveId"], "CVE-2025-1080")

    def test_malformed_cwe_skipped(self):
        # Work on a JSON round-tripped copy so the loaded fixture object is not mutated.
        record = json.loads(json.dumps(load_json("cve_2025_1080.json")))
        record["containers"]["cna"]["problemTypes"] = [
            {"descriptions": [{"cweId": "CWE-INVALID", "lang": "en", "type": "CWE"}]}
        ]
        advisory = parse_cve_advisory(record, "CVE-2025-1080")
        self.assertEqual(advisory.weaknesses, [])


class TestLibreOfficeImporterPipeline(TestCase):
    """Tests for the pipeline methods, with all HTTP access mocked out."""

    # Patch target for the `requests.get` used by the importer module.
    REQUESTS_GET = "vulnerabilities.pipelines.v2_importers.libreoffice_importer.requests.get"

    def _make_resp(self, data, status=200):
        """Return a mock HTTP response whose JSON body is ``data``."""
        resp = MagicMock(text=json.dumps(data), status_code=status)
        resp.json.return_value = data
        resp.raise_for_status.return_value = None
        return resp

    @patch(REQUESTS_GET)
    def test_fetch_stores_cve_ids(self, mock_get):
        listing = load_html("advisories.html")
        mock_get.return_value = MagicMock(text=listing, raise_for_status=MagicMock())
        pipeline = LibreOfficeImporterPipeline()
        pipeline.fetch()
        for expected in ("CVE-2025-1080", "CVE-2023-2255"):
            self.assertIn(expected, pipeline.cve_ids)

    @patch(REQUESTS_GET)
    def test_collect_advisories_yields_advisory(self, mock_get):
        mock_get.return_value = self._make_resp(load_json("cve_2025_1080.json"))
        pipeline = LibreOfficeImporterPipeline()
        pipeline.cve_ids = ["CVE-2025-1080"]
        collected = list(pipeline.collect_advisories())
        self.assertEqual(len(collected), 1)
        self.assertEqual(collected[0].advisory_id, "CVE-2025-1080")

    @patch(REQUESTS_GET)
    def test_collect_advisories_skips_on_http_error(self, mock_get):
        mock_get.side_effect = Exception("timeout")
        pipeline = LibreOfficeImporterPipeline()
        pipeline.cve_ids = ["CVE-2025-1080"]
        logger_name = "vulnerabilities.pipelines.v2_importers.libreoffice_importer"
        with self.assertLogs(logger_name, level="ERROR") as captured:
            collected = list(pipeline.collect_advisories())
        self.assertEqual(collected, [])
        self.assertTrue(any("CVE-2025-1080" in line for line in captured.output))

    def test_advisories_count(self):
        pipeline = LibreOfficeImporterPipeline()
        pipeline.cve_ids = ["CVE-2025-1080", "CVE-2023-2255"]
        self.assertEqual(pipeline.advisories_count(), 2)
17 changes: 17 additions & 0 deletions vulnerabilities/tests/test_data/libreoffice/advisories.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<body>
<h3>Addressed in LibreOffice 24.8.5 and 25.2.1</h3>
<ul>
<li><a href="/about-us/security/advisories/cve-2025-1080/">CVE-2025-1080</a> Macro URL arbitrary script execution</li>
</ul>
<h3>Addressed in LibreOffice 7.4.7 and 7.5.3</h3>
<ul>
<li><a href="/about-us/security/advisories/cve-2023-2255/">CVE-2023-2255</a> Remote documents loaded without prompt via IFrame</li>
</ul>
<h3>Third Party Advisories</h3>
<ul>
<li><a href="/about-us/security/advisories/cve-2023-4863/">CVE-2023-4863</a> libwebp heap buffer overflow</li>
</ul>
</body>
</html>
Loading
Loading