Skip to content

Commit 4daa784

Browse files
authored
feat(heuristics): add SimilarProjectAnalyzer to detect structural similarity across packages from same maintainer (#1089)
This PR adds a new heuristic analyzer called SimilarProjectAnalyzer. It checks whether a PyPI package has a similar file/folder structure to other packages maintained by the same user. This helps in identifying potentially malicious packages that replicate existing structures. Signed-off-by: Amine <amine.raouane@enim.ac.ma>
1 parent c5436d8 commit 4daa784

File tree

5 files changed

+310
-1
lines changed

5 files changed

+310
-1
lines changed

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ class Heuristics(str, Enum):
4646
#: Indicates that the package maintainer's email address is suspicious or invalid.
4747
FAKE_EMAIL = "fake_email"
4848

49+
#: Indicates that the package has a similar structure to other packages maintained by the same user.
50+
SIMILAR_PROJECTS = "similar_projects"
51+
4952

5053
class HeuristicResult(str, Enum):
5154
"""Result type indicating the outcome of a heuristic."""
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if the package has a similar structure to other packages maintained by the same user."""
5+
6+
import hashlib
7+
import io
8+
import logging
9+
import tarfile
10+
11+
from macaron.json_tools import JsonType
12+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
13+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
14+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
15+
from macaron.util import send_get_http, send_get_http_raw
16+
17+
logger: logging.Logger = logging.getLogger(__name__)
18+
19+
20+
class SimilarProjectAnalyzer(BaseHeuristicAnalyzer):
21+
"""Check whether the package has a similar structure to other packages maintained by the same user."""
22+
23+
def __init__(self) -> None:
    """Register this analyzer for the SIMILAR_PROJECTS heuristic, with no dependency on other heuristics."""
    analyzer_name = "similar_project_analyzer"
    super().__init__(name=analyzer_name, heuristic=Heuristics.SIMILAR_PROJECTS, depends_on=None)
29+
30+
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package.

    The structure hash of the target package is compared with the structure hash of every other
    package listed for each of its maintainers.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis: ``SKIP`` when no
        structure hash can be computed for the target package, ``FAIL`` (with the name of the
        matching package) when another package of a maintainer has the same structure hash,
        and ``PASS`` otherwise.

    Raises
    ------
    HeuristicAnalyzerValueError
        if the analysis fails.
        NOTE(review): this method does not raise it directly; presumably it can propagate from
        the registry helpers -- confirm.
    """
    package_name = pypi_package_json.component_name
    target_hash = self.get_structure_hash(package_name)
    if not target_hash:
        return HeuristicResult.SKIP, {}

    registry = pypi_package_json.pypi_registry

    # Seeded with the target so it is never compared with itself. Packages shared by several
    # maintainers are recorded here so their sdist is fetched and hashed only once.
    already_compared: set[str] = {package_name}

    for maintainer in registry.get_maintainers_of_package(package_name) or []:
        for package in registry.get_packages_by_username(maintainer) or []:
            if package in already_compared:
                continue
            already_compared.add(package)

            if self.get_structure_hash(package) == target_hash:
                return HeuristicResult.FAIL, {
                    "message": f"The package {package_name} has a similar structure to {package}.",
                    "similar_package": package,
                }

    return HeuristicResult.PASS, {}
71+
72+
def get_url(self, package_name: str, package_type: str = "sdist") -> str | None:
    """Get the download URL of one of the package's distribution files.

    Parameters
    ----------
    package_name : str
        The name of the package.
    package_type: str
        The package type to retrieve the URL of (defaults to ``"sdist"``).

    Returns
    -------
    str | None:
        The URL of the first distribution file of the requested type, or None if the package
        metadata cannot be fetched or contains no such file.
    """
    json_url = f"https://pypi.org/pypi/{package_name}/json"
    data = send_get_http(json_url, headers={})
    if not data:
        logger.debug("Failed to fetch package data for %s.", package_name)
        return None

    # Use tolerant lookups: a payload without "urls", or an entry without "packagetype",
    # must lead to "not found" instead of an uncaught KeyError.
    release_files = data.get("urls") if isinstance(data, dict) else None
    for release_file in release_files or []:
        if not isinstance(release_file, dict):
            continue
        if release_file.get("packagetype") == package_type and release_file.get("url"):
            return release_file["url"]

    return None
95+
96+
def get_structure(self, package_name: str) -> list[str]:
    """Get the file structure of the package's sdist.

    Parameters
    ----------
    package_name : str
        The name of the package.

    Returns
    -------
    list[str]:
        The list of member names in the package's sdist. The list is empty when the package has
        no sdist, the download fails, or the downloaded file cannot be read as a tar archive.
    """
    sdist_url = self.get_url(package_name)
    if not sdist_url:
        logger.debug("Package %s does not have a sdist.", package_name)
        return []

    response = send_get_http_raw(sdist_url)
    if not response:
        logger.debug("Failed to download sdist for package %s.", package_name)
        return []

    try:
        # "r:*" lets tarfile detect the compression itself, so .tar.bz2/.tar.xz sdists are read
        # in addition to .tar.gz.
        with tarfile.open(fileobj=io.BytesIO(response.content), mode="r:*") as archive:
            return [name for name in archive.getnames() if name and not name.startswith("PAXHeaders/")]
    except (tarfile.TarError, OSError, EOFError) as error:
        # Zip sdists and truncated or corrupt archives end up here; treat them like a failed
        # download so a single unreadable package does not abort the analysis.
        logger.debug("Failed to read the sdist of package %s as a tar archive: %s", package_name, error)
        return []
126+
127+
def get_structure_hash(self, package_name: str) -> str:
    """Get the hash of the package's file structure.

    Before hashing, the paths are made independent of the package's own name and version:
    the ``<name>-<version>`` top-level directory becomes ``<ROOT>`` and every other occurrence
    of the package name becomes ``<ROOT>``. The name is matched case-insensitively and with
    runs of ``-``, ``_`` and ``.`` treated as equivalent, so a project called ``my-pkg`` also
    matches its ``my_pkg`` directories.

    Parameters
    ----------
    package_name : str
        The name of the package.

    Returns
    -------
    str:
        The SHA-256 hex digest of the normalized file structure, or an empty string when no
        structure is available.
    """
    import re  # pylint: disable=import-outside-toplevel

    structure = self.get_structure(package_name)
    if not structure:
        return ""

    name_parts = [re.escape(part) for part in re.split(r"[-_.]+", package_name) if part]
    if name_parts:
        name_regex = "[-_.]+".join(name_parts)
        name_pattern = re.compile(name_regex, re.IGNORECASE)
        # The sdist root directory is "<name>-<version>"; the version must not influence the hash.
        root_pattern = re.compile(f"(?:{name_regex})-[^/]+", re.IGNORECASE)

        normalized = []
        for path in structure:
            root, separator, rest = path.partition("/")
            if root_pattern.fullmatch(root):
                root = "<ROOT>"
            else:
                root = name_pattern.sub("<ROOT>", root)
            normalized.append(f"{root}{separator}{name_pattern.sub('<ROOT>', rest)}")
        normalized.sort()
    else:
        # No usable name to normalize against; hash the paths as they are.
        normalized = sorted(structure)

    joined = "\n".join(normalized).encode("utf-8")
    return hashlib.sha256(joined).hexdigest()

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from macaron.malware_analyzer.pypi_heuristics.metadata.fake_email import FakeEmailAnalyzer
2424
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
2525
from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
26+
from macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects import SimilarProjectAnalyzer
2627
from macaron.malware_analyzer.pypi_heuristics.metadata.source_code_repo import SourceCodeRepoAnalyzer
2728
from macaron.malware_analyzer.pypi_heuristics.metadata.typosquatting_presence import TyposquattingPresenceAnalyzer
2829
from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer
@@ -364,6 +365,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
364365
AnomalousVersionAnalyzer,
365366
TyposquattingPresenceAnalyzer,
366367
FakeEmailAnalyzer,
368+
SimilarProjectAnalyzer,
367369
]
368370

369371
# name used to query the result of all problog rules, so it can be accessed outside the model.
@@ -434,7 +436,9 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
434436
% Package released recently with a maintainer email address that is not valid and a structure similar to another package by the same maintainer.
435437
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_3) :-
436438
quickUndetailed,
437-
failed({Heuristics.FAKE_EMAIL.value}).
439+
failed({Heuristics.FAKE_EMAIL.value}),
440+
failed({Heuristics.SIMILAR_PROJECTS.value}).
441+
438442
% ----- Evaluation -----
439443
440444
% Aggregate result

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,27 @@ def get_maintainer_profile_page(self, username: str) -> str | None:
369369
return html_snippets
370370
return None
371371

372+
def get_packages_by_username(self, username: str) -> list[str] | None:
373+
"""Implement custom API to get the maintainer's packages.
374+
375+
Parameters
376+
----------
377+
username: str
378+
The maintainer's username.
379+
380+
Returns
381+
-------
382+
list[str]: A list of package names.
383+
"""
384+
user_page: str | None = self.get_maintainer_profile_page(username)
385+
if user_page is None:
386+
return None
387+
388+
soup = BeautifulSoup(user_page, "html.parser")
389+
headers = soup.find_all("h3", class_="package-snippet__title")
390+
packages = list({header.get_text(strip=True) for header in headers})
391+
return packages
392+
372393
def get_maintainer_join_date(self, username: str) -> datetime | None:
373394
"""Implement custom API to get the maintainer's join date.
374395
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Tests for the SimilarProjectAnalyzer heuristic."""
5+
# pylint: disable=redefined-outer-name
6+
7+
from unittest.mock import MagicMock, patch
8+
9+
import pytest
10+
11+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
12+
from macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects import SimilarProjectAnalyzer
13+
14+
15+
@pytest.fixture()
def analyzer() -> SimilarProjectAnalyzer:
    """Provide a fresh SimilarProjectAnalyzer instance to each test."""
    return SimilarProjectAnalyzer()
20+
21+
22+
def test_analyze_skip_no_target_hash(analyzer: SimilarProjectAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that the analysis is skipped when no structure hash is available for the target package."""
    pypi_package_json.component_name = "test_package"
    hash_target = (
        "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.SimilarProjectAnalyzer.get_structure_hash"
    )
    with patch(hash_target, return_value=""):
        outcome = analyzer.analyze(pypi_package_json)

    # An empty hash for the target must short-circuit to SKIP with no details.
    assert outcome == (HeuristicResult.SKIP, {})
32+
33+
34+
def test_analyze_fail_similar_project_found(analyzer: SimilarProjectAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that the analysis fails when another package of the maintainer has the same structure hash."""
    registry = pypi_package_json.pypi_registry
    pypi_package_json.component_name = "test_package"
    registry.get_maintainers_of_package.return_value = ["user1"]
    registry.get_packages_by_username.return_value = ["similar_package"]

    hash_target = (
        "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.SimilarProjectAnalyzer.get_structure_hash"
    )
    # Every package hashes to the same value, so the first other package is a match.
    with patch(hash_target, return_value="same_hash"):
        result, info = analyzer.analyze(pypi_package_json)

    assert result == HeuristicResult.FAIL
    assert info["similar_package"] == "similar_package"
48+
49+
50+
def test_analyze_fail_when_package_compares_to_itself(
    analyzer: SimilarProjectAnalyzer, pypi_package_json: MagicMock
) -> None:
    """Test the analyzer passes when the maintainer's package list contains the target package itself.

    The target package must be skipped instead of being compared against its own hash.
    NOTE(review): the function name says "fail" although the expected result is PASS; it is kept
    unchanged here so existing test selections keep working.
    """
    pypi_package_json.component_name = "test_package"
    pypi_package_json.pypi_registry.get_maintainers_of_package.return_value = ["user1"]
    pypi_package_json.pypi_registry.get_packages_by_username.return_value = ["test_package", "other_package"]

    with patch(
        "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.SimilarProjectAnalyzer.get_structure_hash"
    ) as mock_get_structure_hash:
        mock_get_structure_hash.side_effect = ["hash1", "hash2"]  # test_package, other_package
        result, _ = analyzer.analyze(pypi_package_json)

    assert result == HeuristicResult.PASS
    # The target is hashed once up front and must not be hashed again as a comparison candidate.
    hashed_packages = [recorded.args for recorded in mock_get_structure_hash.call_args_list]
    assert hashed_packages == [("test_package",), ("other_package",)]
65+
66+
67+
def test_analyze_pass_no_similar_hash(analyzer: SimilarProjectAnalyzer, pypi_package_json: MagicMock) -> None:
    """Check that the analysis passes when no other package of the maintainer shares the structure hash."""
    registry = pypi_package_json.pypi_registry
    pypi_package_json.component_name = "test_package"
    registry.get_maintainers_of_package.return_value = ["user1"]
    registry.get_packages_by_username.return_value = ["other_package"]

    hash_target = (
        "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.SimilarProjectAnalyzer.get_structure_hash"
    )
    # First value is returned for the target package, second for the other package.
    with patch(hash_target, side_effect=["hash1", "hash2"]):
        result, _ = analyzer.analyze(pypi_package_json)

    assert result == HeuristicResult.PASS
80+
81+
82+
def test_get_url_success(analyzer: SimilarProjectAnalyzer) -> None:
    """Check that get_url returns the sdist URL when the metadata lists both a wheel and an sdist."""
    wheel_entry = {"packagetype": "bdist_wheel", "url": "http://example.com/wheel.whl"}
    sdist_entry = {"packagetype": "sdist", "url": "http://example.com/sdist.tar.gz"}
    payload = {"urls": [wheel_entry, sdist_entry]}

    http_target = "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.send_get_http"
    with patch(http_target, return_value=payload):
        assert analyzer.get_url("test_package") == "http://example.com/sdist.tar.gz"
94+
95+
96+
def test_get_url_no_sdist(analyzer: SimilarProjectAnalyzer) -> None:
    """Check that get_url returns None when the metadata lists only a wheel."""
    payload = {"urls": [{"packagetype": "bdist_wheel", "url": "http://example.com/wheel.whl"}]}

    http_target = "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.send_get_http"
    with patch(http_target, return_value=payload):
        found_url = analyzer.get_url("test_package")

    assert found_url is None
103+
104+
105+
def test_get_url_request_fails(analyzer: SimilarProjectAnalyzer) -> None:
    """Check that get_url returns None when the metadata request yields nothing."""
    http_target = "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.send_get_http"
    with patch(http_target, return_value=None):
        found_url = analyzer.get_url("test_package")

    assert found_url is None
111+
112+
113+
def test_get_structure_hash(analyzer: SimilarProjectAnalyzer) -> None:
    """Check that get_structure_hash returns a SHA-256 hex digest that ignores the package name and file order."""
    first_layout = [
        "test_package-1.0/setup.py",
        "test_package-1.0/test_package/__init__.py",
    ]
    # Same layout under a different package name, listed in a different order.
    second_layout = [
        "other_package-1.0/other_package/__init__.py",
        "other_package-1.0/setup.py",
    ]

    structure_target = (
        "macaron.malware_analyzer.pypi_heuristics.metadata.similar_projects.SimilarProjectAnalyzer.get_structure"
    )
    with patch(structure_target) as mock_get_structure:
        mock_get_structure.return_value = first_layout
        first_hash = analyzer.get_structure_hash("test_package")

        mock_get_structure.return_value = second_layout
        second_hash = analyzer.get_structure_hash("other_package")

    assert isinstance(first_hash, str)
    assert len(first_hash) == 64
    assert first_hash == second_hash

0 commit comments

Comments
 (0)