Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion modelaudit/scanners/pickle_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6882,14 +6882,31 @@ def extract_metadata(self, file_path: str) -> dict[str, Any]:
metadata = super().extract_metadata(file_path)

allow_deserialization = bool(self.config.get("allow_metadata_deserialization"))
max_metadata_read_size = int(self.config.get("max_metadata_pickle_read_size", 10 * 1024 * 1024))

try:
import pickle
import pickletools
from io import BytesIO

if max_metadata_read_size <= 0:
raise ValueError(
f"Invalid pickle metadata read limit: {max_metadata_read_size} (must be greater than 0)"
)
Comment on lines +6885 to +6895
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Enforce a hard 10 MiB upper bound for max_metadata_pickle_read_size.

The new validation rejects non-positive values, but any very large positive config still weakens the metadata DoS protection.

🔧 Proposed fix
             if max_metadata_read_size <= 0:
                 raise ValueError(
                     f"Invalid pickle metadata read limit: {max_metadata_read_size} (must be greater than 0)"
                 )
+            hard_cap = 10 * 1024 * 1024
+            if max_metadata_read_size > hard_cap:
+                raise ValueError(
+                    f"Invalid pickle metadata read limit: {max_metadata_read_size} (must be <= {hard_cap})"
+                )
As per coding guidelines "Cap archive member reads to 10 MB for metadata validation to prevent memory spikes on large pickles".
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@modelaudit/scanners/pickle_scanner.py` around lines 6885 - 6895, The code
reads max_metadata_read_size from self.config but only rejects non-positive
values and allows arbitrarily large sizes; change the logic around
max_metadata_read_size in pickle_scanner.py (the variable
max_metadata_read_size, the config.get call) so that after parsing you enforce a
hard upper bound of 10 * 1024 * 1024 (10 MiB) — e.g., validate > 0 then apply
max_metadata_read_size = min(max_metadata_read_size, 10 * 1024 * 1024) (or
replace oversized values with the cap) to ensure metadata reads cannot exceed 10
MiB.


file_size = self.get_file_size(file_path)
if file_size > max_metadata_read_size:
raise ValueError(
f"Pickle metadata read limit exceeded: {file_size} bytes (max: {max_metadata_read_size})"
)

with open(file_path, "rb") as f:
pickle_data = f.read()
pickle_data = f.read(max_metadata_read_size + 1)

if len(pickle_data) > max_metadata_read_size:
raise ValueError(
f"Pickle metadata read limit exceeded: {len(pickle_data)} bytes (max: {max_metadata_read_size})"
)

# Analyze pickle structure
metadata.update(
Expand Down
21 changes: 21 additions & 0 deletions tests/test_metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,27 @@ def __reduce__(self):
assert "REDUCE" in metadata.get("dangerous_opcodes", [])
assert metadata.get("has_dangerous_opcodes") is True

@pytest.mark.parametrize(
    ("limit", "expected_error"),
    [
        (64, "read limit exceeded"),
        (0, "must be greater than 0"),
        (-1, "must be greater than 0"),
        # Success path: limit comfortably above the 128-byte fixture so a
        # valid configuration is proven to extract without error.
        (256, None),
    ],
)
def test_pickle_metadata_enforces_read_limit(
    self, tmp_path: Path, limit: int, expected_error: str | None
) -> None:
    """Ensure pickle metadata extraction enforces the configured read limit.

    Covers three failure modes — a file larger than the limit, a zero
    limit, and a negative limit — plus one success case where the file
    fits within the limit and no ``extraction_error`` is reported, so an
    over-restrictive regression would be caught.
    """
    from modelaudit.scanners.pickle_scanner import PickleScanner

    pkl_file = tmp_path / "oversized.pkl"
    pkl_file.write_bytes(b"x" * 128)

    scanner = PickleScanner({"max_metadata_pickle_read_size": limit})
    metadata = scanner.extract_metadata(str(pkl_file))

    if expected_error is None:
        # A limit above the file size must not surface any error.
        assert "extraction_error" not in metadata
    else:
        assert "extraction_error" in metadata
        assert expected_error in metadata["extraction_error"]

Comment on lines +469 to +489
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Add a success-path case to this limit matrix.

This parametrization only checks failing paths. Add one valid limit case (e.g., 256) and assert metadata extraction succeeds without extraction_error to prevent over-restrictive regressions.

Proposed test refinement
 @pytest.mark.parametrize(
-    ("limit", "expected_error"),
+    ("limit", "expected_error"),
     [
         (64, "read limit exceeded"),
         (0, "must be greater than 0"),
         (-1, "must be greater than 0"),
+        (256, None),
     ],
 )
-def test_pickle_metadata_enforces_read_limit(self, tmp_path: Path, limit: int, expected_error: str) -> None:
+def test_pickle_metadata_enforces_read_limit(
+    self, tmp_path: Path, limit: int, expected_error: str | None
+) -> None:
@@
-    assert "extraction_error" in metadata
-    assert expected_error in metadata["extraction_error"]
+    if expected_error is None:
+        assert "extraction_error" not in metadata
+        assert metadata.get("pickle_size") == 128
+    else:
+        assert "extraction_error" in metadata
+        assert expected_error in metadata["extraction_error"]
Based on learnings "Preserve or strengthen security detections; test both benign and malicious samples when adding scanner/feature changes".
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@tests/test_metadata_extractor.py` around lines 469 - 489, The parametrized
test test_pickle_metadata_enforces_read_limit currently only asserts failure
cases; add a valid success case (e.g., limit 256) to the ("limit",
"expected_error") matrix and update assertions so that when limit is the success
value the test asserts that PickleScanner({"max_metadata_pickle_read_size":
limit}).extract_metadata(...) does not contain "extraction_error" (i.e., assert
"extraction_error" not in metadata), using the same tmp_path pkl_file setup and
referencing PickleScanner and the max_metadata_pickle_read_size config key.

def test_pickle_safe_data_no_dangerous_opcodes(self, tmp_path: Path) -> None:
"""Ensure simple data structures don't trigger dangerous opcode detection."""
from modelaudit.scanners.pickle_scanner import PickleScanner
Expand Down
Loading