Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b7cb604
fix: recurse into numpy object pickle payloads
mldangelo Mar 13, 2026
407652a
test: type annotate numpy recursion regressions
mldangelo Mar 13, 2026
56782ee
fix(numpy): preserve npz member check context
mldangelo Mar 13, 2026
fa0abda
Merge remote-tracking branch 'refs/remotes/origin/feat/numpy-object-p…
mldangelo Mar 13, 2026
f42f854
test: format asset extraction regressions
mldangelo Mar 13, 2026
d45df12
Merge origin/main into feat/numpy-object-pickle-recursion
mldangelo Mar 14, 2026
432c71e
Merge remote-tracking branch 'origin/feat/numpy-object-pickle-recursi…
mldangelo Mar 14, 2026
d42949a
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 14, 2026
2e16362
Merge remote-tracking branch 'origin/feat/numpy-object-pickle-recursi…
mldangelo Mar 14, 2026
5a114b2
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 14, 2026
95ae02c
test: type annotate numpy trailing-bytes regression
mldangelo Mar 14, 2026
644d00a
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 14, 2026
2098e6c
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 15, 2026
0e141f4
fix: harden numpy recursion follow-up checks
mldangelo Mar 15, 2026
e146683
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 15, 2026
74e1117
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 15, 2026
fb19fc1
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 16, 2026
9535dac
Merge remote-tracking branch 'origin/main' into feat/numpy-object-pic…
mldangelo Mar 16, 2026
81229c0
fix: harden numpy recursion and local streaming
mldangelo Mar 16, 2026
383c543
Merge branch 'main' into review-pr-699
mldangelo Mar 17, 2026
827d186
Merge remote-tracking branch 'origin/main' into audit-pr699-mainmerge
mldangelo Mar 18, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

- **cli:** preserve original local files during `--stream` directory scans instead of unlinking them after analysis
- **security:** recurse into object-dtype `.npy` payloads and `.npz` object members with the pickle scanner while preserving CVE-2019-6446 warnings and archive-member context
- **security:** remove `dill.load` / `dill.loads` from the pickle safe-global allowlist so recursive dill deserializers stay flagged as dangerous loader entry points
- **security:** add exact dangerous helper coverage for validated torch and NumPy refs such as `numpy.f2py.crackfortran.getlincoef`, `torch._dynamo.guards.GuardBuilder.get`, and `torch.utils.collect_env.run`
- **security:** add exact dangerous-global coverage for `numpy.load`, `site.main`, `_io.FileIO`, `test.support.script_helper.assert_python_ok`, `_osx_support._read_output`, `_aix_support._read_cmd_output`, `_pyrepl.pager.pipe_pager`, `torch.serialization.load`, and `torch._inductor.codecache.compile_file` (9 PickleScan-only loader and execution primitives)
Expand Down
5 changes: 3 additions & 2 deletions modelaudit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1683,11 +1683,12 @@ def enhanced_progress_callback(message, percentage):
# Create file iterator
file_generator = iterate_files_streaming(actual_path)

# Scan with streaming mode - propagate all config
# Scan with streaming mode - propagate all config.
# Local files already live on disk, so preserve the originals.
streaming_result = scan_model_streaming(
file_generator=file_generator,
timeout=final_timeout,
delete_after_scan=True, # Delete files after scanning in streaming mode
delete_after_scan=False,
progress_callback=progress_callback,
blacklist_patterns=list(blacklist) if blacklist else None,
max_file_size=final_max_file_size,
Expand Down
77 changes: 42 additions & 35 deletions modelaudit/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,35 @@

logger = logging.getLogger("modelaudit.core")

OPERATIONAL_ERROR_INDICATORS = (
"Error during scan",
"Error checking file size",
"Error scanning file",
"Scanner crashed",
"Scan timeout",
"Path does not exist",
"Path is not readable",
"Permission denied",
"File not found",
"not installed, cannot scan",
"Missing dependency",
"Import error",
"Module not found",
"not a valid",
"Invalid file format",
"Corrupted file",
"Bad file signature",
"Unable to parse",
"Out of memory",
"Disk space",
"Too many open files",
)


def _has_operational_error_message(message: Any) -> bool:
"""Return True when an issue message reflects an operational scan failure."""
return isinstance(message, str) and any(indicator in message for indicator in OPERATIONAL_ERROR_INDICATORS)


def _to_telemetry_severity(severity: Any) -> str:
"""Normalize severity values to stable telemetry strings."""
Expand Down Expand Up @@ -272,8 +301,12 @@ def _group_checks_by_asset(checks_list: list[Any]) -> dict[tuple[str, str], list
check_name = check.get("name", "Unknown Check")
location = check.get("location", "")
primary_asset = _extract_primary_asset_from_location(location)
details = check.get("details")
zip_entry = details.get("zip_entry") if isinstance(details, dict) else None

group_key = (check_name, primary_asset)
asset_group = f"{primary_asset}:{zip_entry}" if isinstance(zip_entry, str) and zip_entry else primary_asset

group_key = (check_name, asset_group)
check_groups[group_key].append(check)

return check_groups
Expand Down Expand Up @@ -1029,39 +1062,10 @@ def scan_model_directory_or_file(
# Determine if there were operational scan errors vs security findings
# has_errors should only be True for operational errors (scanner crashes,
# file not found, etc.) not for security findings detected in models
operational_error_indicators = [
# Scanner execution errors
"Error during scan",
"Error checking file size",
"Error scanning file",
"Scanner crashed",
"Scan timeout",
# File system errors
"Path does not exist",
"Path is not readable",
"Permission denied",
"File not found",
# Dependency/environment errors
"not installed, cannot scan",
"Missing dependency",
"Import error",
"Module not found",
# File format/corruption errors
"not a valid",
"Invalid file format",
"Corrupted file",
"Bad file signature",
"Unable to parse",
# Resource/system errors
"Out of memory",
"Disk space",
"Too many open files",
]

# Check for operational errors in issues
results.has_errors = (
any(
any(indicator in issue.message for indicator in operational_error_indicators)
_has_operational_error_message(issue.message)
for issue in results.issues
if issue.severity in {IssueSeverity.WARNING, IssueSeverity.CRITICAL}
)
Expand Down Expand Up @@ -1591,6 +1595,9 @@ def scan_model_streaming(
if scan_result:
metadata_dict = dict(scan_result.metadata or {})
metadata_dict.setdefault("file_size", file_path.stat().st_size)
operational_scan_failure = any(
_has_operational_error_message(issue.message) for issue in (scan_result.issues or [])
)

existing_hashes = metadata_dict.get("file_hashes")
if isinstance(existing_hashes, dict):
Expand All @@ -1602,10 +1609,10 @@ def scan_model_streaming(
scan_result_dict = {
"bytes_scanned": scan_result.bytes_scanned,
"files_scanned": 1, # Each scan_result represents one file
# ScanResult.has_errors means "critical findings", but
# ModelAuditResultModel.has_errors is reserved for
# operational scan failures.
"has_errors": not scan_result.success,
# Preserve the main scan semantics: success=False does not
# imply an operational error when the scanner completed
# and only reported informational integrity findings.
"has_errors": operational_scan_failure,
"success": scan_result.success,
"issues": [issue.__dict__ for issue in (scan_result.issues or [])],
"checks": [check.__dict__ for check in (scan_result.checks or [])],
Expand Down
71 changes: 69 additions & 2 deletions modelaudit/scanners/numpy_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

import sys
import warnings
from typing import TYPE_CHECKING, Any, ClassVar
from typing import TYPE_CHECKING, Any, BinaryIO, ClassVar

from .base import BaseScanner, IssueSeverity, ScanResult
from .pickle_scanner import PickleScanner

# Import NumPy with compatibility handling
try:
Expand Down Expand Up @@ -88,6 +89,17 @@ def _validate_array_dimensions(self, shape: tuple[int, ...]) -> None:
CVE_2019_6446_CVSS = 9.8
CVE_2019_6446_CWE = "CWE-502"

def _scan_embedded_pickle_payload(
    self,
    file_obj: BinaryIO,
    payload_size: int,
    context_path: str,
) -> ScanResult:
    """Delegate analysis of an object-dtype NumPy payload to the pickle scanner.

    Object arrays are serialized as a pickle stream, so the dedicated
    pickle scanner performs the actual opcode analysis; ``context_path``
    is attached so findings report the NumPy file, not a temp handle.
    """
    delegate = PickleScanner(config=self.config)
    delegate.current_file_path = context_path
    # NOTE(review): relies on the pickle scanner's private byte-stream
    # entry point; keep in sync with PickleScanner._scan_pickle_bytes.
    return delegate._scan_pickle_bytes(file_obj, payload_size)

def _validate_dtype(self, dtype: Any) -> None:
"""Validate numpy dtype for security"""
# Check for problematic data types
Expand Down Expand Up @@ -256,7 +268,8 @@ def scan(self, path: str) -> ScanResult:
# enabling arbitrary code execution.
# dtype.hasobject catches structured dtypes with
# object fields; kind=="O" catches plain object arrays.
if dtype.kind == "O" or bool(getattr(dtype, "hasobject", False)):
has_object_dtype = dtype.kind == "O" or bool(getattr(dtype, "hasobject", False))
if has_object_dtype:
result.add_check(
name=f"{self.CVE_2019_6446_ID}: Object Dtype Pickle Deserialization",
passed=False,
Expand Down Expand Up @@ -299,6 +312,60 @@ def scan(self, path: str) -> ScanResult:
),
)

f.seek(data_offset)
embedded_result = self._scan_embedded_pickle_payload(
f,
file_size - data_offset,
path,
)
result.issues.extend(embedded_result.issues)
result.checks.extend(embedded_result.checks)

pickle_end_offset = embedded_result.metadata.get("first_pickle_end_pos")
if isinstance(pickle_end_offset, int) and pickle_end_offset < file_size:
trailing_bytes = file_size - pickle_end_offset
result.add_check(
name="File Integrity Check",
passed=False,
message=(
"Object-dtype payload contains trailing bytes after the embedded pickle stream"
),
severity=IssueSeverity.INFO,
location=path,
rule_code="S902",
details={
"expected_pickle_end": pickle_end_offset,
"actual_size": file_size,
"trailing_bytes": trailing_bytes,
"dtype": str(dtype),
},
)
result.finish(success=False)
return result

# Object-dtype .npy payloads are stored as a pickle stream rather than
# fixed-width element data, so the numeric dtype/size validation path
# is not applicable after we recurse into the embedded pickle payload.
result.add_check(
name="Data Type Safety Check",
passed=True,
message=f"Object dtype '{dtype}' handled via recursive pickle analysis",
location=path,
rule_code=None,
details={
"dtype": str(dtype),
"dtype_kind": dtype.kind,
"handled_via": "embedded_pickle_scan",
"cve_id": self.CVE_2019_6446_ID,
},
)
result.bytes_scanned = file_size
result.metadata.update(
{"shape": shape, "dtype": str(dtype), "fortran_order": fortran},
)
result.finish(success=True)
return result

self._validate_dtype(dtype)
result.add_check(
name="Data Type Safety Check",
Expand Down
12 changes: 8 additions & 4 deletions modelaudit/scanners/pickle_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4428,12 +4428,14 @@ def _scan_pickle_bytes(self, file_obj: BinaryIO, file_size: int) -> ScanResult:
suspicious_count = 0

# For large files, use chunked reading to avoid memory issues
MAX_MEMORY_READ = 50 * 1024 * 1024 # 50MB max in memory at once
MAX_MEMORY_READ = 10 * 1024 * 1024 # 10MB max in memory at once

current_pos = file_obj.tell()

# Read file data - either all at once for small files or first chunk for large files
# For large files, read first 50MB for pattern analysis (critical malicious code is usually at the beginning)
# Read file data - either all at once for small files or first chunk for large files.
# For large files, read only the first 10MB for pattern analysis to cap
# embedded-pickle memory usage while still inspecting the most security-
# relevant prefix.
file_data = file_obj.read() if file_size <= MAX_MEMORY_READ else file_obj.read(MAX_MEMORY_READ)

file_obj.seek(current_pos) # Reset position
Expand Down Expand Up @@ -4629,7 +4631,9 @@ def _scan_pickle_bytes(self, file_obj: BinaryIO, file_size: int) -> ScanResult:
elif opcode.name == "STOP":
current_stack_depth = 0
if first_pickle_end_pos is None:
first_pickle_end_pos = start_pos + pos + 1
# pickletools reports absolute positions even when parsing
# starts from a non-zero file offset.
first_pickle_end_pos = pos + 1

# Store stack depth warnings for ML-context-aware processing later
if current_stack_depth > base_stack_depth_limit:
Expand Down
70 changes: 40 additions & 30 deletions modelaudit/scanners/zip_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,44 @@ def scan(self, path: str) -> ScanResult:
result.metadata["file_size"] = os.path.getsize(path)
return result

def _rewrite_nested_result_context(
self, scan_result: ScanResult, tmp_path: str, archive_path: str, entry_name: str
) -> None:
"""Rewrite nested result locations so archive members, not temp files, are reported."""
archive_location = f"{archive_path}:{entry_name}"

for issue in scan_result.issues:
if issue.location:
if issue.location.startswith(tmp_path):
issue.location = issue.location.replace(tmp_path, archive_location, 1)
else:
issue.location = f"{archive_location} {issue.location}"
else:
issue.location = archive_location

existing_issue_entry = issue.details.get("zip_entry")
issue.details["zip_entry"] = (
f"{entry_name}:{existing_issue_entry}"
if isinstance(existing_issue_entry, str) and existing_issue_entry
else entry_name
)

for check in scan_result.checks:
if check.location:
if check.location.startswith(tmp_path):
check.location = check.location.replace(tmp_path, archive_location, 1)
else:
check.location = f"{archive_location} {check.location}"
else:
check.location = archive_location

existing_check_entry = check.details.get("zip_entry")
check.details["zip_entry"] = (
f"{entry_name}:{existing_check_entry}"
if isinstance(existing_check_entry, str) and existing_check_entry
else entry_name
)

def _scan_zip_file(self, path: str, depth: int = 0) -> ScanResult:
"""Recursively scan a ZIP file and its contents"""
result = ScanResult(scanner_name=self.name)
Expand Down Expand Up @@ -319,16 +357,7 @@ def _scan_zip_file(self, path: str, depth: int = 0) -> ScanResult:
if name.lower().endswith(".zip"):
try:
nested_result = self._scan_zip_file(tmp_path, depth + 1)
# Update locations in nested results
for issue in nested_result.issues:
if issue.location and issue.location.startswith(
tmp_path,
):
issue.location = issue.location.replace(
tmp_path,
f"{path}:{name}",
1,
)
self._rewrite_nested_result_context(nested_result, tmp_path, path, name)
result.merge(nested_result)

asset_entry = asset_from_scan_result(
Expand All @@ -355,26 +384,7 @@ def _scan_zip_file(self, path: str, depth: int = 0) -> ScanResult:

# Use core.scan_file to scan with appropriate scanner
file_result = core.scan_file(tmp_path, self.config)

# Update locations in file results
for issue in file_result.issues:
if issue.location:
if issue.location.startswith(tmp_path):
issue.location = issue.location.replace(
tmp_path,
f"{path}:{name}",
1,
)
else:
issue.location = f"{path}:{name} {issue.location}"
else:
issue.location = f"{path}:{name}"

# Add zip entry name to details
if issue.details:
issue.details["zip_entry"] = name
else:
issue.details = {"zip_entry": name}
self._rewrite_nested_result_context(file_result, tmp_path, path, name)

result.merge(file_result)

Expand Down
Loading
Loading