-
Notifications
You must be signed in to change notification settings - Fork 141
Scan compressed pickle artifacts #345
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,8 @@ | ||
| import logging | ||
| import bz2 | ||
| import gzip | ||
| import io | ||
| import lzma | ||
| import pickletools # nosec | ||
| from tarfile import TarError | ||
| from typing import IO, Any, Dict, List, Set, Tuple, Union, Optional | ||
|
|
@@ -15,6 +19,13 @@ | |
|
|
||
| from .utils import MAGIC_NUMBER, _should_read_directly, get_magic_number | ||
|
|
||
| COMPRESSED_PICKLE_SUFFIXES = { | ||
| ".bz2": bz2.decompress, | ||
| ".gz": gzip.decompress, | ||
| ".lzma": lzma.decompress, | ||
| ".xz": lzma.decompress, | ||
| } | ||
|
|
||
|
|
||
| class GenOpsError(Exception): | ||
| def __init__(self, msg: str, globals: Optional[Set[Tuple[str, str]]]): | ||
|
|
@@ -128,8 +139,26 @@ def scan_pickle_bytes( | |
| ) -> ScanResults: | ||
| """Disassemble a Pickle stream and report issues""" | ||
| issues: List[Issue] = [] | ||
| stream = model.get_stream(offset) | ||
| decompress = COMPRESSED_PICKLE_SUFFIXES.get(model.get_source().suffix) | ||
| if decompress is not None: | ||
| try: | ||
| stream = io.BytesIO(decompress(stream.read())) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Self-review: decompression is intentionally placed before |
||
| except Exception as e: | ||
| return ScanResults( | ||
| issues, | ||
| [ | ||
| PickleGenopsError( | ||
| scan_name, | ||
| f"Decompression error: {e}", | ||
| model, | ||
| ) | ||
| ], | ||
| [], | ||
| ) | ||
|
|
||
| try: | ||
| raw_globals = _list_globals(model.get_stream(offset), multiple_pickles) | ||
| raw_globals = _list_globals(stream, multiple_pickles) | ||
| except GenOpsError as e: | ||
| if e.globals is not None: | ||
| return _build_scan_result_from_raw_globals( | ||
|
|
@@ -228,8 +257,22 @@ def scan_numpy(model: Model, settings: Dict[str, Any]) -> ScanResults: | |
| elif magic == np.lib.format.MAGIC_PREFIX: | ||
| # .npy file | ||
| version = np.lib.format.read_magic(stream) # type: ignore[no-untyped-call] | ||
| np.lib.format._check_version(version) # type: ignore[attr-defined] | ||
| _, _, dtype = np.lib.format._read_array_header(stream, version) # type: ignore[attr-defined] | ||
| if version == (1, 0): | ||
| _, _, dtype = np.lib.format.read_array_header_1_0(stream) | ||
| elif version == (2, 0): | ||
| _, _, dtype = np.lib.format.read_array_header_2_0(stream) | ||
| else: | ||
| return ScanResults( | ||
| [], | ||
| [ | ||
| PickleGenopsError( | ||
| scan_name, | ||
| f"Unsupported numpy file format version: {version}", | ||
| model, | ||
| ) | ||
| ], | ||
| [], | ||
| ) | ||
|
|
||
| if dtype.hasobject: | ||
| return scan_pickle_bytes(model, settings, scan_name, True, stream.tell()) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import aiohttp | ||
| import bdb | ||
| import gzip | ||
| import http.client | ||
| import importlib | ||
| import io | ||
|
|
@@ -283,6 +284,10 @@ def file_path(tmp_path_factory: Any) -> Any: | |
| initialize_pickle_file(f"{tmp}/data/malicious8.pkl", Malicious7(), 4) | ||
| initialize_pickle_file(f"{tmp}/data/malicious9.pkl", Malicious8(), 4) | ||
| initialize_pickle_file(f"{tmp}/data/malicious15.pkl", Malicious15(), 4) | ||
| initialize_data_file( | ||
| f"{tmp}/data/malicious16.joblib.gz", | ||
| gzip.compress(pickle.dumps(Malicious2(), protocol=4)), | ||
| ) | ||
|
|
||
| # Malicious Pickle from Capture-the-Flag challenge 'Misc/Safe Pickle' at https://imaginaryctf.org/Challenges | ||
| # GitHub Issue: https://github.com/mmaitre314/picklescan/issues/22 | ||
|
|
@@ -670,6 +675,25 @@ def test_scan_file_path(file_path: Any) -> None: | |
| assert results["summary"]["skipped"]["skipped_files"] == [] | ||
| assert results["errors"] == [] | ||
|
|
||
| compressed_joblib = ModelScan() | ||
| expected_compressed_joblib = { | ||
| Issue( | ||
| IssueCode.UNSAFE_OPERATOR, | ||
| IssueSeverity.CRITICAL, | ||
| OperatorIssueDetails( | ||
| "posix", | ||
| "system", | ||
| IssueSeverity.CRITICAL, | ||
| f"{file_path}/data/malicious16.joblib.gz", | ||
| ), | ||
| ), | ||
| } | ||
| results = compressed_joblib.scan(Path(f"{file_path}/data/malicious16.joblib.gz")) | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Self-review: this regression asserts the formerly skipped |
||
| compare_results(compressed_joblib.issues.all_issues, expected_compressed_joblib) | ||
| assert results["summary"]["scanned"]["scanned_files"] == ["malicious16.joblib.gz"] | ||
| assert results["summary"]["skipped"]["skipped_files"] == [] | ||
| assert results["errors"] == [] | ||
|
|
||
|
|
||
| def test_scan_pickle_operators(file_path: Any) -> None: | ||
| # Tests the unsafe pickle operators we screen for, across differences in pickle versions 0-2, 3, and 4 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Self-review: this uses the outer compound extension to decide whether to decompress before pickle opcode scanning. That directly covers valid
joblib.dump(..., compress=...)artifacts such as.joblib.gz; the tradeoff is that compression policy stays extension-based rather than magic-byte based.