From da0125e9547646927668938bb49c39bb1d4b5353 Mon Sep 17 00:00:00 2001 From: Yash Chhabria Date: Mon, 16 Mar 2026 11:08:09 -0700 Subject: [PATCH 1/4] fix: accept executorch flatbuffer binaries Recognize valid ExecuTorch FlatBuffers programs in .pte files, prevent file-type validation noise for those binaries, and add regression coverage for scanner and detection helpers. Co-Authored-By: Codex --- CHANGELOG.md | 4 ++++ modelaudit/scanners/executorch_scanner.py | 19 ++++++++++++++++++- modelaudit/utils/file/detection.py | 10 +++++++++- tests/conftest.py | 1 + tests/scanners/test_executorch_scanner.py | 11 +++++++++++ tests/utils/file/test_filetype.py | 7 +++++++ 6 files changed, 50 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44ce0e28..7b296120 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,6 +81,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- eliminate false positives for valid ExecuTorch FlatBuffers binaries and file-type validation on public `.pte` models + +### Fixed + - **security:** remove `dill.load` / `dill.loads` from the pickle safe-global allowlist so recursive dill deserializers stay flagged as dangerous loader entry points - **security:** add exact dangerous helper coverage for validated torch and NumPy refs such as `numpy.f2py.crackfortran.getlincoef`, `torch._dynamo.guards.GuardBuilder.get`, and `torch.utils.collect_env.run` - **security:** add exact dangerous-global coverage for `numpy.load`, `site.main`, `_io.FileIO`, `test.support.script_helper.assert_python_ok`, `_osx_support._read_output`, `_aix_support._read_cmd_output`, `_pyrepl.pager.pipe_pager`, `torch.serialization.load`, and `torch._inductor.codecache.compile_file` (9 PickleScan-only loader and execution primitives) diff --git a/modelaudit/scanners/executorch_scanner.py b/modelaudit/scanners/executorch_scanner.py index 102369ea..154945e5 100644 --- a/modelaudit/scanners/executorch_scanner.py +++ b/modelaudit/scanners/executorch_scanner.py @@ -37,6 +37,11 @@ def _read_header(path: str, length: int = 4) -> bytes: except Exception: return b"" + @staticmethod + def _is_executorch_binary(header: bytes) -> bool: + # Real-world .pte files use a FlatBuffers-style file identifier at bytes 4..7. + return len(header) >= 8 and header[4:6] == b"ET" + def scan(self, path: str) -> ScanResult: path_check_result = self._check_path(path) if path_check_result: @@ -50,7 +55,19 @@ def scan(self, path: str) -> ScanResult: file_size = self.get_file_size(path) result.metadata["file_size"] = file_size - header = self._read_header(path) + header = self._read_header(path, length=8) + if self._is_executorch_binary(header): + result.add_check( + name="ExecuTorch Binary Format Validation", + passed=True, + message="Valid ExecuTorch binary program format detected", + location=path, + details={"path": path, "format": "executorch_binary"}, + ) + result.bytes_scanned = file_size + result.finish(success=True) + return result + if not header.startswith(b"PK"): result.add_check( name="ExecuTorch Archive Format Validation", diff --git a/modelaudit/utils/file/detection.py b/modelaudit/utils/file/detection.py index f4ffd1f6..40bfb6c5 100644 --- a/modelaudit/utils/file/detection.py +++ b/modelaudit/utils/file/detection.py @@ -189,6 +189,11 @@ def _is_lightgbm_signature(prefix: bytes) -> bool: return (starts_with_tree or "tree=" in preview) and header_hits >= 3 and tree_hits >= 2 and not xgboost_like +def _is_executorch_binary_signature(prefix: bytes) -> bool: + """Recognize ExecuTorch FlatBuffers binaries by their file identifier.""" + return len(prefix) >= 8 and prefix[4:8] == b"ET12" + + def _is_zlib_header(prefix: bytes) -> bool: if len(prefix) < 2: return False @@ -312,6 +317,9 @@ def detect_file_format_from_magic(path: str) -> str: magic8 = header[:8] magic16 = header[:16] + if _is_executorch_binary_signature(header): + return "executorch" + # Try the new pattern matching approach first format_result = detect_format_from_magic_bytes(magic4, magic8, magic16) if format_result != "unknown": @@ -845,7 +853,7 @@ def validate_file_type(path: str) -> bool: # ExecuTorch files should be zip archives if ext_format == "executorch": - return header_format == "zip" + return header_format == "zip" or _is_executorch_binary_signature(read_magic_bytes(path, 16)) # Keras files can be either ZIP (Keras 3.x) or HDF5 (legacy Keras) if ext_format == "keras": diff --git a/tests/conftest.py b/tests/conftest.py index 848f456b..599cd7ba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -102,6 +102,7 @@ def pytest_runtest_setup(item): "test_mxnet_scanner.py", # MXNet scanner tests "test_tf_metagraph_scanner.py", # TensorFlow MetaGraph scanner tests "test_torchserve_mar_scanner.py", # TorchServe .mar scanner tests + "test_executorch_scanner.py", # ExecuTorch scanner tests "test_telemetry.py", # telemetry payload and availability tests "test_telemetry_decoupling.py", # telemetry failure-isolation tests "test_debug_command.py", # debug output telemetry flags diff --git a/tests/scanners/test_executorch_scanner.py b/tests/scanners/test_executorch_scanner.py index b026e287..95a1791e 100644 --- a/tests/scanners/test_executorch_scanner.py +++ b/tests/scanners/test_executorch_scanner.py @@ -55,3 +55,14 @@ def test_executorch_scanner_invalid_zip(tmp_path): result = scanner.scan(str(file_path)) assert not result.success assert any("executorch" in i.message.lower() for i in result.issues) + + +def test_executorch_scanner_accepts_binary_program_header(tmp_path): + file_path = tmp_path / "program.pte" + file_path.write_bytes(b"\x40\x00\x00\x00ET12eh00\x20\x00\x00\x00\xe8\x8c\x01\x00\x00\x00\x00\x00") + scanner = ExecuTorchScanner() + result = scanner.scan(str(file_path)) + assert result.success is True + assert result.bytes_scanned == file_path.stat().st_size + assert not any("not a valid executorch archive" in issue.message.lower() for issue in result.issues) + assert not any("file type validation failed" in issue.message.lower() for issue in result.issues) diff --git a/tests/utils/file/test_filetype.py b/tests/utils/file/test_filetype.py index 65bc34e5..276ca05f 100644 --- a/tests/utils/file/test_filetype.py +++ b/tests/utils/file/test_filetype.py @@ -523,6 +523,13 @@ def test_validate_file_type(tmp_path): mar.writestr("weights.bin", b"weights") mar.writestr("handler.py", b"def handle(data, context):\n return data\n") assert validate_file_type(str(mar_path)) is True + + # ExecuTorch binaries use a FlatBuffers identifier at bytes 4..7. + executorch_path = tmp_path / "program.pte" + executorch_path.write_bytes(b"\x40\x00\x00\x00ET12eh00" + b"\x20\x00\x00\x00" + b"\x00" * 16) + assert detect_file_format_from_magic(str(executorch_path)) == "executorch" + assert validate_file_type(str(executorch_path)) is True + # Llamafile wrappers validate by extension with scanner-level marker checks. llamafile_path = tmp_path / "model.llamafile" llamafile_path.write_bytes(b"\x7fELF" + b"\x00" * 32 + b"llamafile") From 6c5ba7bce055a9a4b6787d2584f3d753b2649a88 Mon Sep 17 00:00:00 2001 From: Yash Chhabria Date: Mon, 16 Mar 2026 11:17:44 -0700 Subject: [PATCH 2/4] fix: address executorch review feedback Align ExecuTorch binary signature checks with the shared detector, remove the duplicate changelog heading, and add the missing test return annotation. Co-Authored-By: Codex --- CHANGELOG.md | 3 --- modelaudit/scanners/executorch_scanner.py | 4 ++-- tests/scanners/test_executorch_scanner.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b296120..007fab45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -82,9 +82,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - eliminate false positives for valid ExecuTorch FlatBuffers binaries and file-type validation on public `.pte` models - -### Fixed - - **security:** remove `dill.load` / `dill.loads` from the pickle safe-global allowlist so recursive dill deserializers stay flagged as dangerous loader entry points - **security:** add exact dangerous helper coverage for validated torch and NumPy refs such as `numpy.f2py.crackfortran.getlincoef`, `torch._dynamo.guards.GuardBuilder.get`, and `torch.utils.collect_env.run` - **security:** add exact dangerous-global coverage for `numpy.load`, `site.main`, `_io.FileIO`, `test.support.script_helper.assert_python_ok`, `_osx_support._read_output`, `_aix_support._read_cmd_output`, `_pyrepl.pager.pipe_pager`, `torch.serialization.load`, and `torch._inductor.codecache.compile_file` (9 PickleScan-only loader and execution primitives) diff --git a/modelaudit/scanners/executorch_scanner.py b/modelaudit/scanners/executorch_scanner.py index 154945e5..087e9cfa 100644 --- a/modelaudit/scanners/executorch_scanner.py +++ b/modelaudit/scanners/executorch_scanner.py @@ -39,8 +39,8 @@ def _read_header(path: str, length: int = 4) -> bytes: @staticmethod def _is_executorch_binary(header: bytes) -> bool: - # Real-world .pte files use a FlatBuffers-style file identifier at bytes 4..7. - return len(header) >= 8 and header[4:6] == b"ET" + # Real-world .pte files use the FlatBuffers file identifier "ET12" at bytes 4..7. + return len(header) >= 8 and header[4:8] == b"ET12" def scan(self, path: str) -> ScanResult: path_check_result = self._check_path(path) diff --git a/tests/scanners/test_executorch_scanner.py b/tests/scanners/test_executorch_scanner.py index 95a1791e..19e777a0 100644 --- a/tests/scanners/test_executorch_scanner.py +++ b/tests/scanners/test_executorch_scanner.py @@ -57,7 +57,7 @@ def test_executorch_scanner_invalid_zip(tmp_path): assert any("executorch" in i.message.lower() for i in result.issues) -def test_executorch_scanner_accepts_binary_program_header(tmp_path): +def test_executorch_scanner_accepts_binary_program_header(tmp_path) -> None: file_path = tmp_path / "program.pte" file_path.write_bytes(b"\x40\x00\x00\x00ET12eh00\x20\x00\x00\x00\xe8\x8c\x01\x00\x00\x00\x00\x00") scanner = ExecuTorchScanner() From 3cf01089d1b5a741f08fb097a3b2b17aa6adf560 Mon Sep 17 00:00:00 2001 From: Yash Chhabria Date: Mon, 16 Mar 2026 12:17:31 -0700 Subject: [PATCH 3/4] fix: annotate executorch regression test --- tests/scanners/test_executorch_scanner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/scanners/test_executorch_scanner.py b/tests/scanners/test_executorch_scanner.py index 19e777a0..0f4566b1 100644 --- a/tests/scanners/test_executorch_scanner.py +++ b/tests/scanners/test_executorch_scanner.py @@ -57,7 +57,7 @@ def test_executorch_scanner_invalid_zip(tmp_path): assert any("executorch" in i.message.lower() for i in result.issues) -def test_executorch_scanner_accepts_binary_program_header(tmp_path) -> None: +def test_executorch_scanner_accepts_binary_program_header(tmp_path: Path) -> None: file_path = tmp_path / "program.pte" file_path.write_bytes(b"\x40\x00\x00\x00ET12eh00\x20\x00\x00\x00\xe8\x8c\x01\x00\x00\x00\x00\x00") scanner = ExecuTorchScanner() From 74a8bd6152cb667e6943a854fde7a10e1c5dff2c Mon Sep 17 00:00:00 2001 From: mldangelo Date: Tue, 17 Mar 2026 07:37:31 -0700 Subject: [PATCH 4/4] fix(executorch): validate versioned flatbuffer binaries --- modelaudit/scanners/executorch_scanner.py | 8 +-- modelaudit/utils/file/detection.py | 59 +++++++++++++++++++++-- tests/scanners/test_executorch_scanner.py | 31 +++++++++++- tests/utils/file/test_filetype.py | 25 +++++++++- 4 files changed, 109 insertions(+), 14 deletions(-) diff --git a/modelaudit/scanners/executorch_scanner.py b/modelaudit/scanners/executorch_scanner.py index 087e9cfa..93729de3 100644 --- a/modelaudit/scanners/executorch_scanner.py +++ b/modelaudit/scanners/executorch_scanner.py @@ -7,6 +7,7 @@ from typing import Any, ClassVar from ..utils import sanitize_archive_path +from ..utils.file.detection import _is_valid_executorch_binary from .base import BaseScanner, IssueSeverity, ScanResult from .pickle_scanner import PickleScanner @@ -37,11 +38,6 @@ def _read_header(path: str, length: int = 4) -> bytes: except Exception: return b"" - @staticmethod - def _is_executorch_binary(header: bytes) -> bool: - # Real-world .pte files use the FlatBuffers file identifier "ET12" at bytes 4..7. - return len(header) >= 8 and header[4:8] == b"ET12" - def scan(self, path: str) -> ScanResult: path_check_result = self._check_path(path) if path_check_result: @@ -56,7 +52,7 @@ def scan(self, path: str) -> ScanResult: result.metadata["file_size"] = file_size header = self._read_header(path, length=8) - if self._is_executorch_binary(header): + if _is_valid_executorch_binary(path): result.add_check( name="ExecuTorch Binary Format Validation", passed=True, diff --git a/modelaudit/utils/file/detection.py b/modelaudit/utils/file/detection.py index 40bfb6c5..deb69dd8 100644 --- a/modelaudit/utils/file/detection.py +++ b/modelaudit/utils/file/detection.py @@ -190,8 +190,59 @@ def _is_lightgbm_signature(prefix: bytes) -> bool: def _is_executorch_binary_signature(prefix: bytes) -> bool: - """Recognize ExecuTorch FlatBuffers binaries by their file identifier.""" - return len(prefix) >= 8 and prefix[4:8] == b"ET12" + """Recognize versioned ExecuTorch FlatBuffers binaries by their file identifier.""" + return len(prefix) >= 8 and prefix[4:6] == b"ET" and prefix[6:8].isdigit() + + +def _is_valid_executorch_binary(path: str | Path) -> bool: + """Validate the minimal FlatBuffers structure for ExecuTorch binaries.""" + file_path = Path(path) + if not file_path.is_file(): + return False + + try: + file_size = file_path.stat().st_size + if file_size < 16: + return False + + with file_path.open("rb") as f: + header = f.read(8) + if not _is_executorch_binary_signature(header): + return False + + root_table_offset = struct.unpack(" file_size: + return False + + f.seek(root_table_offset) + table_header = f.read(4) + if len(table_header) != 4: + return False + + vtable_back_offset = struct.unpack(" root_table_offset: + return False + + vtable_offset = root_table_offset - vtable_back_offset + if vtable_offset < 8 or vtable_offset + 4 > file_size: + return False + + f.seek(vtable_offset) + vtable_header = f.read(4) + if len(vtable_header) != 4: + return False + + vtable_size, object_size = struct.unpack(" file_size: + return False + if root_table_offset + object_size > file_size: + return False + except (OSError, struct.error): + return False + + return True def _is_zlib_header(prefix: bytes) -> bool: @@ -317,7 +368,7 @@ def detect_file_format_from_magic(path: str) -> str: magic8 = header[:8] magic16 = header[:16] - if _is_executorch_binary_signature(header): + if _is_valid_executorch_binary(file_path): return "executorch" # Try the new pattern matching approach first @@ -853,7 +904,7 @@ def validate_file_type(path: str) -> bool: # ExecuTorch files should be zip archives if ext_format == "executorch": - return header_format == "zip" or _is_executorch_binary_signature(read_magic_bytes(path, 16)) + return header_format == "zip" or _is_valid_executorch_binary(path) # Keras files can be either ZIP (Keras 3.x) or HDF5 (legacy Keras) if ext_format == "keras": diff --git a/tests/scanners/test_executorch_scanner.py b/tests/scanners/test_executorch_scanner.py index 0f4566b1..d33cd8d8 100644 --- a/tests/scanners/test_executorch_scanner.py +++ b/tests/scanners/test_executorch_scanner.py @@ -6,6 +6,13 @@ from modelaudit.scanners.executorch_scanner import ExecuTorchScanner +def create_executorch_binary(tmp_path: Path, *, identifier: bytes = b"ET12") -> Path: + binary_path = tmp_path / "program.pte" + # Minimal valid FlatBuffer with the ExecuTorch file identifier. + binary_path.write_bytes(b"\x0c\x00\x00\x00" + identifier + b"\x04\x00\x04\x00\x04\x00\x00\x00") + return binary_path + + def create_executorch_archive(tmp_path: Path, *, malicious: bool = False) -> Path: zip_path = tmp_path / "model.ptl" with zipfile.ZipFile(zip_path, "w") as z: @@ -58,11 +65,31 @@ def test_executorch_scanner_invalid_zip(tmp_path): def test_executorch_scanner_accepts_binary_program_header(tmp_path: Path) -> None: - file_path = tmp_path / "program.pte" - file_path.write_bytes(b"\x40\x00\x00\x00ET12eh00\x20\x00\x00\x00\xe8\x8c\x01\x00\x00\x00\x00\x00") + file_path = create_executorch_binary(tmp_path) scanner = ExecuTorchScanner() result = scanner.scan(str(file_path)) assert result.success is True assert result.bytes_scanned == file_path.stat().st_size assert not any("not a valid executorch archive" in issue.message.lower() for issue in result.issues) assert not any("file type validation failed" in issue.message.lower() for issue in result.issues) + + +def test_executorch_scanner_accepts_versioned_binary_program_header(tmp_path: Path) -> None: + file_path = create_executorch_binary(tmp_path, identifier=b"ET13") + scanner = ExecuTorchScanner() + result = scanner.scan(str(file_path)) + + assert result.success is True + assert result.bytes_scanned == file_path.stat().st_size + assert not result.issues + + +def test_executorch_scanner_rejects_invalid_binary_signature_match(tmp_path: Path) -> None: + file_path = tmp_path / "fake-program.pte" + file_path.write_bytes(b"JUNKET12notflatbufferatall") + + scanner = ExecuTorchScanner() + result = scanner.scan(str(file_path)) + + assert result.success is False + assert any(issue.rule_code == "S104" for issue in result.issues) diff --git a/tests/utils/file/test_filetype.py b/tests/utils/file/test_filetype.py index 276ca05f..316122fb 100644 --- a/tests/utils/file/test_filetype.py +++ b/tests/utils/file/test_filetype.py @@ -218,6 +218,22 @@ def test_detect_torch7_formats_by_signature(tmp_path: Path) -> None: assert detect_file_format_from_magic(str(torch7_path)) == "torch7" assert validate_file_type(str(torch7_path)) is True + +def test_detect_executorch_binary_requires_valid_flatbuffer_structure(tmp_path: Path) -> None: + executorch_path = tmp_path / "program.pte" + executorch_path.write_bytes(b"\x0c\x00\x00\x00ET13\x04\x00\x04\x00\x04\x00\x00\x00") + + assert detect_file_format(str(executorch_path)) == "executorch" + assert detect_file_format_from_magic(str(executorch_path)) == "executorch" + assert validate_file_type(str(executorch_path)) is True + + fake_executorch_path = tmp_path / "fake-program.pte" + fake_executorch_path.write_bytes(b"JUNKET12notflatbufferatall") + + assert detect_file_format(str(fake_executorch_path)) == "executorch" + assert detect_file_format_from_magic(str(fake_executorch_path)) == "unknown" + assert validate_file_type(str(fake_executorch_path)) is False + fake_torch7 = tmp_path / "fake.t7" fake_torch7.write_text("not torch7") assert detect_file_format(str(fake_torch7)) == "unknown" @@ -524,12 +540,17 @@ def test_validate_file_type(tmp_path): mar.writestr("handler.py", b"def handle(data, context):\n return data\n") assert validate_file_type(str(mar_path)) is True - # ExecuTorch binaries use a FlatBuffers identifier at bytes 4..7. + # ExecuTorch binaries require a valid FlatBuffers layout in addition to the file identifier. executorch_path = tmp_path / "program.pte" - executorch_path.write_bytes(b"\x40\x00\x00\x00ET12eh00" + b"\x20\x00\x00\x00" + b"\x00" * 16) + executorch_path.write_bytes(b"\x0c\x00\x00\x00ET13\x04\x00\x04\x00\x04\x00\x00\x00") assert detect_file_format_from_magic(str(executorch_path)) == "executorch" assert validate_file_type(str(executorch_path)) is True + invalid_executorch_path = tmp_path / "invalid-program.pte" + invalid_executorch_path.write_bytes(b"\x0c\x00\x00\x00ETAA\x04\x00\x04\x00\x04\x00\x00\x00") + assert detect_file_format_from_magic(str(invalid_executorch_path)) == "unknown" + assert validate_file_type(str(invalid_executorch_path)) is False + # Llamafile wrappers validate by extension with scanner-level marker checks. llamafile_path = tmp_path / "model.llamafile" llamafile_path.write_bytes(b"\x7fELF" + b"\x00" * 32 + b"llamafile")