From 5e76c2d2df274e6bc8c7693ce85355a1ebee7f88 Mon Sep 17 00:00:00 2001 From: Chris Griffith Date: Mon, 23 Mar 2026 11:37:14 -0500 Subject: [PATCH] - Fixing #141 deep scan no longer overrides valid binary format matches (e.g. JPEG) for files without extensions (thanks to marph91) --- CHANGELOG.md | 5 +++++ puremagic/main.py | 18 +++++++++++------- test/test_common_extensions.py | 1 - test/test_scanners.py | 21 ++++++++++++++++++++- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3528335..6300553 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ Changelog ========= +Version 2.1.1 +------------- + +- Fixing #141 deep scan no longer overrides valid binary format matches (e.g. JPEG) for files without extensions (thanks to marph91) + Version 2.1.0 ------------- diff --git a/puremagic/main.py b/puremagic/main.py index d824563..56942c6 100644 --- a/puremagic/main.py +++ b/puremagic/main.py @@ -35,7 +35,7 @@ ) __author__ = "Chris Griffith" -__version__ = "2.1.0" +__version__ = "2.1.1" __all__ = [ "magic_file", "magic_string", @@ -212,13 +212,12 @@ def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=N raise PureValueError("Input was empty") infos = identify_all(header, footer, ext) if filename and os.path.isfile(filename) and os.getenv("PUREMAGIC_DEEPSCAN") != "0": - results = run_deep_scan(infos, filename, header, footer, raise_on_none=True) - if results: - if results[0].extension == "": - raise PureError("Could not identify file") + results = run_deep_scan(infos, filename, header, footer, raise_on_none=not infos) + if results and results[0].extension != "": if mime: return results[0].mime_type return results[0].extension or "" + # Deep scan returned empty extension or no results — fall through to original matches if not infos: raise PureError("Could not identify file") info = infos[0] @@ -556,13 +555,18 @@ def run_deep_scan( # No specific scanner matched — try the catch-all text scanner # Only override when existing matches are very low confidence (e.g. 2-byte BOM signatures) - if matches[0].confidence < 0.5: + # Only let the catch-all text scanner override when existing matches are + # generic text types (e.g. BOM-only signatures). If the magic database + # already identified a specific non-text file type, trust it over a generic text guess. + best_mime = matches[0].mime_type or "" + is_generic = best_mime.startswith("text/") or best_mime == "application/octet-stream" or not best_mime + if matches[0].confidence < 0.5 and is_generic: try: result = catch_all_deep_scan(filename, head, foot) except Exception: pass else: - if result and result.confidence > matches[0].confidence: + if result and result.extension and result.confidence > matches[0].confidence: return [ PureMagicWithConfidence( confidence=result.confidence, diff --git a/test/test_common_extensions.py b/test/test_common_extensions.py index 5bcf568..0294b2b 100644 --- a/test/test_common_extensions.py +++ b/test/test_common_extensions.py @@ -269,7 +269,6 @@ def test_riff_wav_mime(): assert mime == "audio/wav" - def test_cfbf_doc(): """CFBF scanner correctly identifies Word .doc""" ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc")) diff --git a/test/test_scanners.py b/test/test_scanners.py index b14344b..4d4a0cc 100644 --- a/test/test_scanners.py +++ b/test/test_scanners.py @@ -1,5 +1,5 @@ import puremagic -from test.common import OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR +from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner sample_text = b"""Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending} @@ -104,6 +104,25 @@ def test_eml_scanner(): assert results[0].confidence == 1.0 +def test_jpg_without_extension(): + # GH #141: JPEG file without extension should still be identified as image/jpeg + import struct + + data = b"\xff\xd8\xff\xe0" + data += struct.pack(">H", 16) + data += b"JFIF\x00\x01\x01\x00" + data += struct.pack(">HH", 1, 1) + data += b"\x00\x00\xff\xd9" + + no_ext_file = IMAGE_DIR / "test_jpeg_no_ext" + no_ext_file.write_bytes(data) + try: + result = puremagic.from_file(no_ext_file, mime=True) + assert result == "image/jpeg", f"Expected image/jpeg, got {result}" + finally: + no_ext_file.unlink() + + def test_sndhdr_scanner(): # Test the sndhdr scanner with sndr file sndr_file = AUDIO_DIR / "test.sndr"