Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
Changelog
=========

Version 2.1.1
-------------

- Fixing #141: deep scan no longer overrides valid binary format matches (e.g. JPEG) for files without extensions (thanks to marph91)

Version 2.1.0
-------------

Expand Down
18 changes: 11 additions & 7 deletions puremagic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
)

__author__ = "Chris Griffith"
__version__ = "2.1.0"
__version__ = "2.1.1"
__all__ = [
"magic_file",
"magic_string",
Expand Down Expand Up @@ -212,13 +212,12 @@ def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=N
raise PureValueError("Input was empty")
infos = identify_all(header, footer, ext)
if filename and os.path.isfile(filename) and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
results = run_deep_scan(infos, filename, header, footer, raise_on_none=True)
if results:
if results[0].extension == "":
raise PureError("Could not identify file")
results = run_deep_scan(infos, filename, header, footer, raise_on_none=not infos)
if results and results[0].extension != "":
if mime:
return results[0].mime_type
return results[0].extension or ""
# Deep scan returned empty extension or no results — fall through to original matches
if not infos:
raise PureError("Could not identify file")
info = infos[0]
Expand Down Expand Up @@ -556,13 +555,18 @@ def run_deep_scan(

# No specific scanner matched — try the catch-all text scanner
# Only override when existing matches are very low confidence (e.g. 2-byte BOM signatures)
if matches[0].confidence < 0.5:
# Only let the catch-all text scanner override when existing matches are
# generic text types (e.g. BOM-only signatures). If the magic database
# already identified a specific non-text file type, trust it over a generic text guess.
best_mime = matches[0].mime_type or ""
is_generic = best_mime.startswith("text/") or best_mime == "application/octet-stream" or not best_mime
if matches[0].confidence < 0.5 and is_generic:
try:
result = catch_all_deep_scan(filename, head, foot)
except Exception:
pass
else:
if result and result.confidence > matches[0].confidence:
if result and result.extension and result.confidence > matches[0].confidence:
return [
PureMagicWithConfidence(
confidence=result.confidence,
Expand Down
1 change: 0 additions & 1 deletion test/test_common_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,6 @@ def test_riff_wav_mime():
assert mime == "audio/wav"



def test_cfbf_doc():
"""CFBF scanner correctly identifies Word .doc"""
ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"))
Expand Down
21 changes: 20 additions & 1 deletion test/test_scanners.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import puremagic
from test.common import OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR
from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR
from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner

sample_text = b"""Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending}
Expand Down Expand Up @@ -104,6 +104,25 @@ def test_eml_scanner():
assert results[0].confidence == 1.0


def test_jpg_without_extension():
    """GH #141: a JPEG stored without a file extension must still map to image/jpeg."""
    import struct

    # Hand-built minimal JFIF byte stream: SOI + APP0 marker, APP0 segment
    # length, "JFIF" identifier with version/units, X/Y density, empty
    # thumbnail dimensions, and the closing EOI marker.
    payload = b"".join(
        (
            b"\xff\xd8\xff\xe0",
            struct.pack(">H", 16),
            b"JFIF\x00\x01\x01\x00",
            struct.pack(">HH", 1, 1),
            b"\x00\x00\xff\xd9",
        )
    )

    # The file name deliberately has no suffix so only content-based
    # detection can identify it.
    target = IMAGE_DIR / "test_jpeg_no_ext"
    target.write_bytes(payload)
    try:
        result = puremagic.from_file(target, mime=True)
        assert result == "image/jpeg", f"Expected image/jpeg, got {result}"
    finally:
        # Always remove the temporary fixture, even when the assertion fails.
        target.unlink()


def test_sndhdr_scanner():
# Test the sndhdr scanner with sndr file
sndr_file = AUDIO_DIR / "test.sndr"
Expand Down
Loading