Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.15.2
rev: v0.15.5
hooks:
# Run the linter.
- id: ruff-check
Expand All @@ -53,7 +53,7 @@ repos:
pass_filenames: false

- repo: https://github.com/tox-dev/pyproject-fmt
rev: v2.16.1
rev: v2.16.2
hooks:
- id: pyproject-fmt

Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
Changelog
=========

Version 2.0.1
-------------

- Fixing #126 Python scanner false positives — now uses AST node walking instead of regex to verify real Python constructs (thanks to ahobson)
- Fixing #134 UTF-16 LE BOM (FF FE) no longer misidentified as .mp1 audio, added UTF-16 BOM detection to text scanner (thanks to tomazfs)
- Fixing #135 from_string(), from_stream(), magic_string(), and magic_stream() now perform deep scan when filename is provided (thanks to denisw)

Version 2.0.0
-------------

Expand Down
6 changes: 1 addition & 5 deletions puremagic/magic_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -714,9 +714,6 @@
"fffd": [
["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
],
"fffe": [
["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
],
"ffff": [
["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
]
Expand Down Expand Up @@ -974,7 +971,6 @@
["fffb", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
["fffc", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
["fffd", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
["fffe", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
["ffff", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
["4f67675300020000000000000000", 0, ".ogg", "application/ogg", "Ogg Vorbis audio file"],
["57415645666d7420", 8, ".wav", "audio/x-wav", "Windows audio file "],
Expand Down Expand Up @@ -1027,7 +1023,7 @@
["504b0304140008000800", 0, ".jar", "application/java-archive", "Java Archive file"],
["5f27a889", 0, ".jar", "application/java-archive", "Jar Archive file"],
["edabeedb", 0, ".rpm", "application/x-rpm", "RedHat Package Manager file"],
["fffe", 0, ".ini", "text/plain", "Windows INI file"],
["fffe", 0, ".txt", "text/plain", "UTF-16 LE text file"],
["fffe23006c0069006e00650020003100", 0, ".mof", "text/plain", "Windows MSinfo file"],
["ffffffff", 0, ".sys", "text/plain", "DOS system driver"],
[
Expand Down
43 changes: 38 additions & 5 deletions puremagic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
)

__author__ = "Chris Griffith"
__version__ = "2.0.0"
__version__ = "2.0.1"
__all__ = [
"magic_file",
"magic_string",
Expand Down Expand Up @@ -215,7 +215,7 @@ def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=N
raise PureError("Could not identify file")
if mime:
return results[0].mime_type
return results[0].extension
return results[0].extension or ""
if not infos:
raise PureError("Could not identify file")
info = infos[0]
Expand Down Expand Up @@ -306,7 +306,7 @@ def from_string(string: str | bytes, mime: bool = False, filename: os.PathLike |
string = string.encode("utf-8")
head, foot = string_details(string)
ext = ext_from_filename(filename) if filename else None
return perform_magic(head, foot, mime, ext)
return perform_magic(head, foot, mime, ext, filename=filename)


def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None = None) -> str:
Expand All @@ -322,7 +322,7 @@ def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None =
"""
head, foot = stream_details(stream)
ext = ext_from_filename(filename) if filename else None
return perform_magic(head, foot, mime, ext)
return perform_magic(head, foot, mime, ext, filename=filename)


def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
Expand Down Expand Up @@ -362,6 +362,8 @@ def magic_string(string, filename: os.PathLike | str | None = None) -> list[Pure
ext = ext_from_filename(filename) if filename else None
info = identify_all(head, foot, ext)
info.sort(key=lambda x: x.confidence, reverse=True)
if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
return run_deep_scan(info, filename, head, foot, raise_on_none=False)
return info


Expand All @@ -383,6 +385,8 @@ def magic_stream(
ext = ext_from_filename(filename) if filename else None
info = identify_all(head, foot, ext)
info.sort(key=lambda x: x.confidence, reverse=True)
if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
return run_deep_scan(info, filename, head, foot, raise_on_none=False)
return info


Expand Down Expand Up @@ -469,7 +473,16 @@ def run_deep_scan(
raise
else:
if result:
return [result]
return [
PureMagicWithConfidence(
confidence=result.confidence,
byte_match=None,
offset=None,
extension=result.extension,
mime_type=result.mime_type,
name=result.name,
)
]
if raise_on_none:
raise PureError("Could not identify file")

Expand All @@ -490,6 +503,26 @@ def run_deep_scan(
name=result.name,
)
]

# No specific scanner matched — try the catch-all text scanner
# Only override when existing matches are very low confidence (e.g. 2-byte BOM signatures)
if matches[0].confidence < 0.5:
try:
result = catch_all_deep_scan(filename, head, foot)
except Exception:
pass
else:
if result and result.confidence > matches[0].confidence:
return [
PureMagicWithConfidence(
confidence=result.confidence,
byte_match=None,
offset=None,
extension=result.extension,
mime_type=result.mime_type,
name=result.name,
)
]
return matches


Expand Down
18 changes: 16 additions & 2 deletions puremagic/scanners/hdf5_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,25 @@
# Loom - single-cell genomics
([], [b"/matrix", b"/row_attrs", b"/col_attrs"], 2, ".loom", "Loom single-cell data", "application/x-loom"),
# Multi-resolution Cooler (must check before single-resolution)
([b"/resolutions"], [b"/bins", b"/chroms"], 1, ".mcool", "Multi-resolution Cooler contact matrix", "application/x-mcool"),
(
[b"/resolutions"],
[b"/bins", b"/chroms"],
1,
".mcool",
"Multi-resolution Cooler contact matrix",
"application/x-mcool",
),
# Cooler - genomic contact matrices
([], [b"/bins", b"/chroms", b"/pixels"], 2, ".cool", "Cooler contact matrix", "application/x-cooler"),
# BIOM v2 - biological observation matrix
([], [b"BIOM", b"/observation", b"/sample"], 2, ".biom2", "BIOM v2 biological observation matrix", "application/x-biom2"),
(
[],
[b"BIOM", b"/observation", b"/sample"],
2,
".biom2",
"BIOM v2 biological observation matrix",
"application/x-biom2",
),
# mz5 - mass spectrometry
([], [b"/SpectrumMetaData", b"/ChomatogramMetaData"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
# h5mlm - ML model
Expand Down
4 changes: 3 additions & 1 deletion puremagic/scanners/json_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@


def main(file_path: os.PathLike | str, head: bytes, foot: bytes) -> Match | None:
if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")) and not (head.strip().startswith(b"[") and foot.strip().endswith(b"]")):
if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")) and not (
head.strip().startswith(b"[") and foot.strip().endswith(b"]")
):
return None
try:
with open(file_path, "rb") as file:
Expand Down
4 changes: 3 additions & 1 deletion puremagic/scanners/mpeg_audio_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
b"\xff\xf8", # MPEG-1, Layer II (MP2), No Protection (CRC not used)
# Layer I (MP1) - Layer Bits = 11
b"\xff\xff", # MPEG-1, Layer I (MP1), Protected (CRC used)
b"\xff\xfe", # MPEG-1, Layer I (MP1), Protected (CRC used)
# b"\xff\xfe" excluded — conflicts with UTF-16 LE BOM (GH #134)
b"\xff\xfd", # MPEG-1, Layer I (MP1), No Protection (CRC not used)
b"\xff\xfc", # MPEG-1, Layer I (MP1), No Protection (CRC not used)
b"\xff\xfb", # MPEG-1, Layer I (MP1), Protected (CRC used)
Expand Down Expand Up @@ -614,6 +614,8 @@ def _parse_vbr_header(self, frame_bytes: bytes, header_results: Dict[str, Any])

# 2. Determine Offsets using validated results
mpeg_version_str = header_results.get("mpeg_version")
if mpeg_version_str is None:
return None
mpeg_version_index = self.mpeg_version_reverse.get(mpeg_version_str)

if mpeg_version_index is None:
Expand Down
109 changes: 34 additions & 75 deletions puremagic/scanners/python_scanner.py
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
import ast
import os
import re

from puremagic.scanners.helpers import Match

# Keywords / dunder names whose presence suggests real Python source.
# NOTE: the patterns MUST be raw strings. In a normal string literal "\b" is
# the backspace character (0x08), not the regex word-boundary assertion, so
# non-raw patterns would never match ordinary source text.
python_common_keywords = [
    re.compile(r"\bdef\b"),
    re.compile(r"\bclass\b"),
    re.compile(r"\bimport\b"),
    re.compile(r"\belif\b"),
    re.compile(r"\bwhile\b"),
    re.compile(r"\bexcept\b"),
    re.compile(r"\bfinally\b"),
    re.compile(r"\breturn\b"),
    re.compile(r"\byield\b"),
    re.compile(r"\blambda\b"),
    re.compile(r"\bTrue\b"),
    re.compile(r"\bFalse\b"),
    re.compile(r"\bNone\b"),
    re.compile(r"\b__version__\b"),
    # No word boundaries here: matched anywhere, e.g. inside quotes.
    re.compile("__main__"),
]

# Structural regexes for common Python statements and calls. Each source
# pattern is listed next to the construct it is meant to recognise and is
# compiled once at import time.
_PYTHON_PATTERN_SOURCES = (
    r"\bdef\s+\w+\s*\(",  # Function definitions
    r"\bclass\s+\w+\s*[\(:]",  # Class definitions
    r"\bimport\s+\w+",  # Import statements
    r"\bfrom\s+\w+\s+import",  # From-import statements
    r"\bif\s+.*:",  # If statements
    r"\bfor\s+\w+\s+in\s+.*:",  # For loops
    r"\bwhile\s+.*:",  # While loops
    r"\btry\s*:",  # Try blocks
    r"\.append\(",  # Method calls
    r"\.join\(",  # String operations
    r"print\s*\(",  # Print statements
)

python_patterns = [re.compile(source) for source in _PYTHON_PATTERN_SOURCES]
# AST node types that are strong indicators of real Python code
_PYTHON_NODE_TYPES = (
    ast.Import,
    ast.ImportFrom,
    ast.FunctionDef,
    ast.AsyncFunctionDef,
    ast.ClassDef,
    ast.For,
    ast.AsyncFor,
    ast.While,
    ast.With,
    ast.AsyncWith,
    ast.Try,
    ast.Raise,
    ast.Assert,
) + tuple(
    # ``try/except*`` parses to ast.TryStar (3.11+) and ``match`` to ast.Match
    # (3.10+). They are looked up by name so older interpreters that lack
    # these node classes still import this module cleanly.
    getattr(ast, name)
    for name in ("TryStar", "Match")
    if hasattr(ast, name)
)


def _has_python_constructs(tree: ast.Module, threshold: int = 4) -> bool:
    """Walk the AST and check for node types that indicate real Python code.

    Simple expressions (tuples, names, constants) can appear in CSV, config files,
    and other non-Python text that happens to parse. Real Python code will contain
    imports, function/class definitions, control flow, etc.

    :param tree: parsed module, as returned by ``ast.parse``
    :param threshold: number of indicator nodes required before the tree is
        considered real Python code
    :return: True as soon as ``threshold`` indicator nodes have been seen,
        False if the whole tree is walked without reaching it
    """
    count = 0
    for node in ast.walk(tree):
        if isinstance(node, _PYTHON_NODE_TYPES):
            count += 1
            # Stop early; there is no need to walk the rest of a large tree.
            if count >= threshold:
                return True
    return False


def main(file_path: os.PathLike | str, _, __) -> Match | None:
Expand All @@ -48,11 +48,10 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()

# Parse to ensure it's valid Python syntax
ast.parse(content)
tree = ast.parse(content)

if not str(file_path).endswith(".py"):
if not is_substantial_python_code(content):
if not _has_python_constructs(tree):
return None

except (SyntaxError, UnicodeDecodeError, PermissionError, OSError):
Expand All @@ -64,43 +63,3 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
mime_type="text/x-python",
confidence=1.0,
)


def is_substantial_python_code(content: str) -> bool:
    """
    Decide whether ``content`` looks like meaningful Python code.

    The text must have at least two non-empty lines once everything after the
    first "#" on each line is dropped, must match at least two entries of
    ``python_common_keywords``, and must match at least one entry of
    ``python_patterns``.

    :param content: full text of the candidate file
    :return: True when all three checks pass, otherwise False
    """
    # Naive comment stripping: cut each line at its first "#" (this also cuts
    # "#" characters that sit inside string literals), then keep non-blank lines.
    trimmed = (raw_line.split("#")[0].strip() for raw_line in content.splitlines())
    code_lines = [line for line in trimmed if line]

    # If too few substantial lines, it's probably not real code
    if len(code_lines) < 2:
        return False

    code_text = " ".join(code_lines)

    # Require at least 2 distinct keyword patterns to be present
    keyword_hits = sum(1 for keyword in python_common_keywords if keyword.search(code_text))
    if keyword_hits < 2:
        return False

    # Finally, at least one common structural pattern must appear
    return any(pattern.search(code_text) for pattern in python_patterns)
Loading