cdgriffith · cdgriffith · Mar 9, 2026 · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -35,7 +35,7 @@ repos:
 
 -   repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.15.2
+    rev: v0.15.5
     hooks:
       # Run the linter.
       - id: ruff-check
@@ -53,7 +53,7 @@ repos:
       pass_filenames: false
 
 -   repo: https://github.com/tox-dev/pyproject-fmt
-    rev: v2.16.1
+    rev: v2.16.2
     hooks:
       - id: pyproject-fmt
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 Changelog
 =========
 
+Version 2.0.1
+-------------
+
+- Fixing #126 Python scanner false positives — now uses AST node walking instead of regex to verify real Python constructs (thanks to ahobson)
+- Fixing #134 UTF-16 LE BOM (FF FE) no longer misidentified as .mp1 audio, added UTF-16 BOM detection to text scanner (thanks to tomazfs)
+- Fixing #135 from_string(), from_stream(), magic_string(), and magic_stream() now perform deep scan when filename is provided (thanks to denisw)
+
 Version 2.0.0
 -------------
 

diff --git a/puremagic/magic_data.json b/puremagic/magic_data.json
@@ -714,9 +714,6 @@
     "fffd": [
         ["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
     ],
-    "fffe": [
-        ["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
-    ],
     "ffff": [
         ["544147", -128, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"]
     ]
@@ -974,7 +971,6 @@
     ["fffb", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
     ["fffc", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
     ["fffd", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
-    ["fffe", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
     ["ffff", 0, ".mp1", "audio/mpeg", "MPEG-1 Audio Layer I (MP1) file"],
     ["4f67675300020000000000000000", 0, ".ogg", "application/ogg", "Ogg Vorbis audio file"],
     ["57415645666d7420", 8, ".wav", "audio/x-wav", "Windows audio file "],
@@ -1027,7 +1023,7 @@
     ["504b0304140008000800", 0, ".jar", "application/java-archive", "Java Archive file"],
     ["5f27a889", 0, ".jar", "application/java-archive", "Jar Archive file"],
     ["edabeedb", 0, ".rpm", "application/x-rpm", "RedHat Package Manager file"],
-    ["fffe", 0, ".ini", "text/plain", "Windows INI file"],
+    ["fffe", 0, ".txt", "text/plain", "UTF-16 LE text file"],
     ["fffe23006c0069006e00650020003100", 0, ".mof", "text/plain", "Windows MSinfo file"],
     ["ffffffff", 0, ".sys", "text/plain", "DOS system driver"],
     [

diff --git a/puremagic/main.py b/puremagic/main.py
@@ -34,7 +34,7 @@
     )
 
 __author__ = "Chris Griffith"
-__version__ = "2.0.0"
+__version__ = "2.0.1"
 __all__ = [
     "magic_file",
     "magic_string",
@@ -215,7 +215,7 @@ def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=N
                 raise PureError("Could not identify file")
             if mime:
                 return results[0].mime_type
-            return results[0].extension
+            return results[0].extension or ""
     if not infos:
         raise PureError("Could not identify file")
     info = infos[0]
@@ -306,7 +306,7 @@ def from_string(string: str | bytes, mime: bool = False, filename: os.PathLike |
         string = string.encode("utf-8")
     head, foot = string_details(string)
     ext = ext_from_filename(filename) if filename else None
-    return perform_magic(head, foot, mime, ext)
+    return perform_magic(head, foot, mime, ext, filename=filename)
 
 
 def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None = None) -> str:
@@ -322,7 +322,7 @@ def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None =
     """
     head, foot = stream_details(stream)
     ext = ext_from_filename(filename) if filename else None
-    return perform_magic(head, foot, mime, ext)
+    return perform_magic(head, foot, mime, ext, filename=filename)
 
 
 def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
@@ -362,6 +362,8 @@ def magic_string(string, filename: os.PathLike | str | None = None) -> list[Pure
     ext = ext_from_filename(filename) if filename else None
     info = identify_all(head, foot, ext)
     info.sort(key=lambda x: x.confidence, reverse=True)
+    if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
+        return run_deep_scan(info, filename, head, foot, raise_on_none=False)
     return info
 
 
@@ -383,6 +385,8 @@ def magic_stream(
     ext = ext_from_filename(filename) if filename else None
     info = identify_all(head, foot, ext)
     info.sort(key=lambda x: x.confidence, reverse=True)
+    if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
+        return run_deep_scan(info, filename, head, foot, raise_on_none=False)
     return info
 
 
@@ -469,7 +473,16 @@ def run_deep_scan(
             raise
         else:
             if result:
-                return [result]
+                return [
+                    PureMagicWithConfidence(
+                        confidence=result.confidence,
+                        byte_match=None,
+                        offset=None,
+                        extension=result.extension,
+                        mime_type=result.mime_type,
+                        name=result.name,
+                    )
+                ]
         if raise_on_none:
             raise PureError("Could not identify file")
 
@@ -490,6 +503,26 @@ def run_deep_scan(
                     name=result.name,
                 )
             ]
+
+    # No specific scanner matched — try the catch-all text scanner
+    # Only override when existing matches are very low confidence (e.g. 2-byte BOM signatures)
+    if matches[0].confidence < 0.5:
+        try:
+            result = catch_all_deep_scan(filename, head, foot)
+        except Exception:
+            pass
+        else:
+            if result and result.confidence > matches[0].confidence:
+                return [
+                    PureMagicWithConfidence(
+                        confidence=result.confidence,
+                        byte_match=None,
+                        offset=None,
+                        extension=result.extension,
+                        mime_type=result.mime_type,
+                        name=result.name,
+                    )
+                ]
     return matches
 
 

diff --git a/puremagic/scanners/hdf5_scanner.py b/puremagic/scanners/hdf5_scanner.py
@@ -12,11 +12,25 @@
     # Loom - single-cell genomics
     ([], [b"/matrix", b"/row_attrs", b"/col_attrs"], 2, ".loom", "Loom single-cell data", "application/x-loom"),
     # Multi-resolution Cooler (must check before single-resolution)
-    ([b"/resolutions"], [b"/bins", b"/chroms"], 1, ".mcool", "Multi-resolution Cooler contact matrix", "application/x-mcool"),
+    (
+        [b"/resolutions"],
+        [b"/bins", b"/chroms"],
+        1,
+        ".mcool",
+        "Multi-resolution Cooler contact matrix",
+        "application/x-mcool",
+    ),
     # Cooler - genomic contact matrices
     ([], [b"/bins", b"/chroms", b"/pixels"], 2, ".cool", "Cooler contact matrix", "application/x-cooler"),
     # BIOM v2 - biological observation matrix
-    ([], [b"BIOM", b"/observation", b"/sample"], 2, ".biom2", "BIOM v2 biological observation matrix", "application/x-biom2"),
+    (
+        [],
+        [b"BIOM", b"/observation", b"/sample"],
+        2,
+        ".biom2",
+        "BIOM v2 biological observation matrix",
+        "application/x-biom2",
+    ),
     # mz5 - mass spectrometry
     ([], [b"/SpectrumMetaData", b"/ChomatogramMetaData"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"),
     # h5mlm - ML model

diff --git a/puremagic/scanners/json_scanner.py b/puremagic/scanners/json_scanner.py
@@ -7,7 +7,9 @@
 
 
 def main(file_path: os.PathLike | str, head: bytes, foot: bytes) -> Match | None:
-    if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")) and not (head.strip().startswith(b"[") and foot.strip().endswith(b"]")):
+    if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")) and not (
+        head.strip().startswith(b"[") and foot.strip().endswith(b"]")
+    ):
         return None
     try:
         with open(file_path, "rb") as file:

diff --git a/puremagic/scanners/mpeg_audio_scanner.py b/puremagic/scanners/mpeg_audio_scanner.py
@@ -64,7 +64,7 @@
     b"\xff\xf8",  # MPEG-1, Layer II (MP2), No Protection (CRC not used)
     # Layer I (MP1) - Layer Bits = 11
     b"\xff\xff",  # MPEG-1, Layer I (MP1), Protected (CRC used)
-    b"\xff\xfe",  # MPEG-1, Layer I (MP1), Protected (CRC used)
+    # b"\xff\xfe" excluded — conflicts with UTF-16 LE BOM (GH #134)
     b"\xff\xfd",  # MPEG-1, Layer I (MP1), No Protection (CRC not used)
     b"\xff\xfc",  # MPEG-1, Layer I (MP1), No Protection (CRC not used)
     b"\xff\xfb",  # MPEG-1, Layer I (MP1), Protected (CRC used)
@@ -614,6 +614,8 @@ def _parse_vbr_header(self, frame_bytes: bytes, header_results: Dict[str, Any])
 
         # 2. Determine Offsets using validated results
         mpeg_version_str = header_results.get("mpeg_version")
+        if mpeg_version_str is None:
+            return None
         mpeg_version_index = self.mpeg_version_reverse.get(mpeg_version_str)
 
         if mpeg_version_index is None:

diff --git a/puremagic/scanners/python_scanner.py b/puremagic/scanners/python_scanner.py
@@ -1,40 +1,40 @@
 import ast
 import os
-import re
 
 from puremagic.scanners.helpers import Match
 
-python_common_keywords = [
-    re.compile("\bdef\b"),
-    re.compile("\bclass\b"),
-    re.compile("\bimport\b"),
-    re.compile("\belif\b"),
-    re.compile("\bwhile\b"),
-    re.compile("\bexcept\b"),
-    re.compile("\bfinally\b"),
-    re.compile("\breturn\b"),
-    re.compile("\byield\b"),
-    re.compile("\blambda\b"),
-    re.compile("\bTrue\b"),
-    re.compile("\bFalse\b"),
-    re.compile("\bNone\b"),
-    re.compile("\b__version__\b"),
-    re.compile("__main__"),
-]
-
-python_patterns = [
-    re.compile(r"\bdef\s+\w+\s*\("),  # Function definitions
-    re.compile(r"\bclass\s+\w+\s*[\(:]"),  # Class definitions
-    re.compile(r"\bimport\s+\w+"),  # Import statements
-    re.compile(r"\bfrom\s+\w+\s+import"),  # From-import statements
-    re.compile(r"\bif\s+.*:"),  # If statements
-    re.compile(r"\bfor\s+\w+\s+in\s+.*:"),  # For loops
-    re.compile(r"\bwhile\s+.*:"),  # While loops
-    re.compile(r"\btry\s*:"),  # Try blocks
-    re.compile(r"\.append\("),  # Method calls
-    re.compile(r"\.join\("),  # String operations
-    re.compile(r"print\s*\("),  # Print statements
-]
+# Statement node types that _has_python_constructs() counts as strong indicators of real Python code
+_PYTHON_NODE_TYPES = (
+    ast.Import,
+    ast.ImportFrom,
+    ast.FunctionDef,
+    ast.AsyncFunctionDef,
+    ast.ClassDef,
+    ast.For,
+    ast.AsyncFor,
+    ast.While,
+    ast.With,
+    ast.AsyncWith,
+    ast.Try,
+    ast.Raise,
+    ast.Assert,
+)
+
+
+def _has_python_constructs(tree: ast.Module, threshold: int = 4) -> bool:
+    """Return True when ``tree`` holds at least ``threshold`` nodes listed in ``_PYTHON_NODE_TYPES``.
+
+    Simple expressions (tuples, names, constants) can appear in CSV, config files,
+    and other non-Python text that happens to parse. Real Python code will contain
+    imports, function/class definitions, control flow, etc., so only those are counted.
+    """
+    count = 0
+    for node in ast.walk(tree):
+        if isinstance(node, _PYTHON_NODE_TYPES):
+            count += 1
+            if count >= threshold:
+                return True  # enough evidence; stop walking the rest of the tree
+    return False
 
 
 def main(file_path: os.PathLike | str, _, __) -> Match | None:
@@ -48,11 +48,10 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
         with open(file_path, "r", encoding="utf-8") as file:
             content = file.read()
 
-        # Parse to ensure it's valid Python syntax
-        ast.parse(content)
+        tree = ast.parse(content)
 
         if not str(file_path).endswith(".py"):
-            if not is_substantial_python_code(content):
+            if not _has_python_constructs(tree):
                 return None
 
     except (SyntaxError, UnicodeDecodeError, PermissionError, OSError):
@@ -64,43 +63,3 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None:
         mime_type="text/x-python",
         confidence=1.0,
     )
-
-
-def is_substantial_python_code(content: str) -> bool:
-    """
-    Check if the content contains substantial Python code indicators.
-    Returns True if the content appears to be meaningful Python code.
-    """
-    # Remove comments and strings to focus on actual code
-    content_lines = content.splitlines()
-    code_lines = []
-
-    for line in content_lines:
-        # Remove comments (basic approach - doesn't handle strings containing #)
-        line = line.split("#")[0].strip()
-        if line:  # Non-empty after removing comments
-            code_lines.append(line)
-
-    # If too few substantial lines, it's probably not real code
-    if len(code_lines) < 2:
-        return False
-
-    code_text = " ".join(code_lines)
-
-    # Check for Python keywords that indicate actual code
-
-    # Count how many keywords are present
-    keyword_count = 0
-    for keyword in python_common_keywords:
-        if keyword.search(code_text):
-            keyword_count += 1
-
-    # Require at least 2 keywords for substantial code
-    if keyword_count < 2:
-        return False
-
-    # Check for common Python patterns
-    for pattern in python_patterns:
-        if pattern.search(code_text):
-            return True
-    return False