- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd)

cdgriffith · cdgriffith · commit 06c248d8fa2e · 2026-03-11T22:24:59.000-05:00
- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd)
diff --git a/.claude/skills/changelog.md b/.claude/skills/changelog.md
@@ -0,0 +1,53 @@
+---
+name: changelog
+description: Update the CHANGELOG.md changelog file with new entries
+user_invocable: true
+---
+
+# Changelog Skill
+
+When updating the `CHANGELOG.md` file, follow these rules:
+
+## Entry Format
+
+Each entry is a single bullet point starting with `- `:
+
+```
+- {Verb} {description}
+```
+
+## Verbs and Ordering
+
+Entries MUST use one of these four starting verbs, and MUST appear in this order within each version section:
+
+1. **Adding** — new features
+2. **Changing** — modifications to existing behavior
+3. **Fixing** — bug fixes
+4. **Removing** — removed features or deprecated items
+
+## GitHub Issue Entries
+
+- Entries that reference a GitHub issue include the issue number after the verb: `- Fixing #725 description...`
+- Within each verb group, entries WITH issue numbers come FIRST, sorted by issue number ascending (smallest to largest)
+- Entries WITHOUT issue numbers follow after
+
+## Thanks Attribution
+
+- When an entry references a GitHub issue, thank the issue author by their **GitHub display name** (not username)
+- Look up the display name via `gh api users/{username} --jq '.name // .login'`
+- Format: `(thanks to {display name})`
+- If multiple people contributed (e.g., reporter and commenter with the fix), thank all of them
+- The thanks attribution goes at the end of the entry
+
+## Example
+
+```
+Version 1.27
+------------
+
+- Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl)
+- Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter)
+- Adding new verbose output to command line with `-v` or `--verbose`
+- Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd)
+- Removing expected invalid WinZip signature
+```
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,8 +5,10 @@ Version 2.1.0
 -------------
 
 - Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension
-- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup
+- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd)
 - Adding #139 codespell check (thanks to Christian Clauss)
+- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup
+- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd)
 
 Version 2.0.2
 -------------
diff --git a/puremagic/magic_data.json b/puremagic/magic_data.json
@@ -100,7 +100,7 @@
       ["53434448", 8, ".sc2", "", "SimCity 2000 Map File"]
     ],
     "52494646": [
-      ["57415645", 8, ".wav", "audio/wave", "Waveform Audio File Format"],
+      ["57415645", 8, ".wav", "audio/wav", "Waveform Audio File Format"],
       ["41564920", 8, ".avi", "video/avi", "Audio Video Interleave"],
       ["57454250", 8, ".webp", "image/webp", "WebP graphics file format"],
       ["41434f4e", 8, ".ani", "", "Animated cursor"],
diff --git a/puremagic/main.py b/puremagic/main.py
@@ -31,6 +31,7 @@
         sndhdr_scanner,
         mpeg_audio_scanner,
         hdf5_scanner,
+        cfbf_scanner,
     )
 
 __author__ = "Chris Griffith"
@@ -464,6 +465,8 @@ def single_deep_scan(
             result = mpeg_audio_scanner.main(filename, head, foot)
             if result and result.confidence > confidence:
                 return result
+        case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short:
+            return cfbf_scanner.main(filename, head, foot)
 
     if eml_result := text_scanner.eml_check(head):
         return eml_result
diff --git a/puremagic/scanners/cfbf_scanner.py b/puremagic/scanners/cfbf_scanner.py
@@ -0,0 +1,97 @@
+import os
+import struct
+
+from puremagic.scanners.helpers import Match
+
+match_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+match_bytes_short = b"\xd0\xcf\x11\xe0"
+
+# Stream names that identify specific CFBF-based formats, checked in priority order.
+# Each entry: (stream_name, extension, name, mime_type)
+# Entries in this table are matched by exact name; startswith prefix matching uses _PREFIX_MATCHES below.
+_STREAM_MATCHES = [
+    ("__nameid_version1.0", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
+    ("PowerPoint Document", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
+    ("Current User", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
+    ("Workbook", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
+    ("Book", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
+    ("WordDocument", ".doc", "Word Document", "application/msword"),
+    ("VisioDocument", ".vsd", "Visio Drawing", "application/x-visio"),
+    ("Quill", ".pub", "Publisher Document", "application/x-mspublisher"),
+]
+
+_PREFIX_MATCHES = [
+    ("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
+]
+
+
+def _extract_stream_names(dir_data: bytes) -> set[str]:
+    """Parse CFBF directory entries and return the set of stream/storage names."""
+    names: set[str] = set()
+    for i in range(0, len(dir_data), 128):
+        entry = dir_data[i : i + 128]
+        if len(entry) < 128:
+            break
+        name_size = struct.unpack_from("<H", entry, 64)[0]
+        if name_size < 2 or name_size > 64:
+            continue
+        obj_type = entry[66]
+        # obj_type: 0=unknown, 1=storage, 2=stream, 5=root
+        if obj_type not in (1, 2, 5):
+            continue
+        name = entry[: name_size - 2].decode("utf-16-le", errors="ignore")
+        if name:
+            names.add(name)
+    return names
+
+
+def _identify_format(stream_names: set[str]) -> Match | None:
+    """Match stream names against known CFBF format signatures."""
+    # Check prefix matches first (e.g. __substg1.0_ for MSG)
+    for name in stream_names:
+        for prefix, ext, fmt_name, mime in _PREFIX_MATCHES:
+            if name.startswith(prefix):
+                return Match(ext, fmt_name, mime)
+
+    # Check exact stream name matches in priority order
+    for stream_name, ext, fmt_name, mime in _STREAM_MATCHES:
+        if stream_name in stream_names:
+            return Match(ext, fmt_name, mime)
+
+    return None
+
+
+def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
+    if len(head) < 76:
+        return None
+
+    # Verify magic bytes
+    if head[:8] != match_bytes:
+        if head[:4] != match_bytes_short:
+            return None
+
+    # Parse CFBF header
+    sector_shift = struct.unpack_from("<H", head, 30)[0]
+    if sector_shift not in (9, 12):
+        return None
+    sector_size = 1 << sector_shift
+
+    first_dir_secid = struct.unpack_from("<i", head, 48)[0]
+    if first_dir_secid < 0:
+        return None
+
+    # Directory sector offset: header occupies first sector_size bytes
+    dir_offset = (first_dir_secid + 1) * sector_size
+
+    try:
+        with open(file_path, "rb") as f:
+            f.seek(dir_offset)
+            dir_data = f.read(sector_size)
+    except (OSError, ValueError):
+        return None
+
+    if not dir_data:
+        return None
+
+    stream_names = _extract_stream_names(dir_data)
+    return _identify_format(stream_names)
diff --git a/test/resources/office/test.msg b/test/resources/office/test.msg
diff --git a/test/test_common_extensions.py b/test/test_common_extensions.py
@@ -261,3 +261,34 @@ def test_cmd_extension_option():
 def test_fake_file():
     results = puremagic.magic_file(filename=Path(LOCAL_DIR, "resources", "fake_file"))
     assert results[0].confidence == 0.5, results
+
+
+def test_riff_wav_mime():
+    """RIFF scanner returns audio/wav (not audio/wave) for WAV files"""
+    mime = puremagic.from_file(os.path.join(AUDIO_DIR, "test.wav"), mime=True)
+    assert mime == "audio/wav"
+
+
+
+def test_cfbf_doc():
+    """CFBF scanner correctly identifies Word .doc"""
+    ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"))
+    assert ext == ".doc"
+    mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"), mime=True)
+    assert mime == "application/msword"
+
+
+def test_cfbf_ppt():
+    """CFBF scanner correctly identifies PowerPoint .ppt"""
+    ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt"))
+    assert ext == ".ppt"
+    mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt"), mime=True)
+    assert mime == "application/vnd.ms-powerpoint"
+
+
+def test_cfbf_msg():
+    """CFBF scanner correctly identifies Outlook .msg"""
+    ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg"))
+    assert ext == ".msg"
+    mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg"), mime=True)
+    assert mime == "application/vnd.ms-outlook"