Skip to content

Commit 06c248d

Browse files
committed
- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd)
- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd)
1 parent e818fed commit 06c248d

7 files changed

Lines changed: 188 additions & 2 deletions

File tree

.claude/skills/changelog.md

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
---
2+
name: changelog
3+
description: Update the CHANGELOG.md changelog file with new entries
4+
user_invocable: true
5+
---
6+
7+
# Changelog Skill
8+
9+
When updating the `CHANGELOG.md` file, follow these rules:
10+
11+
## Entry Format
12+
13+
Each entry is a single bullet point starting with `- `:
14+
15+
```
16+
- {Verb} {description}
17+
```
18+
19+
## Verbs and Ordering
20+
21+
Entries MUST use one of these four starting verbs, and MUST appear in this order within each version section:
22+
23+
1. **Adding** — new features
24+
2. **Changing** — modifications to existing behavior
25+
3. **Fixing** — bug fixes
26+
4. **Removing** — removed features or deprecated items
27+
28+
## GitHub Issue Entries
29+
30+
- Entries that reference a GitHub issue include the issue number after the verb: `- Fixing #725 description...`
31+
- Within each verb group, entries WITH issue numbers come FIRST, sorted by issue number ascending (smallest to largest)
32+
- Entries WITHOUT issue numbers follow after
33+
34+
## Thanks Attribution
35+
36+
- When an entry references a GitHub issue, thank the issue author by their **GitHub display name** (not username)
37+
- Look up the display name via `gh api users/{username} --jq '.name // .login'`
38+
- Format: `(thanks to {display name})`
39+
- If multiple people contributed (e.g., reporter and commenter with the fix), thank all of them
40+
- The thanks attribution goes at the end of the entry
41+
42+
## Example
43+
44+
```
45+
Version 1.27
46+
------------
47+
48+
- Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl)
49+
- Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter)
50+
- Adding new verbose output to command line with `-v` or `--verbose`
51+
- Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd)
52+
- Removing expected invalid WinZip signature
53+
```

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,10 @@ Version 2.1.0
55
-------------
66

77
- Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension
8-
- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup
8+
- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd)
99
- Adding #139 codespell check (thanks to Christian Clauss)
10+
- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup
11+
- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd)
1012

1113
Version 2.0.2
1214
-------------

puremagic/magic_data.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@
100100
["53434448", 8, ".sc2", "", "SimCity 2000 Map File"]
101101
],
102102
"52494646": [
103-
["57415645", 8, ".wav", "audio/wave", "Waveform Audio File Format"],
103+
["57415645", 8, ".wav", "audio/wav", "Waveform Audio File Format"],
104104
["41564920", 8, ".avi", "video/avi", "Audio Video Interleave"],
105105
["57454250", 8, ".webp", "image/webp", "WebP graphics file format"],
106106
["41434f4e", 8, ".ani", "", "Animated cursor"],

puremagic/main.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
sndhdr_scanner,
3232
mpeg_audio_scanner,
3333
hdf5_scanner,
34+
cfbf_scanner,
3435
)
3536

3637
__author__ = "Chris Griffith"
@@ -464,6 +465,8 @@ def single_deep_scan(
464465
result = mpeg_audio_scanner.main(filename, head, foot)
465466
if result and result.confidence > confidence:
466467
return result
468+
case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short:
469+
return cfbf_scanner.main(filename, head, foot)
467470

468471
if eml_result := text_scanner.eml_check(head):
469472
return eml_result

puremagic/scanners/cfbf_scanner.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import os
2+
import struct
3+
4+
from puremagic.scanners.helpers import Match
5+
6+
# Signature checked at the start of a file: the full 8-byte CFBF/OLE2 magic,
# and its leading 4 bytes, which the scanner also accepts on their own.
match_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
match_bytes_short = b"\xd0\xcf\x11\xe0"

# Directory-entry names that identify specific CFBF-based formats.
# Each entry: (stream_name, extension, name, mime_type)
# These are compared for exact equality and checked in list order, so earlier
# entries take priority when a file contains more than one of them.
_STREAM_MATCHES = [
    ("__nameid_version1.0", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
    ("PowerPoint Document", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
    ("Current User", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
    ("Workbook", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
    ("Book", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
    ("WordDocument", ".doc", "Word Document", "application/msword"),
    ("VisioDocument", ".vsd", "Visio Drawing", "application/x-visio"),
    ("Quill", ".pub", "Publisher Document", "application/x-mspublisher"),
]

# Same tuple layout as _STREAM_MATCHES, but the first field is a name *prefix*
# (matched with str.startswith) rather than a complete name.
_PREFIX_MATCHES = [
    ("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
]
26+
27+
28+
def _extract_stream_names(dir_data: bytes) -> set[str]:
    """Parse CFBF directory entries and return the set of stream/storage names.

    ``dir_data`` is walked in fixed 128-byte entries; a trailing partial entry
    is ignored. For each entry the 16-bit little-endian name length is read at
    offset 64 and the object-type byte at offset 66. Entries are skipped when
    the length is outside 2..64 or the type is not storage, stream or root.

    Returns the decoded, non-empty names; an empty set if nothing qualifies.
    """
    names: set[str] = set()
    # Stop before any trailing fragment shorter than one full entry.
    for start in range(0, len(dir_data) - 127, 128):
        name_size, obj_type = struct.unpack_from("<HB", dir_data, start + 64)
        if not 2 <= name_size <= 64:
            continue
        # obj_type: 0=unknown, 1=storage, 2=stream, 5=root
        if obj_type not in (1, 2, 5):
            continue
        # The final two counted bytes are dropped before decoding
        # (presumably the UTF-16 NUL terminator).
        raw_name = dir_data[start : start + name_size - 2]
        name = raw_name.decode("utf-16-le", errors="ignore")
        if name:
            names.add(name)
    return names
46+
47+
48+
def _identify_format(stream_names: set[str]) -> Match | None:
    """Match stream names against known CFBF format signatures.

    Prefix signatures from ``_PREFIX_MATCHES`` are tried first against every
    name; if none hits, the exact names in ``_STREAM_MATCHES`` are tried in
    list order. Returns a ``Match`` built from the first hit's extension,
    format name and MIME type, or ``None`` when nothing matches.
    """
    # Check prefix matches first (e.g. __substg1.0_ for MSG)
    for name in stream_names:
        for prefix, ext, fmt_name, mime in _PREFIX_MATCHES:
            if name.startswith(prefix):
                return Match(ext, fmt_name, mime)

    # Check exact stream name matches in priority order
    for stream_name, ext, fmt_name, mime in _STREAM_MATCHES:
        if stream_name in stream_names:
            return Match(ext, fmt_name, mime)

    return None
62+
63+
64+
def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
    """Identify the specific format of a CFBF/OLE2 file from its directory.

    Args:
        file_path: Path of the file to open for reading the directory sector.
        head: Leading bytes of the file; must hold at least 76 bytes.
        foot: Trailing bytes of the file; not used by this scanner.

    Returns:
        The ``Match`` produced by ``_identify_format`` for the names found in
        the first directory sector, or ``None`` when the header is too short,
        the signature or header fields are not acceptable, the file cannot be
        read, or no known stream name is present.
    """
    if len(head) < 76:
        return None

    # Accept either the full 8-byte signature or just its 4-byte prefix.
    if head[:8] != match_bytes and head[:4] != match_bytes_short:
        return None

    # Sector shift lives at offset 30; only 512-byte (9) and 4096-byte (12)
    # sectors are accepted.
    sector_shift = struct.unpack_from("<H", head, 30)[0]
    if sector_shift not in (9, 12):
        return None
    sector_size = 1 << sector_shift

    # Signed sector id of the first directory sector, at offset 48.
    first_dir_secid = struct.unpack_from("<i", head, 48)[0]
    if first_dir_secid < 0:
        return None

    # Directory sector offset: header occupies first sector_size bytes
    dir_offset = (first_dir_secid + 1) * sector_size

    try:
        with open(file_path, "rb") as f:
            f.seek(dir_offset)
            # Only this one sector of the directory is inspected.
            dir_data = f.read(sector_size)
    except (OSError, ValueError):
        return None

    if not dir_data:
        return None

    return _identify_format(_extract_stream_names(dir_data))

test/resources/office/test.msg

1.5 KB
Binary file not shown.

test/test_common_extensions.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,3 +261,34 @@ def test_cmd_extension_option():
261261
def test_fake_file():
    """The fake_file resource yields a top result with confidence 0.5."""
    results = puremagic.magic_file(filename=Path(LOCAL_DIR, "resources", "fake_file"))
    assert results[0].confidence == 0.5, results
264+
265+
266+
def test_riff_wav_mime():
    """RIFF scanner returns audio/wav (not audio/wave) for WAV files"""
    mime = puremagic.from_file(os.path.join(AUDIO_DIR, "test.wav"), mime=True)
    assert mime == "audio/wav"
270+
271+
272+
273+
def test_cfbf_doc():
274+
"""CFBF scanner correctly identifies Word .doc"""
275+
ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"))
276+
assert ext == ".doc"
277+
mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"), mime=True)
278+
assert mime == "application/msword"
279+
280+
281+
def test_cfbf_ppt():
282+
"""CFBF scanner correctly identifies PowerPoint .ppt"""
283+
ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt"))
284+
assert ext == ".ppt"
285+
mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt"), mime=True)
286+
assert mime == "application/vnd.ms-powerpoint"
287+
288+
289+
def test_cfbf_msg():
290+
"""CFBF scanner correctly identifies Outlook .msg"""
291+
ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg"))
292+
assert ext == ".msg"
293+
mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg"), mime=True)
294+
assert mime == "application/vnd.ms-outlook"

0 commit comments

Comments
 (0)