cdgriffith · cdgriffith · Mar 13, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
diff --git a/.claude/skills/changelog.md b/.claude/skills/changelog.md
@@ -0,0 +1,53 @@
+---
+name: changelog
+description: Update the CHANGELOG.md changelog file with new entries
+user_invocable: true
+---
+
+# Changelog Skill
+
+When updating the `CHANGELOG.md` file, follow these rules:
+
+## Entry Format
+
+Each entry is a single bullet point starting with `- `:
+
+```
+- {Verb} {description}
+```
+
+## Verbs and Ordering
+
+Entries MUST use one of these four starting verbs, and MUST appear in this order within each version section:
+
+1. **Adding** — new features
+2. **Changing** — modifications to existing behavior
+3. **Fixing** — bug fixes
+4. **Removing** — removed features or deprecated items
+
+## GitHub Issue Entries
+
+- Entries that reference a GitHub issue include the issue number after the verb: `- Fixing #725 description...`
+- Within each verb group, entries WITH issue numbers come FIRST, sorted by issue number ascending (smallest to largest)
+- Entries WITHOUT issue numbers follow after
+
+## Thanks Attribution
+
+- When an entry references a GitHub issue, thank the issue author by their **GitHub display name** (not username)
+- Look up the display name via `gh api users/{username} --jq '.name // .login'`
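+- Format: `(thanks to {display name})`, or `(thanks to {display name} - {username})` when the username is also shown, as in the example below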
+- If multiple people contributed (e.g., reporter and commenter with the fix), thank all of them
+- The thanks attribution goes at the end of the entry
+
+## Example
+
+```
+Version 1.27
+------------
+
+- Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl)
+- Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter)
+- Adding new verbose output to command line with `-v` or `--verbose`
+- Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd)
+- Removing expected invalid WinZip signature
+```
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -52,6 +52,17 @@ repos:
       types: [python]
       pass_filenames: false
 
+-   repo: https://github.com/codespell-project/codespell
+    rev: v2.4.1
+    hooks:
+      - id: codespell
+        args: [--ignore-words-list, "addin,caf,tye,wors"]
+        exclude: |
+            (?x)^(
+                test/resources/|
+                .*\.json$
+            )
+
 -   repo: https://github.com/tox-dev/pyproject-fmt
     rev: v2.16.2
     hooks:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,15 @@
 Changelog
 =========
 
+Version 2.1.0
+-------------
+
+- Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension
+- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd)
+- Adding #139 codespell check (thanks to Christian Clauss)
+- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup
+- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd)
+
 Version 2.0.2
 -------------
 

diff --git a/puremagic/magic_data.json b/puremagic/magic_data.json
@@ -100,7 +100,7 @@
       ["53434448", 8, ".sc2", "", "SimCity 2000 Map File"]
     ],
     "52494646": [
-      ["57415645", 8, ".wav", "audio/wave", "Waveform Audio File Format"],
+      ["57415645", 8, ".wav", "audio/wav", "Waveform Audio File Format"],
       ["41564920", 8, ".avi", "video/avi", "Audio Video Interleave"],
       ["57454250", 8, ".webp", "image/webp", "WebP graphics file format"],
       ["41434f4e", 8, ".ani", "", "Animated cursor"],

diff --git a/puremagic/main.py b/puremagic/main.py
@@ -31,17 +31,20 @@
         sndhdr_scanner,
         mpeg_audio_scanner,
         hdf5_scanner,
+        cfbf_scanner,
     )
 
 __author__ = "Chris Griffith"
-__version__ = "2.0.2"
+__version__ = "2.1.0"
 __all__ = [
     "magic_file",
     "magic_string",
     "magic_stream",
+    "magic_extension",
     "from_file",
     "from_string",
     "from_stream",
+    "from_extension",
     "ext_from_filename",
     "PureError",
     "PureMagic",
@@ -390,6 +393,51 @@ def magic_stream(
     return info
 
 
+def from_extension(extension: str, mime: bool = True) -> str:
+    """Look up a file type by its extension and return the MIME type or name.
+
+    :param extension: file extension with or without leading dot (e.g. ".pdf" or "pdf")
+    :param mime: Return mime type (default True), or human-readable name if False
+    :return: MIME type string or name
+    :raises PureError: if no match is found
+    """
+    ext = extension.strip().lower()
+    if not ext.startswith("."):
+        ext = f".{ext}"
+
+    matches = []
+    for entry in chain(magic_header_array, magic_footer_array, extension_only_array):
+        if entry.extension == ext:
+            matches.append(entry)
+
+    if not matches:
+        raise PureError(f"Could not find extension {ext!r} in magic database")
+
+    # Prefer entries with longer byte_match (more specific signatures)
+    matches.sort(key=lambda x: len(x.byte_match), reverse=True)
+    best = matches[0]
+    return best.mime_type if mime else best.name
+
+
+def magic_extension(extension: str) -> list[PureMagicWithConfidence]:
+    """Return all matches for a given file extension.
+
+    :param extension: file extension with or without leading dot (e.g. ".pdf" or "pdf")
+    :return: list of PureMagicWithConfidence sorted by confidence descending
+    """
+    ext = extension.strip().lower()
+    if not ext.startswith("."):
+        ext = f".{ext}"
+
+    matches = []
+    for entry in chain(magic_header_array, magic_footer_array, extension_only_array):
+        if entry.extension == ext:
+            con = 0.8 if len(entry.byte_match) >= 9 else float(f"0.{len(entry.byte_match)}")
+            matches.append(PureMagicWithConfidence(confidence=con, **entry._asdict()))
+
+    return sorted(matches, key=lambda x: (x.confidence, len(x.byte_match)), reverse=True)
+
+
 def single_deep_scan(
     bytes_match: bytes | bytearray | None,
     filename: os.PathLike | str,
@@ -417,6 +465,8 @@ def single_deep_scan(
             result = mpeg_audio_scanner.main(filename, head, foot)
             if result and result.confidence > confidence:
                 return result
+        case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short:
+            return cfbf_scanner.main(filename, head, foot)
 
     if eml_result := text_scanner.eml_check(head):
         return eml_result
@@ -544,10 +594,43 @@ def command_line_entry(*args):
         help="Return the mime type instead of file type",
     )
     parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print verbose output")
-    parser.add_argument("files", nargs="+", type=Path)
+    parser.add_argument(
+        "-e",
+        "--extension",
+        dest="extension",
+        help="Look up MIME type for a file extension (e.g. pdf or .pdf)",
+    )
+    parser.add_argument("files", nargs="*", type=Path)
     parser.add_argument("--version", action="version", version=puremagic.__version__)
     args = parser.parse_args(args if args else sys.argv[1:])
 
+    if args.extension:
+        if args.verbose:
+            matches = magic_extension(args.extension)
+            if not matches:
+                print(f"No matches found for extension '{args.extension}'")
+            else:
+                print(f"Total Possible Matches: {len(matches)}")
+                for i, result in enumerate(matches):
+                    if i == 0:
+                        print("\n\tBest Match")
+                    else:
+                        print(f"\tAlternative Match #{i}")
+                    print(f"\tName: {result.name}")
+                    print(f"\tConfidence: {int(result.confidence * 100)}%")
+                    print(f"\tExtension: {result.extension}")
+                    print(f"\tMime Type: {result.mime_type}")
+                    print(f"\tByte Match: {result.byte_match}\n")
+        else:
+            try:
+                print(from_extension(args.extension, mime=not args.mime))
+            except PureError as e:
+                print(str(e))
+        return
+
+    if not args.files:
+        parser.error("the following arguments are required: files (or use -e/--extension)")
+
     for fn in args.files:
         if not fn.exists():
             print(f"File '{fn}' does not exist!")

diff --git a/puremagic/scanners/cfbf_scanner.py b/puremagic/scanners/cfbf_scanner.py
@@ -0,0 +1,97 @@
+import os
+import struct
+
+from puremagic.scanners.helpers import Match
+
+match_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
+match_bytes_short = b"\xd0\xcf\x11\xe0"
+
+# Stream names that identify specific CFBF-based formats. Names are matched exactly and
+# checked in priority order, after the startswith patterns in _PREFIX_MATCHES.
+# Each entry: (stream_name, extension, name, mime_type)
+_STREAM_MATCHES = [
+    ("__nameid_version1.0", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
+    ("PowerPoint Document", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
+    ("Current User", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"),
+    ("Workbook", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
+    ("Book", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"),
+    ("WordDocument", ".doc", "Word Document", "application/msword"),
+    ("VisioDocument", ".vsd", "Visio Drawing", "application/x-visio"),
+    ("Quill", ".pub", "Publisher Document", "application/x-mspublisher"),
+]
+
+_PREFIX_MATCHES = [
+    ("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"),
+]
+
+
+def _extract_stream_names(dir_data: bytes) -> set[str]:
+    """Parse CFBF directory entries and return the set of stream/storage names."""
+    names: set[str] = set()
+    for i in range(0, len(dir_data), 128):
+        entry = dir_data[i : i + 128]
+        if len(entry) < 128:
+            break
+        name_size = struct.unpack_from("<H", entry, 64)[0]
+        if name_size < 2 or name_size > 64:
+            continue
+        obj_type = entry[66]
+        # obj_type: 0=unknown, 1=storage, 2=stream, 5=root
+        if obj_type not in (1, 2, 5):
+            continue
+        name = entry[: name_size - 2].decode("utf-16-le", errors="ignore")
+        if name:
+            names.add(name)
+    return names
+
+
+def _identify_format(stream_names: set[str]) -> Match | None:
+    """Match stream names against known CFBF format signatures."""
+    # Check prefix matches first (e.g. __substg1.0_ for MSG)
+    for name in stream_names:
+        for prefix, ext, fmt_name, mime in _PREFIX_MATCHES:
+            if name.startswith(prefix):
+                return Match(ext, fmt_name, mime)
+
+    # Check exact stream name matches in priority order
+    for stream_name, ext, fmt_name, mime in _STREAM_MATCHES:
+        if stream_name in stream_names:
+            return Match(ext, fmt_name, mime)
+
+    return None
+
+
+def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None:
+    if len(head) < 76:
+        return None
+
+    # Verify magic bytes
+    if head[:8] != match_bytes:
+        if head[:4] != match_bytes_short:
+            return None
+
+    # Parse CFBF header
+    sector_shift = struct.unpack_from("<H", head, 30)[0]
+    if sector_shift not in (9, 12):
+        return None
+    sector_size = 1 << sector_shift
+
+    first_dir_secid = struct.unpack_from("<i", head, 48)[0]
+    if first_dir_secid < 0:
+        return None
+
+    # Directory sector offset: header occupies first sector_size bytes
+    dir_offset = (first_dir_secid + 1) * sector_size
+
+    try:
+        with open(file_path, "rb") as f:
+            f.seek(dir_offset)
+            dir_data = f.read(sector_size)
+    except (OSError, ValueError):
+        return None
+
+    if not dir_data:
+        return None
+
+    stream_names = _extract_stream_names(dir_data)
+    return _identify_format(stream_names)
diff --git a/puremagic/scanners/mpeg_audio_scanner.py b/puremagic/scanners/mpeg_audio_scanner.py
@@ -228,7 +228,7 @@ def _tag_plus(self) -> None:
         assert self.foot_string is not None
         tag_size = 128
         tag_plus_size = 227
-        speed_loc = 184  # Speed byte posistion in tag
+        speed_loc = 184  # Speed byte position in tag
         combined_size = tag_plus_size + tag_size
 
         if self.foot_size < combined_size:  # TAG+ + ID3v1
@@ -466,7 +466,7 @@ def _ape(self, id3v1: bool | None) -> None:
             b"MCN",
         )
         id3v1_size = 128
-        max_tag_size = 1048576  # This is a pratical scan range of 1MB, Ape v2 in theory can be 4GB
+        max_tag_size = 1048576  # This is a practical scan range of 1MB, Ape v2 in theory can be 4GB
         combined_size = (max_tag_size + id3v1_size) if id3v1 else max_tag_size
 
         if self.foot_size < combined_size:  # APE OR APE + ID3v1
@@ -493,7 +493,7 @@ def _ape(self, id3v1: bool | None) -> None:
             f_size = struct.unpack("<I", self.foot_string[end_tag_start + 12 : end_tag_start + 16])[0]
 
             if f_version == 1000:  # v1
-                # Reach first key in tag, in APE the tag key name is preceeded by 8 bytes associated with it.
+                # Reach first key in tag, in APE the tag key name is preceded by 8 bytes associated with it.
                 first_key = combined_size - ((end_size + f_size) - apextag_size) + 8
                 # APE does not care about case for tag keys, but Title and UPPER are commonly accepted as standard
                 if not self.foot_string[first_key:].title().startswith(common_ape_keys):

diff --git a/pyproject.toml b/pyproject.toml
@@ -30,6 +30,7 @@ scripts.puremagic = "puremagic.main:command_line_entry"
 
 [dependency-groups]
 dev = [
+  "codespell>=2.4.2",
   "coverage>=7.13.4",
   "poethepoet>=0.42.1",
   "pre-commit>=4.5.1",
@@ -108,3 +109,4 @@ tasks.test = "pytest --cov=puremagic test/"
 tasks.lint = "ruff check --fix"
 tasks.format = "ruff format"
 tasks.typecheck = "ty check"
+tasks.spellcheck = "codespell --ignore-words-list=addin,caf,tye,wors --skip='*.json' puremagic/"
diff --git a/test/resources/office/test.msg b/test/resources/office/test.msg