From 19c3586c946a29d0c8859d6d4e630873c582e8c9 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Thu, 12 Mar 2026 01:24:34 +0100 Subject: [PATCH 1/3] Fix typos discovered by codespell (#139) --- puremagic/scanners/mpeg_audio_scanner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/puremagic/scanners/mpeg_audio_scanner.py b/puremagic/scanners/mpeg_audio_scanner.py index f1e6b17..8b0cd9e 100644 --- a/puremagic/scanners/mpeg_audio_scanner.py +++ b/puremagic/scanners/mpeg_audio_scanner.py @@ -228,7 +228,7 @@ def _tag_plus(self) -> None: assert self.foot_string is not None tag_size = 128 tag_plus_size = 227 - speed_loc = 184 # Speed byte posistion in tag + speed_loc = 184 # Speed byte position in tag combined_size = tag_plus_size + tag_size if self.foot_size < combined_size: # TAG+ + ID3v1 @@ -466,7 +466,7 @@ def _ape(self, id3v1: bool | None) -> None: b"MCN", ) id3v1_size = 128 - max_tag_size = 1048576 # This is a pratical scan range of 1MB, Ape v2 in theory can be 4GB + max_tag_size = 1048576 # This is a practical scan range of 1MB, Ape v2 in theory can be 4GB combined_size = (max_tag_size + id3v1_size) if id3v1 else max_tag_size if self.foot_size < combined_size: # APE OR APE + ID3v1 @@ -493,7 +493,7 @@ def _ape(self, id3v1: bool | None) -> None: f_size = struct.unpack(" Date: Wed, 11 Mar 2026 19:34:13 -0500 Subject: [PATCH 2/3] Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension --- .pre-commit-config.yaml | 11 +++++ CHANGELOG.md | 7 +++ puremagic/main.py | 84 +++++++++++++++++++++++++++++++++- pyproject.toml | 2 + test/test_common_extensions.py | 38 +++++++++++++++ uv.lock | 17 +++++-- 6 files changed, 154 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a1dd6c9..839066d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -52,6 +52,17 @@ repos: types: [python] pass_filenames: false +- repo: 
https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + args: [--ignore-words-list, "addin,caf,tye,wors"] + exclude: | + (?x)^( + test/resources/| + .*\.json$ + ) + - repo: https://github.com/tox-dev/pyproject-fmt rev: v2.16.2 hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index cc794ac..4bd2145 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,13 @@ Changelog ========= +Version 2.1.0 +------------- + +- Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension +- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup +- Adding #139 codespell check (thanks to Christian Clauss) + Version 2.0.2 ------------- diff --git a/puremagic/main.py b/puremagic/main.py index 2ca1e7c..a0e728a 100644 --- a/puremagic/main.py +++ b/puremagic/main.py @@ -34,14 +34,16 @@ ) __author__ = "Chris Griffith" -__version__ = "2.0.2" +__version__ = "2.1.0" __all__ = [ "magic_file", "magic_string", "magic_stream", + "magic_extension", "from_file", "from_string", "from_stream", + "from_extension", "ext_from_filename", "PureError", "PureMagic", @@ -390,6 +392,51 @@ def magic_stream( return info +def from_extension(extension: str, mime: bool = True) -> str: + """Look up a file type by its extension and return the MIME type or name. + + :param extension: file extension with or without leading dot (e.g. 
".pdf" or "pdf") + :param mime: Return mime type (default True), or human-readable name if False + :return: MIME type string or name + :raises PureError: if no match is found + """ + ext = extension.strip().lower() + if not ext.startswith("."): + ext = f".{ext}" + + matches = [] + for entry in chain(magic_header_array, magic_footer_array, extension_only_array): + if entry.extension == ext: + matches.append(entry) + + if not matches: + raise PureError(f"Could not find extension {ext!r} in magic database") + + # Prefer entries with longer byte_match (more specific signatures) + matches.sort(key=lambda x: len(x.byte_match), reverse=True) + best = matches[0] + return best.mime_type if mime else best.name + + +def magic_extension(extension: str) -> list[PureMagicWithConfidence]: + """Return all matches for a given file extension. + + :param extension: file extension with or without leading dot (e.g. ".pdf" or "pdf") + :return: list of PureMagicWithConfidence sorted by confidence descending + """ + ext = extension.strip().lower() + if not ext.startswith("."): + ext = f".{ext}" + + matches = [] + for entry in chain(magic_header_array, magic_footer_array, extension_only_array): + if entry.extension == ext: + con = 0.8 if len(entry.byte_match) >= 9 else float(f"0.{len(entry.byte_match)}") + matches.append(PureMagicWithConfidence(confidence=con, **entry._asdict())) + + return sorted(matches, key=lambda x: (x.confidence, len(x.byte_match)), reverse=True) + + def single_deep_scan( bytes_match: bytes | bytearray | None, filename: os.PathLike | str, @@ -544,10 +591,43 @@ def command_line_entry(*args): help="Return the mime type instead of file type", ) parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print verbose output") - parser.add_argument("files", nargs="+", type=Path) + parser.add_argument( + "-e", + "--extension", + dest="extension", + help="Look up MIME type for a file extension (e.g. 
pdf or .pdf)", + ) + parser.add_argument("files", nargs="*", type=Path) parser.add_argument("--version", action="version", version=puremagic.__version__) args = parser.parse_args(args if args else sys.argv[1:]) + if args.extension: + if args.verbose: + matches = magic_extension(args.extension) + if not matches: + print(f"No matches found for extension '{args.extension}'") + else: + print(f"Total Possible Matches: {len(matches)}") + for i, result in enumerate(matches): + if i == 0: + print("\n\tBest Match") + else: + print(f"\tAlternative Match #{i}") + print(f"\tName: {result.name}") + print(f"\tConfidence: {int(result.confidence * 100)}%") + print(f"\tExtension: {result.extension}") + print(f"\tMime Type: {result.mime_type}") + print(f"\tByte Match: {result.byte_match}\n") + else: + try: + print(from_extension(args.extension, mime=not args.mime)) + except PureError as e: + print(str(e)) + return + + if not args.files: + parser.error("the following arguments are required: files (or use -e/--extension)") + for fn in args.files: if not fn.exists(): print(f"File '{fn}' does not exist!") diff --git a/pyproject.toml b/pyproject.toml index eea3da0..3fbdfd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ scripts.puremagic = "puremagic.main:command_line_entry" [dependency-groups] dev = [ + "codespell>=2.4.2", "coverage>=7.13.4", "poethepoet>=0.42.1", "pre-commit>=4.5.1", @@ -108,3 +109,4 @@ tasks.test = "pytest --cov=puremagic test/" tasks.lint = "ruff check --fix" tasks.format = "ruff format" tasks.typecheck = "ty check" +tasks.spellcheck = "codespell --ignore-words-list=addin,caf,tye,wors --skip='*.json' puremagic/" diff --git a/test/test_common_extensions.py b/test/test_common_extensions.py index dbee423..c5d5ab9 100644 --- a/test/test_common_extensions.py +++ b/test/test_common_extensions.py @@ -220,6 +220,44 @@ def test_bad_magic_input(): puremagic.main.perform_magic(None, None, None) # type: ignore[invalid-argument-type] +def test_from_extension(): 
+ """Test from_extension lookup""" + assert puremagic.from_extension(".pdf") == "application/pdf" + assert puremagic.from_extension("pdf") == "application/pdf" + assert puremagic.from_extension("PDF") == "application/pdf" + assert puremagic.from_extension(".jpg") == "image/jpeg" + assert puremagic.from_extension("png") == "image/png" + # Test name mode + name = puremagic.from_extension(".pdf", mime=False) + assert name != "" + # Test unknown extension raises PureError + with pytest.raises(puremagic.PureError): + puremagic.from_extension(".xyz_unknown_ext") + + +def test_magic_extension(): + """Test magic_extension returns list of matches""" + results = puremagic.magic_extension(".pdf") + assert len(results) >= 1 + assert results[0].mime_type == "application/pdf" + assert results[0].extension == ".pdf" + # Check sorted by confidence descending + for i in range(len(results) - 1): + assert results[i].confidence >= results[i + 1].confidence + # Unknown extension returns empty list + assert puremagic.magic_extension(".xyz_unknown_ext") == [] + + +def test_cmd_extension_option(): + """Test CLI -e option""" + from puremagic.main import command_line_entry # noqa: PLC0415 + + command_line_entry("-e", "pdf") + command_line_entry("-e", ".jpg") + command_line_entry("-e", "pdf", "-v") + command_line_entry("-e", "xyz_unknown_ext_123") + + def test_fake_file(): results = puremagic.magic_file(filename=Path(LOCAL_DIR, "resources", "fake_file")) assert results[0].confidence == 0.5, results diff --git a/uv.lock b/uv.lock index f69a182..52c02d2 100644 --- a/uv.lock +++ b/uv.lock @@ -92,6 +92,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload-time = "2025-05-02T08:34:40.053Z" }, ] +[[package]] +name = "codespell" +version = "2.4.2" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2d/9d/1d0903dff693160f893ca6abcabad545088e7a2ee0a6deae7c24e958be69/codespell-2.4.2.tar.gz", hash = "sha256:3c33be9ae34543807f088aeb4832dfad8cb2dae38da61cac0a7045dd376cfdf3", size = 352058, upload-time = "2026-03-05T18:10:42.936Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/a1/52fa05533e95fe45bcc09bcf8a503874b1c08f221a4e35608017e0938f55/codespell-2.4.2-py3-none-any.whl", hash = "sha256:97e0c1060cf46bd1d5db89a936c98db8c2b804e1fdd4b5c645e82a1ec6b1f886", size = 353715, upload-time = "2026-03-05T18:10:41.398Z" }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -495,6 +504,7 @@ source = { editable = "." } [package.dev-dependencies] dev = [ + { name = "codespell" }, { name = "coverage" }, { name = "poethepoet" }, { name = "pre-commit" }, @@ -511,14 +521,15 @@ dev = [ [package.metadata.requires-dev] dev = [ + { name = "codespell", specifier = ">=2.4.2" }, { name = "coverage", specifier = ">=7.13.4" }, { name = "poethepoet", specifier = ">=0.42.1" }, { name = "pre-commit", specifier = ">=4.5.1" }, { name = "pytest", specifier = ">=9.0.2" }, - { name = "pytest-cov", specifier = ">=7.0.0" }, + { name = "pytest-cov", specifier = ">=7" }, { name = "ruff", specifier = ">=0.15.5" }, - { name = "setuptools", specifier = ">=82.0.0" }, - { name = "twine", specifier = ">=6.2.0" }, + { name = "setuptools", specifier = ">=82" }, + { name = "twine", specifier = ">=6.2" }, { name = "ty", specifier = ">=0.0.21" }, { name = "wheel", specifier = ">=0.46.3" }, ] From 06c248d8fa2e47b59da0a9f09a74f300c3f8258c Mon Sep 17 00:00:00 2001 From: Chris Griffith Date: Wed, 11 Mar 2026 22:24:59 -0500 Subject: [PATCH 3/3] - Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd) - Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - 
NebularNerd) --- .claude/skills/changelog.md | 53 ++++++++++++++++ CHANGELOG.md | 4 +- puremagic/magic_data.json | 2 +- puremagic/main.py | 3 + puremagic/scanners/cfbf_scanner.py | 97 +++++++++++++++++++++++++++++ test/resources/office/test.msg | Bin 0 -> 1536 bytes test/test_common_extensions.py | 31 +++++++++ 7 files changed, 188 insertions(+), 2 deletions(-) create mode 100644 .claude/skills/changelog.md create mode 100644 puremagic/scanners/cfbf_scanner.py create mode 100644 test/resources/office/test.msg diff --git a/.claude/skills/changelog.md b/.claude/skills/changelog.md new file mode 100644 index 0000000..01e90f9 --- /dev/null +++ b/.claude/skills/changelog.md @@ -0,0 +1,53 @@ +--- +name: changelog +description: Update the CHANGELOG.md changelog file with new entries +user_invocable: true +--- + +# Changelog Skill + +When updating the `CHANGELOG.md` file, follow these rules: + +## Entry Format + +Each entry is a single bullet point starting with `- `: + +``` +- {Verb} {description} +``` + +## Verbs and Ordering + +Entries MUST use one of these four starting verbs, and MUST appear in this order within each version section: + +1. **Adding** — new features +2. **Changing** — modifications to existing behavior +3. **Fixing** — bug fixes +4. 
**Removing** — removed features or deprecated items + +## GitHub Issue Entries + +- Entries that reference a GitHub issue include the issue number after the verb: `- Fixing #725 description...` +- Within each verb group, entries WITH issue numbers come FIRST, sorted by issue number ascending (smallest to largest) +- Entries WITHOUT issue numbers follow after + +## Thanks Attribution + +- When an entry references a GitHub issue, thank the issue author by their **GitHub display name** (not username) +- Look up the display name via `gh api users/{username} --jq '.name // .login'` +- Format: `(thanks to {display name})` +- If multiple people contributed (e.g., reporter and commenter with the fix), thank all of them +- The thanks attribution goes at the end of the entry + +## Example + +``` +Version 1.27 +------------ + +- Adding #92 include py.typed in sdist (thanks to Nicholas Bollweg - bollwyvl) +- Adding #93 Improve PDF file detection, fix json description (thanks to Péter - peterekepeter) +- Adding new verbose output to command line with `-v` or `--verbose` +- Fixing #96 #86 stream does not work properly on opened small files (thanks to Felipe Lema and Andy - NebularNerd) +- Removing expected invalid WinZip signature +``` diff --git a/CHANGELOG.md b/CHANGELOG.md index 4bd2145..3528335 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,10 @@ Version 2.1.0 ------------- - Adding #6 `from_extension()` and `magic_extension()` to look up MIME types by file extension -- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup +- Adding #119 CFBF/OLE2 scanner to distinguish Word, Excel, PowerPoint, Outlook MSG, Visio, and Publisher files (thanks to Simeon Stoykov and Andy - NebularNerd) - Adding #139 codespell check (thanks to Christian Clauss) +- Adding `-e`/`--extension` CLI flag for extension-based MIME type lookup +- Fixing #104 WAV MIME type changed from `audio/wave` to `audio/wav` (thanks to Simon Willison and Andy - NebularNerd) Version 2.0.2 
------------- diff --git a/puremagic/magic_data.json b/puremagic/magic_data.json index 78833c9..5a051d4 100644 --- a/puremagic/magic_data.json +++ b/puremagic/magic_data.json @@ -100,7 +100,7 @@ ["53434448", 8, ".sc2", "", "SimCity 2000 Map File"] ], "52494646": [ - ["57415645", 8, ".wav", "audio/wave", "Waveform Audio File Format"], + ["57415645", 8, ".wav", "audio/wav", "Waveform Audio File Format"], ["41564920", 8, ".avi", "video/avi", "Audio Video Interleave"], ["57454250", 8, ".webp", "image/webp", "WebP graphics file format"], ["41434f4e", 8, ".ani", "", "Animated cursor"], diff --git a/puremagic/main.py b/puremagic/main.py index a0e728a..d824563 100644 --- a/puremagic/main.py +++ b/puremagic/main.py @@ -31,6 +31,7 @@ sndhdr_scanner, mpeg_audio_scanner, hdf5_scanner, + cfbf_scanner, ) __author__ = "Chris Griffith" @@ -464,6 +465,8 @@ def single_deep_scan( result = mpeg_audio_scanner.main(filename, head, foot) if result and result.confidence > confidence: return result + case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short: + return cfbf_scanner.main(filename, head, foot) if eml_result := text_scanner.eml_check(head): return eml_result diff --git a/puremagic/scanners/cfbf_scanner.py b/puremagic/scanners/cfbf_scanner.py new file mode 100644 index 0000000..5ecaf10 --- /dev/null +++ b/puremagic/scanners/cfbf_scanner.py @@ -0,0 +1,97 @@ +import os +import struct + +from puremagic.scanners.helpers import Match + +match_bytes = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" +match_bytes_short = b"\xd0\xcf\x11\xe0" + +# Stream names that identify specific CFBF-based formats, checked in priority order. +# Each entry: (stream_name, extension, name, mime_type) +# Using startswith for prefix matching where noted. 
+_STREAM_MATCHES = [ + ("__nameid_version1.0", ".msg", "Outlook Message", "application/vnd.ms-outlook"), + ("PowerPoint Document", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"), + ("Current User", ".ppt", "PowerPoint Presentation", "application/vnd.ms-powerpoint"), + ("Workbook", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"), + ("Book", ".xls", "Excel Spreadsheet", "application/vnd.ms-excel"), + ("WordDocument", ".doc", "Word Document", "application/msword"), + ("VisioDocument", ".vsd", "Visio Drawing", "application/x-visio"), + ("Quill", ".pub", "Publisher Document", "application/x-mspublisher"), +] + +_PREFIX_MATCHES = [ + ("__substg1.0_", ".msg", "Outlook Message", "application/vnd.ms-outlook"), +] + + +def _extract_stream_names(dir_data: bytes) -> set[str]: + """Parse CFBF directory entries and return the set of stream/storage names.""" + names: set[str] = set() + for i in range(0, len(dir_data), 128): + entry = dir_data[i : i + 128] + if len(entry) < 128: + break + name_size = struct.unpack_from(" 64: + continue + obj_type = entry[66] + # obj_type: 0=unknown, 1=storage, 2=stream, 5=root + if obj_type not in (1, 2, 5): + continue + name = entry[: name_size - 2].decode("utf-16-le", errors="ignore") + if name: + names.add(name) + return names + + +def _identify_format(stream_names: set[str]) -> Match | None: + """Match stream names against known CFBF format signatures.""" + # Check prefix matches first (e.g. 
__substg1.0_ for MSG) + for name in stream_names: + for prefix, ext, fmt_name, mime in _PREFIX_MATCHES: + if name.startswith(prefix): + return Match(ext, fmt_name, mime) + + # Check exact stream name matches in priority order + for stream_name, ext, fmt_name, mime in _STREAM_MATCHES: + if stream_name in stream_names: + return Match(ext, fmt_name, mime) + + return None + + +def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None: + if len(head) < 76: + return None + + # Verify magic bytes + if head[:8] != match_bytes: + if head[:4] != match_bytes_short: + return None + + # Parse CFBF header + sector_shift = struct.unpack_from("jDeLA7IN^ifq-u0@*8dMq_F)Fv9&$j{RuH(VYtbS@87J literal 0 HcmV?d00001 diff --git a/test/test_common_extensions.py b/test/test_common_extensions.py index c5d5ab9..5bcf568 100644 --- a/test/test_common_extensions.py +++ b/test/test_common_extensions.py @@ -261,3 +261,34 @@ def test_cmd_extension_option(): def test_fake_file(): results = puremagic.magic_file(filename=Path(LOCAL_DIR, "resources", "fake_file")) assert results[0].confidence == 0.5, results + + +def test_riff_wav_mime(): + """RIFF scanner returns audio/wav (not audio/wave) for WAV files""" + mime = puremagic.from_file(os.path.join(AUDIO_DIR, "test.wav"), mime=True) + assert mime == "audio/wav" + + + +def test_cfbf_doc(): + """CFBF scanner correctly identifies Word .doc""" + ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc")) + assert ext == ".doc" + mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.doc"), mime=True) + assert mime == "application/msword" + + +def test_cfbf_ppt(): + """CFBF scanner correctly identifies PowerPoint .ppt""" + ext = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt")) + assert ext == ".ppt" + mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.ppt"), mime=True) + assert mime == "application/vnd.ms-powerpoint" + + +def test_cfbf_msg(): + """CFBF scanner correctly identifies Outlook .msg""" + ext = 
puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg")) + assert ext == ".msg" + mime = puremagic.from_file(os.path.join(OFFICE_DIR, "test.msg"), mime=True) + assert mime == "application/vnd.ms-outlook"