From 3858ddfe238a375f980abfa2cf5b6911b658bff6 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Tue, 3 Mar 2026 13:10:40 +0530
Subject: [PATCH 01/10] feat: add document support (PDF, Office, ODF, text)

Add MediaKind::Document as a new file category that bypasses ffprobe
entirely, using pure Rust crates for metadata extraction.

Supported formats: PDF, DOCX/DOC, XLSX/XLS, PPTX/PPT, ODT/ODS/ODP,
CSV, TSV, TXT, MD.

- New document.rs module with format-specific extractors (lopdf, zip,
  quick-xml, cfb)
- DocumentInfo struct with page_count, word_count, line_count,
  sheet_count, author, title, and more
- Filter support: media.doc.* field paths + pages/author aliases
- Sort support: pages/page_count sort key
- TUI: KindFilter::Document (key 5), "D" icon, document metadata panel
- Bump package version to 0.0.2, schema version to 0.2.0
---
 CLAUDE.md         |  17 +-
 Cargo.lock        | 279 +++++++++++++++++-
 Cargo.toml        |   6 +-
 src/document.rs   | 729 ++++++++++++++++++++++++++++++++++++++++++++++
 src/filter.rs     |  70 +++++
 src/main.rs       |  12 +-
 src/output.rs     |   5 +-
 src/probe.rs      |  61 ++++
 src/scan.rs       |  19 +-
 src/sort.rs       |   9 +
 src/tui/layout.rs |  53 +++-
 src/tui/mod.rs    |  10 +
 src/tui/triage.rs |   1 +
 src/types.rs      | 102 ++++++-
 tests/cli.rs      |  87 +++++-
 15 files changed, 1430 insertions(+), 30 deletions(-)
 create mode 100644 src/document.rs
diff --git a/CLAUDE.md b/CLAUDE.md
index 10c3c64..ac7a182 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,7 +2,7 @@
 
 ## What is this
 
-mls (Media LS) — terminal-native audio/video/image file browser. Dual-mode: TUI for humans, JSON/NDJSON for scripts/agents. Rust + Ratatui + Tokio. macOS-first.
+mls (Media LS) — terminal-native audio/video/image/document file browser. Dual-mode: TUI for humans, JSON/NDJSON for scripts/agents. Rust + Ratatui + Tokio. macOS-first.
 
 PRD: `docs/plans/resilient-gliding-bear.md`
 
@@ -25,9 +25,10 @@ src/
 ├── cli.rs         # clap derive CLI definitions
 ├── types.rs       # ALL shared types (MediaEntry, MediaInfo, Fps, etc.)
 ├── deps.rs        # Startup dependency check (ffprobe, ffmpeg, mpv)
-├── probe.rs       # ffprobe subprocess + JSON parsing → MediaEntry (image detection + EXIF dispatch)
+├── probe.rs       # ffprobe subprocess + JSON parsing → MediaEntry (image detection + EXIF dispatch; document dispatch)
 ├── exif.rs        # EXIF metadata extraction (kamadak-exif)
-├── scan.rs        # Directory walk + concurrent probing (JoinSet)
+├── document.rs    # Document metadata extraction (PDF, OOXML, ODF, OLE2, text) — pure Rust, no external tools
+├── scan.rs        # Directory walk + concurrent probing (JoinSet) — routes documents to native probe
 ├── filter.rs      # Hand-rolled expression parser (lexer → parser → AST → eval)
 ├── sort.rs        # Sort key parsing + comparison
 ├── output.rs      # JSON/NDJSON serialization (borrowing, zero-clone)
@@ -44,13 +45,13 @@ src/
 
 1. `cli.rs` parses args → `main.rs` routes to subcommand
 2. `deps.rs` checks ffprobe/ffmpeg/mpv availability
-3. `scan.rs` walks directories → `probe.rs` runs ffprobe per file → `MediaEntry` (images bypass ffprobe stream classification; EXIF extracted via `exif.rs`)
+3. `scan.rs` walks directories → `probe.rs` runs ffprobe per file → `MediaEntry` (images bypass ffprobe stream classification; EXIF extracted via `exif.rs`; documents bypass ffprobe entirely → `document.rs` extracts metadata natively)
 4. Filter (`filter.rs`) and sort (`sort.rs`) applied to entries
 5. Output: `tui/` renders interactively, or `output.rs` emits JSON/NDJSON
 
 ### Key type: `MediaEntry` (in `types.rs`)
 
-The central data type. Every module reads or produces it. It serializes to the JSON schema (version `0.1.0`). If you change `MediaEntry`, you affect JSON output, TUI rendering, filter evaluation, and sort comparison. `MediaKind` includes a `Image` variant. `MediaInfo` has an optional `exif: Option<ExifInfo>` field populated for image files.
+The central data type. Every module reads or produces it. It serializes to the JSON schema (version `0.2.0`). If you change `MediaEntry`, you affect JSON output, TUI rendering, filter evaluation, and sort comparison. `MediaKind` variants: `Video`, `Audio`, `Av`, `Image`, `Document`. `MediaInfo` has optional `exif: Option<ExifInfo>` (images) and `doc: Option<DocumentInfo>` (documents) fields.
 
 ## Conventions
 
@@ -84,7 +85,7 @@ Unit tests are co-located `#[cfg(test)]` modules at the bottom of each source fi
 
 ### Output format
 
-JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`) to avoid cloning `MediaEntry` vectors. Schema version `"0.1.0"` is embedded in output.
+JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`) to avoid cloning `MediaEntry` vectors. Schema version `"0.2.0"` is embedded in output.
 
 ### External processes
 
@@ -101,7 +102,9 @@ JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`)
 - `triage.rs` Move (`m` key) works via text input; interactive directory picker not yet built.
 - `scan.rs` uses bounded `JoinSet` spawns (not a semaphore) for concurrency control, with `mpsc` channel for streaming results to the caller.
 - Images are loaded directly in `thumbnail.rs` (bypass ffmpeg + LRU cache).
-- `filter.rs` supports `media.exif.*` field paths and `camera`/`iso` shorthand aliases.
+- `filter.rs` supports `media.exif.*` field paths and `camera`/`iso` shorthand aliases, plus `media.doc.*` field paths and `pages`/`author` shorthand aliases.
+- **Documents bypass ffprobe entirely** — probed by `document.rs` using pure Rust crates (`lopdf`, `zip`, `quick-xml`, `cfb`). Recognized extensions: pdf, docx, doc, odt, xlsx, xls, ods, pptx, ppt, odp, csv, tsv, txt, md.
+- **TUI kind filter key `5`** filters to documents. Document entries show "D" icon in file list.
 
 ## Dependencies (external)
 
diff --git a/Cargo.lock b/Cargo.lock
index 262fed0..8523749 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8,6 +8,17 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa"
 
+[[package]]
+name = "aes"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
+dependencies = [
+ "cfg-if",
+ "cipher",
+ "cpufeatures",
+]
+
 [[package]]
 name = "aho-corasick"
 version = "1.1.4"
@@ -182,6 +193,15 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "block-padding"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
+dependencies = [
+ "generic-array",
+]
+
 [[package]]
 name = "bstr"
 version = "1.12.1"
@@ -205,6 +225,12 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06"
 
+[[package]]
+name = "bytecount"
+version = "0.6.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e"
+
 [[package]]
 name = "bytemuck"
 version = "1.25.0"
@@ -225,6 +251,12 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
 [[package]]
 name = "byteorder-lite"
 version = "0.1.0"
@@ -246,6 +278,15 @@ dependencies = [
  "rustversion",
 ]
 
+[[package]]
+name = "cbc"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "cc"
 version = "1.2.56"
@@ -256,6 +297,17 @@ dependencies = [
  "shlex",
 ]
 
+[[package]]
+name = "cfb"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d8a4f8e55be323b378facfcf1f06aa97f6ec17cf4ac84fb17325093aaf62da41"
+dependencies = [
+ "byteorder",
+ "fnv",
+ "uuid",
+]
+
 [[package]]
 name = "cfg-if"
 version = "1.0.4"
@@ -282,6 +334,16 @@ dependencies = [
  "windows-link",
 ]
 
+[[package]]
+name = "cipher"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
+dependencies = [
+ "crypto-common",
+ "inout",
+]
+
 [[package]]
 name = "clap"
 version = "4.5.60"
@@ -556,12 +618,30 @@ dependencies = [
  "litrs",
 ]
 
+[[package]]
+name = "ecb"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
+dependencies = [
+ "cipher",
+]
+
 [[package]]
 name = "either"
 version = "1.15.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
 
+[[package]]
+name = "encoding_rs"
+version = "0.8.35"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
+dependencies = [
+ "cfg-if",
+]
+
 [[package]]
 name = "equivalent"
 version = "1.0.2"
@@ -675,6 +755,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c"
 dependencies = [
  "crc32fast",
  "miniz_oxide",
+ "zlib-rs",
 ]
 
 [[package]]
@@ -911,6 +992,16 @@ dependencies = [
  "rustversion",
 ]
 
+[[package]]
+name = "inout"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
+dependencies = [
+ "block-padding",
+ "generic-array",
+]
+
 [[package]]
 name = "instability"
 version = "0.3.11"
@@ -1047,6 +1138,34 @@ version = "0.4.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
 
+[[package]]
+name = "lopdf"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f560f57dfb9142a02d673e137622fd515d4231e51feb8b4af28d92647d83f35b"
+dependencies = [
+ "aes",
+ "bitflags 2.11.0",
+ "cbc",
+ "ecb",
+ "encoding_rs",
+ "flate2",
+ "getrandom 0.3.4",
+ "indexmap",
+ "itoa",
+ "log",
+ "md-5",
+ "nom 8.0.0",
+ "nom_locate",
+ "rand 0.9.2",
+ "rangemap",
+ "sha2",
+ "stringprep",
+ "thiserror 2.0.18",
+ "ttf-parser",
+ "weezl",
+]
+
 [[package]]
 name = "lru"
 version = "0.16.3"
@@ -1075,20 +1194,33 @@ dependencies = [
  "regex-automata",
 ]
 
+[[package]]
+name = "md-5"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
+dependencies = [
+ "cfg-if",
+ "digest",
+]
+
 [[package]]
 name = "media-ls"
-version = "0.0.1"
+version = "0.0.2"
 dependencies = [
  "anyhow",
  "assert_cmd",
+ "cfb",
  "chrono",
  "clap",
  "crossterm",
  "image",
  "kamadak-exif",
+ "lopdf",
  "lru",
  "nucleo-matcher",
  "predicates",
+ "quick-xml",
  "ratatui",
  "ratatui-image",
  "serde",
@@ -1098,6 +1230,7 @@ dependencies = [
  "tokio",
  "tracing",
  "tracing-subscriber",
+ "zip",
 ]
 
 [[package]]
@@ -1188,6 +1321,26 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nom"
+version = "8.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "nom_locate"
+version = "5.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
+dependencies = [
+ "bytecount",
+ "memchr",
+ "nom 8.0.0",
+]
+
 [[package]]
 name = "normalize-line-endings"
 version = "0.3.0"
@@ -1551,6 +1704,15 @@ version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
 
+[[package]]
+name = "quick-xml"
+version = "0.37.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.44"
@@ -1579,7 +1741,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
 dependencies = [
  "libc",
- "rand_chacha",
+ "rand_chacha 0.3.1",
  "rand_core 0.6.4",
 ]
 
@@ -1589,6 +1751,7 @@ version = "0.9.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1"
 dependencies = [
+ "rand_chacha 0.9.0",
  "rand_core 0.9.5",
 ]
 
@@ -1602,6 +1765,16 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.6.4"
@@ -1616,6 +1789,9 @@ name = "rand_core"
 version = "0.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
 
 [[package]]
 name = "rand_xoshiro"
@@ -1626,6 +1802,12 @@ dependencies = [
  "rand_core 0.9.5",
 ]
 
+[[package]]
+name = "rangemap"
+version = "1.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68"
+
 [[package]]
 name = "ratatui"
 version = "0.30.0"
@@ -2007,6 +2189,17 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
 
+[[package]]
+name = "stringprep"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1"
+dependencies = [
+ "unicode-bidi",
+ "unicode-normalization",
+ "unicode-properties",
+]
+
 [[package]]
 name = "strsim"
 version = "0.11.1"
@@ -2082,7 +2275,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662"
 dependencies = [
  "fnv",
- "nom",
+ "nom 7.1.3",
  "phf",
  "phf_codegen",
 ]
@@ -2228,6 +2421,21 @@ version = "0.1.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca"
 
+[[package]]
+name = "tinyvec"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
+
 [[package]]
 name = "tokio"
 version = "1.49.0"
@@ -2317,6 +2525,18 @@ dependencies = [
  "tracing-log",
 ]
 
+[[package]]
+name = "ttf-parser"
+version = "0.25.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
+
+[[package]]
+name = "typed-path"
+version = "0.12.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e"
+
 [[package]]
 name = "typenum"
 version = "1.19.0"
@@ -2329,12 +2549,33 @@ version = "0.1.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971"
 
+[[package]]
+name = "unicode-bidi"
+version = "0.3.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
 
+[[package]]
+name = "unicode-normalization"
+version = "0.1.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8"
+dependencies = [
+ "tinyvec",
+]
+
+[[package]]
+name = "unicode-properties"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d"
+
 [[package]]
 name = "unicode-segmentation"
 version = "1.12.0"
@@ -3027,12 +3268,44 @@ dependencies = [
  "syn 2.0.117",
 ]
 
+[[package]]
+name = "zip"
+version = "8.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b680f2a0cd479b4cff6e1233c483fdead418106eae419dc60200ae9850f6d004"
+dependencies = [
+ "crc32fast",
+ "flate2",
+ "indexmap",
+ "memchr",
+ "typed-path",
+ "zopfli",
+]
+
+[[package]]
+name = "zlib-rs"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513"
+
 [[package]]
 name = "zmij"
 version = "1.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa"
 
+[[package]]
+name = "zopfli"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249"
+dependencies = [
+ "bumpalo",
+ "crc32fast",
+ "log",
+ "simd-adler32",
+]
+
 [[package]]
 name = "zune-core"
 version = "0.4.12"
diff --git a/Cargo.toml b/Cargo.toml
index 76038d1..0fdc284 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "media-ls"
-version = "0.0.1"
+version = "0.0.2"
 edition = "2024"
 description = "Media LS — terminal-native audio/video file browser with metadata columns, TUI preview, and structured JSON output"
 license = "MIT"
@@ -32,6 +32,10 @@ image = { version = "0.25.9", default-features = false, features = [
 kamadak-exif = "0.6.1"
 ratatui-image = { version = "10.0.6", default-features = false, features = ["crossterm"] }
 tokio = { version = "1.49.0", features = ["full"] }
+lopdf = { version = "0.39.0", default-features = false }
+zip = { version = "8.2.0", default-features = false, features = ["deflate"] }
+quick-xml = "0.37.5"
+cfb = "0.10.0"
 tracing = "0.1.44"
 tracing-subscriber = { version = "0.3.22", features = ["env-filter"] }
 
diff --git a/src/document.rs b/src/document.rs
new file mode 100644
index 0000000..bb6a129
--- /dev/null
+++ b/src/document.rs
@@ -0,0 +1,729 @@
+//! Document metadata extraction (PDF, Office, `OpenDocument`, plain text).
+//!
+//! All extractors are best-effort: failures never reach the caller as errors;
+//! `probe_document` only logs a miss at debug level. Follows the same pattern as `exif.rs`.
+use crate::types::DocumentInfo;
+use std::io::{BufRead, BufReader};
+use std::path::Path;
+
+/// Probe a document for metadata, choosing the extractor from the file extension.
+///
+/// The extension is ASCII-lowercased before matching, so `.PDF` and `.pdf` are
+/// treated alike. Yields `None` when the path has no UTF-8 extension, when the
+/// extension matches none of the arms below, or when the chosen extractor fails.
+pub fn probe_document(path: &Path) -> Option<DocumentInfo> {
+    let ext = path.extension()?.to_str()?.to_ascii_lowercase();
+
+    let info = match ext.as_str() {
+        "pdf" => probe_pdf(path),
+        "docx" => probe_ooxml_doc(path),
+        "xlsx" => probe_ooxml_spreadsheet(path),
+        "pptx" => probe_ooxml_presentation(path),
+        "odt" | "ods" | "odp" => probe_odf(path, &ext),
+        "doc" | "xls" | "ppt" => probe_ole2(path, &ext),
+        "csv" | "tsv" => probe_text_table(path, &ext),
+        "txt" | "md" => probe_text(path, &ext),
+        _ => None,
+    };
+
+    // Callers only ever see `None`, so leave a debug-level trace of the miss.
+    if info.is_none() {
+        tracing::debug!(path = %path.display(), ext = %ext, "document probe returned no metadata");
+    }
+
+    info
+}
+
+// ─── PDF ────────────────────────────────────────────────────────────────
+
+/// Probes a PDF: page count plus the text entries of the trailer's `Info` dictionary.
+fn probe_pdf(path: &Path) -> Option<DocumentInfo> {
+    let doc = lopdf::Document::load(path).ok()?;
+    let trailer_info = doc
+        .trailer
+        .get(b"Info")
+        .ok()
+        .and_then(|obj| obj.as_reference().ok())
+        .and_then(|r| doc.get_object(r).ok());
+
+    let get_info_str = |key: &[u8]| -> Option<String> {
+        let raw = trailer_info?.as_dict().ok()?.get(key).ok()?;
+        let text = pdf_object_to_string(raw)?;
+        Some(text.trim().to_owned()).filter(|s| !s.is_empty())
+    };
+
+    Some(DocumentInfo {
+        format: "pdf".to_string(),
+        page_count: u32::try_from(doc.get_pages().len()).ok(),
+        word_count: None,
+        line_count: None,
+        sheet_count: None,
+        author: get_info_str(b"Author"),
+        title: get_info_str(b"Title"),
+        subject: get_info_str(b"Subject"),
+        creator_app: get_info_str(b"Creator"),
+        creation_date: get_info_str(b"CreationDate"),
+        modification_date: get_info_str(b"ModDate"),
+    })
+}
+
+/// Decodes a PDF string or name object. Text strings starting with the `FE FF` byte-order
+/// mark are UTF-16BE; anything else must be valid UTF-8. Undecodable input yields `None`.
+fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
+    match obj {
+        lopdf::Object::String(bytes, _) => match bytes.strip_prefix(&[0xFE, 0xFF]) {
+            Some(utf16) => {
+                let units = utf16.chunks_exact(2).map(|p| u16::from_be_bytes([p[0], p[1]]));
+                char::decode_utf16(units).collect::<Result<String, _>>().ok()
+            }
+            None => String::from_utf8(bytes.clone()).ok(),
+        },
+        lopdf::Object::Name(name) => String::from_utf8(name.clone()).ok(),
+        _ => None,
+    }
+}
+
+// ─── OOXML (DOCX/XLSX/PPTX) ────────────────────────────────────────────
+
+/// Reads archive member `inner_path` of the ZIP at `path` as UTF-8 text; `None` on any error.
+fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option<String> {
+    let source = std::fs::File::open(path).ok()?;
+    let mut zip_file = zip::ZipArchive::new(source).ok()?;
+    let mut xml = String::new();
+    let read = std::io::Read::read_to_string(&mut zip_file.by_name(inner_path).ok()?, &mut xml);
+    read.ok().map(|_| xml)
+}
+
+/// Core properties of the package at `path`; an unreadable `docProps/core.xml` parses as "".
+fn parse_ooxml_core(path: &Path) -> OoxmlCoreProps {
+    parse_core_xml(&read_xml_from_zip(path, "docProps/core.xml").unwrap_or_default())
+}
+
+/// App properties of the package at `path`; an unreadable `docProps/app.xml` parses as "".
+fn parse_ooxml_app(path: &Path) -> OoxmlAppProps {
+    parse_app_xml(&read_xml_from_zip(path, "docProps/app.xml").unwrap_or_default())
+}
+
+struct OoxmlCoreProps {
+    author: Option<String>, // text of the `creator` element
+    title: Option<String>, // text of the `title` element
+    subject: Option<String>, // text of the `subject` element
+    created: Option<String>, // text of the `created` element, kept as a trimmed string
+    modified: Option<String>, // text of the `modified` element, kept as a trimmed string
+}
+
+struct OoxmlAppProps {
+    pages: Option<u32>, // `Pages` element; `None` if absent or not parseable as `u32`
+    words: Option<u64>, // `Words` element; `None` if absent or not parseable as `u64`
+    slides: Option<u32>, // `Slides` element; `None` if absent or not parseable as `u32`
+    app_name: Option<String>, // text of the `Application` element
+}
+
+/// Scans `core.xml` markup and keeps the trimmed, non-empty text found directly inside
+/// elements that `local_name` reports as creator, title, subject, created or modified.
+fn parse_core_xml(xml: &str) -> OoxmlCoreProps {
+    use quick_xml::events::Event;
+
+    let mut core = OoxmlCoreProps {
+        author: None,
+        title: None,
+        subject: None,
+        created: None,
+        modified: None,
+    };
+    let mut xml_reader = quick_xml::Reader::from_str(xml);
+    let mut scratch = Vec::new();
+    let mut open_tag = String::new();
+
+    loop {
+        match xml_reader.read_event_into(&mut scratch) {
+            Ok(Event::Eof) | Err(_) => break,
+            Ok(Event::Start(ref el) | Event::Empty(ref el)) => {
+                open_tag = local_name(el.name().as_ref());
+            }
+            Ok(Event::End(_)) => open_tag.clear(),
+            Ok(Event::Text(ref raw)) => {
+                let text = raw.unescape().ok().map(|s| s.trim().to_owned());
+                if let Some(val) = text.filter(|s| !s.is_empty()) {
+                    match open_tag.as_str() {
+                        "creator" => core.author = Some(val),
+                        "title" => core.title = Some(val),
+                        "subject" => core.subject = Some(val),
+                        "created" => core.created = Some(val),
+                        "modified" => core.modified = Some(val),
+                        _ => {}
+                    }
+                }
+            }
+            _ => {}
+        }
+        scratch.clear();
+    }
+    core
+}
+
+/// Scans `app.xml` markup for the `Pages`, `Words`, `Slides` and `Application` elements
+/// (as named by `local_name`); numeric values that fail to parse are stored as `None`.
+fn parse_app_xml(xml: &str) -> OoxmlAppProps {
+    use quick_xml::events::Event;
+
+    let mut app = OoxmlAppProps {
+        pages: None,
+        words: None,
+        slides: None,
+        app_name: None,
+    };
+    let mut xml_reader = quick_xml::Reader::from_str(xml);
+    let mut scratch = Vec::new();
+    let mut open_tag = String::new();
+
+    loop {
+        match xml_reader.read_event_into(&mut scratch) {
+            Ok(Event::Eof) | Err(_) => break,
+            Ok(Event::Start(ref el) | Event::Empty(ref el)) => {
+                open_tag = local_name(el.name().as_ref());
+            }
+            Ok(Event::End(_)) => open_tag.clear(),
+            Ok(Event::Text(ref raw)) => {
+                let text = raw.unescape().ok().map(|s| s.trim().to_owned());
+                if let Some(val) = text.filter(|s| !s.is_empty()) {
+                    match open_tag.as_str() {
+                        "Pages" => app.pages = val.parse().ok(),
+                        "Words" => app.words = val.parse().ok(),
+                        "Slides" => app.slides = val.parse().ok(),
+                        "Application" => app.app_name = Some(val),
+                        _ => {}
+                    }
+                }
+            }
+            _ => {}
+        }
+        scratch.clear();
+    }
+    app
+}
+
+/// Probes a `.docx` package. Returns `None` only when the file cannot be opened as a
+/// ZIP archive; unreadable property parts just leave the corresponding fields unset.
+fn probe_ooxml_doc(path: &Path) -> Option<DocumentInfo> {
+    // Opened purely as a validity check; the parsers below reopen the file themselves.
+    let _archive = zip::ZipArchive::new(std::fs::File::open(path).ok()?).ok()?;
+    let (core, app) = (parse_ooxml_core(path), parse_ooxml_app(path));
+
+    Some(DocumentInfo {
+        format: "docx".to_string(),
+        author: core.author,
+        title: core.title,
+        subject: core.subject,
+        creator_app: app.app_name,
+        creation_date: core.created,
+        modification_date: core.modified,
+        page_count: app.pages,
+        word_count: app.words,
+        line_count: None,
+        sheet_count: None,
+    })
+}
+
+/// Probes an `.xlsx` workbook; `None` only when the file is not a readable ZIP archive.
+/// The sheet count is the number of `sheet` elements `count_xml_elements` reports for
+/// `xl/workbook.xml`; a count of zero (or an unreadable part) is stored as `None`.
+fn probe_ooxml_spreadsheet(path: &Path) -> Option<DocumentInfo> {
+    // Opened purely as a validity check; the parsers below reopen the file themselves.
+    let _archive = zip::ZipArchive::new(std::fs::File::open(path).ok()?).ok()?;
+    let (core, app) = (parse_ooxml_core(path), parse_ooxml_app(path));
+
+    let sheet_count = read_xml_from_zip(path, "xl/workbook.xml")
+        .and_then(|xml| Some(count_xml_elements(&xml, "sheet")).filter(|&n| n > 0));
+
+    Some(DocumentInfo {
+        format: "xlsx".to_string(),
+        author: core.author,
+        title: core.title,
+        subject: core.subject,
+        creator_app: app.app_name,
+        creation_date: core.created,
+        modification_date: core.modified,
+        page_count: None,
+        word_count: None,
+        line_count: None,
+        sheet_count,
+    })
+}
+
+/// Probes a `.pptx` deck; the slide count is reported through `page_count`. It prefers the
+/// `Slides` value of `docProps/app.xml`, else counts `sldId` in `ppt/presentation.xml`.
+fn probe_ooxml_presentation(path: &Path) -> Option<DocumentInfo> {
+    let _archive = zip::ZipArchive::new(std::fs::File::open(path).ok()?).ok()?;
+    let (core, app) = (parse_ooxml_core(path), parse_ooxml_app(path));
+
+    let slide_count = match app.slides {
+        declared @ Some(_) => declared,
+        None => read_xml_from_zip(path, "ppt/presentation.xml")
+            .map(|xml| count_xml_elements(&xml, "sldId"))
+            .filter(|&n| n > 0),
+    };
+
+    Some(DocumentInfo {
+        format: "pptx".to_string(),
+        author: core.author,
+        title: core.title,
+        subject: core.subject,
+        creator_app: app.app_name,
+        creation_date: core.created,
+        modification_date: core.modified,
+        page_count: slide_count,
+        word_count: None,
+        line_count: None,
+        sheet_count: None,
+    })
+}
+
+// ─── ODF (ODT/ODS/ODP) ─────────────────────────────────────────────────
+
+/// Probes an ODF package (`odt`, `ods` or `odp`, as given by `ext`).
+///
+/// Text metadata comes from `meta.xml`. For `ods` the `table` elements counted in
+/// `content.xml` become the sheet count, for `odp` the `page` elements become the
+/// page count, and any other extension uses the page count declared in `meta.xml`.
+fn probe_odf(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let _archive = zip::ZipArchive::new(std::fs::File::open(path).ok()?).ok()?;
+    let meta_part = read_xml_from_zip(path, "meta.xml");
+    let meta = parse_odf_meta(meta_part.as_deref().unwrap_or_default());
+
+    // Counts `element` occurrences in `content.xml`; a count of zero is reported as `None`.
+    let count_in_content = |element: &str| {
+        let content = read_xml_from_zip(path, "content.xml").unwrap_or_default();
+        Some(count_xml_elements(&content, element)).filter(|&n| n > 0)
+    };
+    let (page_count, sheet_count) = match ext {
+        "ods" => (None, count_in_content("table")),
+        "odp" => (count_in_content("page"), None),
+        _ => (meta.page_count, None),
+    };
+
+    Some(DocumentInfo {
+        format: ext.to_string(),
+        author: meta.author,
+        title: meta.title,
+        subject: meta.subject,
+        creator_app: meta.generator,
+        creation_date: meta.creation_date,
+        modification_date: meta.modification_date,
+        page_count,
+        word_count: meta.word_count,
+        line_count: None,
+        sheet_count,
+    })
+}
+
+struct OdfMeta {
+    author: Option<String>, // text of an `initial-creator` / `creator` element
+    title: Option<String>, // text of the `title` element
+    subject: Option<String>, // text of the `subject` element
+    generator: Option<String>, // text of the `generator` element
+    creation_date: Option<String>, // text of the `creation-date` element
+    modification_date: Option<String>, // text of the `date` element
+    page_count: Option<u32>, // `page-count` attribute of `document-statistic`
+    word_count: Option<u64>, // `word-count` attribute of `document-statistic`
+}
+
+/// Parses ODF `meta.xml` markup: text fields come from element content, page and word
+/// counts from the attributes of the `document-statistic` element.
+fn parse_odf_meta(xml: &str) -> OdfMeta {
+    let mut meta = OdfMeta {
+        author: None,
+        title: None,
+        subject: None,
+        generator: None,
+        creation_date: None,
+        modification_date: None,
+        page_count: None,
+        word_count: None,
+    };
+
+    let mut reader = quick_xml::Reader::from_str(xml);
+    let mut buf = Vec::new();
+    let mut current_tag = String::new();
+
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => {
+                let local = local_name(e.name().as_ref());
+                // ODF stores statistics as attributes on `meta:document-statistic`
+                if local == "document-statistic" {
+                    for attr in e.attributes().flatten() {
+                        let key = local_name(attr.key.as_ref());
+                        let val = String::from_utf8_lossy(&attr.value).to_string();
+                        match key.as_str() {
+                            "page-count" => meta.page_count = val.parse().ok(),
+                            "word-count" => meta.word_count = val.parse().ok(),
+                            _ => {}
+                        }
+                    }
+                }
+                current_tag = local;
+            }
+            Ok(quick_xml::events::Event::Text(ref e)) => {
+                let text = e.unescape().ok().map(|s| s.trim().to_owned());
+                if let Some(val) = text.filter(|s| !s.is_empty()) {
+                    match current_tag.as_str() {
+                        // Original author wins; `dc:creator` (the last editor) is only a fallback.
+                        "initial-creator" => meta.author = Some(val),
+                        "creator" if meta.author.is_none() => meta.author = Some(val),
+                        "title" => meta.title = Some(val),
+                        "subject" => meta.subject = Some(val),
+                        "generator" => meta.generator = Some(val),
+                        "creation-date" => meta.creation_date = Some(val),
+                        "date" => meta.modification_date = Some(val),
+                        _ => {}
+                    }
+                }
+            }
+            Ok(quick_xml::events::Event::End(_)) => current_tag.clear(),
+            Ok(quick_xml::events::Event::Eof) | Err(_) => break,
+            _ => {}
+        }
+        buf.clear();
+    }
+
+    meta
+}
+
+// ─── OLE2 (legacy DOC/XLS/PPT) ─────────────────────────────────────────
+
+/// Probe a legacy OLE2 compound file (DOC/XLS/PPT); `None` if it cannot be opened.
+fn probe_ole2(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let mut comp = cfb::open(path).ok()?;
+    let mut info = DocumentInfo {
+        format: ext.to_string(),
+        ..DocumentInfo::default()
+    };
+
+    // The summary stream is optional; without it only the format is reported.
+    if let Ok(stream) = comp.open_stream("/\x05SummaryInformation") {
+        // Read the first 4 KiB in bulk. A failed read keeps the bytes already
+        // received, so metadata is still parsed from a partial stream.
+        let mut data = Vec::with_capacity(4096);
+        let _ = std::io::Read::read_to_end(&mut std::io::Read::take(stream, 4096), &mut data);
+        parse_summary_info(&data, &mut info);
+    }
+
+    Some(info)
+}
+
+/// Best-effort extraction from OLE2 `SummaryInformation` stream.
+///
+/// The stream uses MS-OLEPS binary format with property sets. We read
+/// `VT_LPSTR` strings by well-known IDs (2=Title, 3=Subject, 4=Author,
+/// 18=`AppName`) and `VT_I4` counters (14=page count, 15=word count);
+/// malformed or out-of-bounds properties are skipped.
+fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) {
+    const VT_I4: u32 = 0x03;
+    const VT_LPSTR: u32 = 0x1E;
+
+    if data.len() < 48 || read_u16_le(data, 0) != 0xFFFE {
+        return;
+    }
+
+    let section_offset = read_u32_le(data, 44) as usize;
+    if section_offset.saturating_add(8) > data.len() {
+        return;
+    }
+
+    let prop_count = read_u32_le(data, section_offset + 4) as usize;
+    if prop_count > 100 {
+        return;
+    }
+
+    for i in 0..prop_count {
+        let entry_offset = section_offset + 8 + i * 8;
+        if entry_offset + 8 > data.len() {
+            break;
+        }
+
+        // Property offsets are relative to the start of the section.
+        let prop_id = read_u32_le(data, entry_offset);
+        let prop_offset = read_u32_le(data, entry_offset + 4) as usize;
+        let abs_offset = section_offset.saturating_add(prop_offset);
+        if abs_offset.saturating_add(8) > data.len() {
+            continue;
+        }
+
+        let payload = read_u32_le(data, abs_offset + 4);
+        match read_u32_le(data, abs_offset) {
+            VT_LPSTR => {
+                let str_start = abs_offset + 8;
+                let Some(raw) = str_start
+                    .checked_add(payload as usize)
+                    .and_then(|str_end| data.get(str_start..str_end))
+                else {
+                    continue;
+                };
+                // The string is NUL-terminated; anything after the terminator is padding.
+                let text = raw.split(|&b| b == 0).next().unwrap_or_default();
+                let s = String::from_utf8_lossy(text).trim().to_owned();
+                if !s.is_empty() {
+                    match prop_id {
+                        2 => info.title = Some(s),
+                        3 => info.subject = Some(s),
+                        4 => info.author = Some(s),
+                        18 => info.creator_app = Some(s),
+                        _ => {}
+                    }
+                }
+            }
+            // `VT_I4` is signed: ignore zero and negative (corrupt) counters.
+            VT_I4 if (1..=i32::MAX.unsigned_abs()).contains(&payload) => match prop_id {
+                14 => info.page_count = Some(payload),
+                15 => info.word_count = Some(u64::from(payload)),
+                _ => {}
+            },
+            _ => {}
+        }
+    }
+}
+
+/// Read a little-endian `u16` at `offset`, or 0 if fewer than two bytes remain.
+/// Bounds are checked without `offset + 2`, so huge offsets cannot overflow.
+fn read_u16_le(data: &[u8], offset: usize) -> u16 {
+    let bytes = data.get(offset..).and_then(|tail| tail.first_chunk::<2>());
+    bytes.map_or(0, |b| u16::from_le_bytes(*b))
+}
+
+/// Read a little-endian `u32` at `offset`, or 0 if fewer than four bytes remain.
+///
+/// The zero fallback lets the best-effort parser treat truncated input as an
+/// absent value instead of failing. Bounds are checked without computing
+/// `offset + 4`, so offsets near `usize::MAX` cannot overflow and panic.
+fn read_u32_le(data: &[u8], offset: usize) -> u32 {
+    // Slice from `offset` first (rejecting offsets past the end), then take
+    // exactly four bytes from the front of what remains.
+    let bytes = data.get(offset..).and_then(|tail| tail.first_chunk::<4>());
+    bytes.map_or(0, |b| u32::from_le_bytes(*b))
+}
+
+// ─── Text-based formats (CSV/TSV/TXT/MD) ───────────────────────────────
+
+/// Count the lines of a delimited text table (CSV/TSV) without decoding it.
+///
+/// Lines are split on raw `\n` bytes, so files that are not valid UTF-8 are
+/// still counted in full; a read error ends the count with the lines seen so far.
+fn probe_text_table(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let mut reader = BufReader::new(std::fs::File::open(path).ok()?);
+    let (mut raw, mut line_count) = (Vec::new(), 0_u64);
+    while matches!(reader.read_until(b'\n', &mut raw), Ok(n) if n > 0) {
+        line_count += 1;
+        raw.clear();
+    }
+
+    Some(DocumentInfo {
+        format: ext.to_string(),
+        line_count: Some(line_count),
+        ..DocumentInfo::default()
+    })
+}
+
+/// Count lines and whitespace-separated words in a plain-text file (TXT/MD).
+///
+/// Lines are split on raw `\n` bytes and decoded lossily, so a stray non-UTF-8
+/// byte no longer truncates the counts; a read error ends them early.
+fn probe_text(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let mut reader = BufReader::new(std::fs::File::open(path).ok()?);
+    let (mut raw, mut line_count, mut word_count) = (Vec::new(), 0_u64, 0_u64);
+    while matches!(reader.read_until(b'\n', &mut raw), Ok(n) if n > 0) {
+        line_count += 1;
+        let words = String::from_utf8_lossy(&raw).split_whitespace().count();
+        word_count += words as u64;
+        raw.clear();
+    }
+
+    Some(DocumentInfo {
+        format: ext.to_string(),
+        word_count: Some(word_count),
+        line_count: Some(line_count),
+        ..DocumentInfo::default()
+    })
+}
+
+// ─── XML helpers ────────────────────────────────────────────────────────
+
+/// Strip the namespace prefix from an XML name, keeping only the local part.
+///
+/// `dc:creator` yields `creator`; a name without a colon is returned whole.
+/// Bytes that are not valid UTF-8 are replaced lossily before splitting.
+fn local_name(name: &[u8]) -> String {
+    let full = String::from_utf8_lossy(name);
+    full.rsplit(':').next().unwrap_or_default().to_owned()
+}
+
+/// Count start and self-closing tags whose local name is `element_local_name`.
+fn count_xml_elements(xml: &str, element_local_name: &str) -> u32 {
+    use quick_xml::events::Event;
+    let mut parser = quick_xml::Reader::from_str(xml);
+    let mut scratch = Vec::new();
+    let mut hits: u32 = 0;
+    loop {
+        match parser.read_event_into(&mut scratch) {
+            // Best-effort: a parse error ends the scan with the count so far.
+            Ok(Event::Eof) | Err(_) => break,
+            Ok(Event::Start(ref tag) | Event::Empty(ref tag)) => {
+                if local_name(tag.name().as_ref()) == element_local_name {
+                    hits = hits.saturating_add(1);
+                }
+            }
+            _ => {}
+        }
+        scratch.clear();
+    }
+    hits
+}
+
+#[cfg(test)]
+#[expect(clippy::unwrap_used)] // unwrapping is fine here: a failure should abort the test
+mod tests {
+    use super::*;
+
+    #[test]
+    fn local_name_strips_namespace() {
+        assert_eq!(local_name(b"dc:creator"), "creator");
+        assert_eq!(local_name(b"meta:creation-date"), "creation-date");
+        assert_eq!(local_name(b"title"), "title"); // no prefix: returned unchanged
+    }
+
+    #[test]
+    fn probe_text_counts_lines_and_words() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("test.txt");
+        std::fs::write(&path, "hello world\nfoo bar baz\n").unwrap(); // 2 lines, 5 words
+
+        let info = probe_text(&path, "txt").unwrap();
+        assert_eq!(info.format, "txt");
+        assert_eq!(info.line_count, Some(2));
+        assert_eq!(info.word_count, Some(5));
+    }
+
+    #[test]
+    fn probe_text_empty_file() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("empty.txt");
+        std::fs::write(&path, "").unwrap();
+
+        let info = probe_text(&path, "txt").unwrap(); // an empty file still probes successfully
+        assert_eq!(info.line_count, Some(0));
+        assert_eq!(info.word_count, Some(0));
+    }
+
+    #[test]
+    fn probe_text_table_counts_lines() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("data.csv");
+        std::fs::write(&path, "a,b,c\n1,2,3\n4,5,6\n").unwrap(); // header row + 2 data rows
+
+        let info = probe_text_table(&path, "csv").unwrap();
+        assert_eq!(info.format, "csv");
+        assert_eq!(info.line_count, Some(3)); // the header row is counted too
+    }
+
+    #[test]
+    fn probe_nonexistent_file_returns_none() {
+        let path = Path::new("/nonexistent/file.pdf");
+        assert!(probe_document(path).is_none());
+    }
+
+    #[test]
+    fn probe_corrupt_pdf_returns_none() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("bad.pdf");
+        std::fs::write(&path, b"this is not a pdf").unwrap(); // right extension, wrong content
+
+        assert!(probe_pdf(&path).is_none());
+    }
+
+    #[test]
+    fn probe_corrupt_zip_returns_none() {
+        let tmp = tempfile::tempdir().unwrap();
+        let path = tmp.path().join("bad.docx");
+        std::fs::write(&path, b"not a zip file").unwrap(); // right extension, wrong content
+
+        assert!(probe_ooxml_doc(&path).is_none());
+    }
+
+    #[test]
+    fn parse_core_xml_extracts_fields() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+        <cp:coreProperties xmlns:dc="http://purl.org/dc/elements/1.1/"
+            xmlns:cp="http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
+            xmlns:dcterms="http://purl.org/dc/terms/">
+            <dc:creator>Jane Doe</dc:creator>
+            <dc:title>My Document</dc:title>
+            <dc:subject>Testing</dc:subject>
+            <dcterms:created>2024-01-15T10:30:00Z</dcterms:created>
+            <dcterms:modified>2024-06-20T14:00:00Z</dcterms:modified>
+        </cp:coreProperties>"#;
+
+        let props = parse_core_xml(xml);
+        assert_eq!(props.author.as_deref(), Some("Jane Doe")); // taken from `dc:creator`
+        assert_eq!(props.title.as_deref(), Some("My Document"));
+        assert_eq!(props.subject.as_deref(), Some("Testing"));
+        assert_eq!(props.created.as_deref(), Some("2024-01-15T10:30:00Z"));
+        assert_eq!(props.modified.as_deref(), Some("2024-06-20T14:00:00Z"));
+    }
+
+    #[test]
+    fn parse_app_xml_extracts_fields() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+        <Properties xmlns="http://schemas.openxmlformats.org/officeDocument/2006/extended-properties">
+            <Application>Microsoft Word</Application>
+            <Pages>42</Pages>
+            <Words>12500</Words>
+        </Properties>"#;
+
+        let props = parse_app_xml(xml);
+        assert_eq!(props.app_name.as_deref(), Some("Microsoft Word"));
+        assert_eq!(props.pages, Some(42)); // numeric element text is parsed, not kept as a string
+        assert_eq!(props.words, Some(12500));
+    }
+
+    #[test]
+    fn parse_odf_meta_extracts_fields() {
+        let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
+        <office:document-meta xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
+            xmlns:dc="http://purl.org/dc/elements/1.1/"
+            xmlns:meta="urn:oasis:names:tc:opendocument:xmlns:meta:1.0">
+            <office:meta>
+                <meta:initial-creator>John Smith</meta:initial-creator>
+                <dc:title>ODF Doc</dc:title>
+                <dc:subject>Testing ODF</dc:subject>
+                <meta:generator>LibreOffice/7.5</meta:generator>
+                <meta:creation-date>2024-03-01T09:00:00</meta:creation-date>
+                <dc:date>2024-03-15T12:00:00</dc:date>
+                <meta:document-statistic meta:page-count="10" meta:word-count="2500"/>
+            </office:meta>
+        </office:document-meta>"#;
+
+        let meta = parse_odf_meta(xml);
+        assert_eq!(meta.author.as_deref(), Some("John Smith")); // from `meta:initial-creator`
+        assert_eq!(meta.title.as_deref(), Some("ODF Doc"));
+        assert_eq!(meta.subject.as_deref(), Some("Testing ODF"));
+        assert_eq!(meta.generator.as_deref(), Some("LibreOffice/7.5"));
+        assert_eq!(meta.page_count, Some(10)); // attributes of `meta:document-statistic`
+        assert_eq!(meta.word_count, Some(2500));
+    }
+
+    #[test]
+    fn count_xml_elements_counts_correctly() {
+        let xml = r#"<root><sheet name="A"/><sheet name="B"/><other/><sheet name="C"/></root>"#;
+        assert_eq!(count_xml_elements(xml, "sheet"), 3); // self-closing tags are counted
+        assert_eq!(count_xml_elements(xml, "other"), 1);
+        assert_eq!(count_xml_elements(xml, "missing"), 0);
+    }
+
+    #[test]
+    fn summary_info_empty_data() {
+        let mut info = DocumentInfo::default();
+        parse_summary_info(&[], &mut info); // shorter than the 48-byte header: early return
+        assert!(info.title.is_none());
+        assert!(info.author.is_none());
+    }
+}
diff --git a/src/filter.rs b/src/filter.rs
index 81329f1..0c9f5ae 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -641,6 +641,72 @@ fn resolve_field_typed<'a>(entry: &'a MediaEntry, path: &str) -> FieldValue<'a>
             .and_then(|e| e.orientation)
             .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))),
 
+        // media.doc.* — string fields
+        "media.doc.format" => entry.media.doc.as_ref().map_or(FieldValue::Null, |d| {
+            FieldValue::Str(Cow::Borrowed(&d.format))
+        }),
+        "media.doc.author" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.author.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        "media.doc.title" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.title.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        "media.doc.subject" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.subject.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        "media.doc.creator_app" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.creator_app.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        "media.doc.creation_date" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.creation_date.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        "media.doc.modification_date" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.modification_date.as_ref())
+            .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))),
+        // media.doc.* — numeric fields
+        "media.doc.page_count" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.page_count)
+            .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))),
+        "media.doc.word_count" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.word_count)
+            .map_or(FieldValue::Null, |v| FieldValue::Num(v as f64)),
+        "media.doc.line_count" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.line_count)
+            .map_or(FieldValue::Null, |v| FieldValue::Num(v as f64)),
+        "media.doc.sheet_count" => entry
+            .media
+            .doc
+            .as_ref()
+            .and_then(|d| d.sheet_count)
+            .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))),
+
         // Convenience aliases (top-level shortcuts for common fields)
         "duration_ms" => resolve_field_typed(entry, "media.duration_ms"),
         "size_bytes" => resolve_field_typed(entry, "fs.size_bytes"),
@@ -650,6 +716,8 @@ fn resolve_field_typed<'a>(entry: &'a MediaEntry, path: &str) -> FieldValue<'a>
         "bitrate_bps" | "bitrate" => resolve_field_typed(entry, "media.overall_bitrate_bps"),
         "camera" => resolve_field_typed(entry, "media.exif.camera_model"),
         "iso" => resolve_field_typed(entry, "media.exif.iso"),
+        "pages" => resolve_field_typed(entry, "media.doc.page_count"),
+        "author" => resolve_field_typed(entry, "media.doc.author"),
 
         // Unknown field
         _ => {
@@ -783,6 +851,7 @@ mod tests {
                 streams: vec![],
                 tags: MediaTags::default(),
                 exif: None,
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
@@ -1292,6 +1361,7 @@ mod tests {
                     gps_longitude: Some(139.767_125),
                     orientation: Some(1),
                 }),
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
diff --git a/src/main.rs b/src/main.rs
index 0a1c1da..2f59808 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,6 +4,7 @@
 /// output for scripts and AI agents. Think `fd` meets `ffprobe` meets `lazygit`.
 mod cli;
 mod deps;
+mod document;
 mod exif;
 mod filter;
 mod output;
@@ -218,7 +219,16 @@ async fn run_ndjson(cli: &Cli, paths: &[std::path::PathBuf]) -> Result<()> {
 async fn run_info(cli: &Cli, files: &[std::path::PathBuf]) -> Result<()> {
     let mut entries = Vec::new();
     for file in files {
-        match probe::probe_file(file, cli.timeout_ms).await {
+        let is_doc = file
+            .extension()
+            .and_then(|e| e.to_str())
+            .is_some_and(types::is_document_extension);
+        let result = if is_doc {
+            probe::probe_document_file(file).await
+        } else {
+            probe::probe_file(file, cli.timeout_ms).await
+        };
+        match result {
             Ok(entry) => entries.push(entry),
             Err(e) => {
                 tracing::error!(path = %file.display(), "error probing file: {e}");
diff --git a/src/output.rs b/src/output.rs
index 92d34b8..f44187f 100644
--- a/src/output.rs
+++ b/src/output.rs
@@ -7,7 +7,7 @@ use chrono::{DateTime, Utc};
 use serde::Serialize;
 use std::io::Write;
 
-const SCHEMA_VERSION: &str = "0.1.0";
+const SCHEMA_VERSION: &str = "0.2.0";
 const MLS_VERSION: &str = env!("CARGO_PKG_VERSION");
 
 /// Borrowing envelope for JSON serialization (avoids cloning entries).
@@ -167,6 +167,7 @@ mod tests {
                 streams: vec![],
                 tags: MediaTags::default(),
                 exif: None,
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
@@ -199,7 +200,7 @@ mod tests {
         write_json(&mut buf, &[], &[]).unwrap();
         let val: serde_json::Value = serde_json::from_slice(&buf).unwrap();
         assert_eq!(val["type"], "mls.list");
-        assert_eq!(val["schema_version"], "0.1.0");
+        assert_eq!(val["schema_version"], "0.2.0");
         assert_eq!(val["summary"]["entries_total"], 0);
     }
 
diff --git a/src/probe.rs b/src/probe.rs
index e4153d8..f512abc 100644
--- a/src/probe.rs
+++ b/src/probe.rs
@@ -126,6 +126,66 @@ pub async fn probe_file(path: &Path, timeout_ms: u64) -> Result<MediaEntry> {
     })
 }
 
+/// Probe a document file using native Rust extractors (no ffprobe).
+///
+/// Extraction is best-effort: when it yields nothing, `media.doc` stays unset.
+///
+/// # Errors
+/// Returns an error if file metadata cannot be read.
+pub async fn probe_document_file(path: &Path) -> Result<MediaEntry> {
+    let start = Instant::now();
+    let fs_meta = tokio::fs::metadata(path)
+        .await
+        .context("failed to read file metadata")?;
+
+    let extension = path
+        .extension()
+        .map_or_else(String::new, |e| e.to_string_lossy().into_owned());
+    let file_name = path
+        .file_name()
+        .map_or_else(String::new, |n| n.to_string_lossy().into_owned());
+
+    let doc_path = path.to_path_buf();
+    let doc_info = tokio::task::spawn_blocking(move || crate::document::probe_document(&doc_path))
+        .await
+        .unwrap_or_else(|e| {
+            // The blocking task panicked or was cancelled: report it, keep the entry.
+            tracing::warn!(path = %path.display(), "document extractor failed: {e}");
+            None
+        });
+    let format_name = doc_info
+        .as_ref()
+        .map_or_else(|| extension.clone(), |d| d.format.clone());
+    let took_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
+
+    Ok(MediaEntry {
+        path: path.to_path_buf(),
+        file_name,
+        extension,
+        fs: build_fs_info(&fs_meta),
+        media: MediaInfo {
+            kind: MediaKind::Document,
+            container: ContainerInfo {
+                format_name: format_name.clone(),
+                format_primary: format_name,
+            },
+            duration_ms: None,
+            overall_bitrate_bps: None,
+            video: None,
+            audio: None,
+            streams: vec![],
+            tags: MediaTags::default(),
+            exif: None,
+            doc: doc_info,
+        },
+        probe: ProbeInfo {
+            backend: Cow::Borrowed("native"),
+            took_ms,
+            error: None,
+        },
+    })
+}
+
 #[expect(
     clippy::cast_possible_wrap,
     reason = "Unix timestamp seconds fit i64 until year 2262"
@@ -250,6 +310,7 @@ fn build_media_info(raw: &FfprobeOutput, ext: &str) -> MediaInfo {
         streams,
         tags,
         exif: None,
+        doc: None,
     }
 }
 
diff --git a/src/scan.rs b/src/scan.rs
index ac6a3ef..a333d27 100644
--- a/src/scan.rs
+++ b/src/scan.rs
@@ -3,7 +3,7 @@
 /// Filters by recognized media file extensions. Uses tokio for concurrent
 /// metadata probing with configurable concurrency.
 use crate::probe;
-use crate::types::{MediaEntry, ProbeError, is_media_extension};
+use crate::types::{MediaEntry, ProbeError, is_document_extension, is_media_extension};
 use anyhow::{Context, Result};
 use std::collections::HashSet;
 use std::os::unix::fs::MetadataExt;
@@ -119,8 +119,17 @@ pub async fn probe_files(
         }
 
         let tx = tx.clone();
+        let is_doc = file
+            .extension()
+            .and_then(|e| e.to_str())
+            .is_some_and(is_document_extension);
         tasks.spawn(async move {
-            match probe::probe_file(&file, timeout_ms).await {
+            let result = if is_doc {
+                probe::probe_document_file(&file).await
+            } else {
+                probe::probe_file(&file, timeout_ms).await
+            };
+            match result {
                 Ok(entry) => {
                     let _ = tx.send(ScanResult::Entry(Box::new(entry))).await;
                 }
@@ -183,10 +192,12 @@ mod tests {
 
         fs::write(root.join("a.mp4"), b"fake").unwrap();
         fs::write(root.join("b.mp3"), b"fake").unwrap();
-        fs::write(root.join("c.txt"), b"not media").unwrap();
+        fs::write(root.join("c.txt"), b"text file").unwrap();
+        fs::write(root.join("d.xyz"), b"not media").unwrap();
 
         let files = discover_media_files(&[root.to_path_buf()], None);
-        assert_eq!(files.len(), 2);
+        // mp4, mp3, txt are recognized (3 files); xyz is not
+        assert_eq!(files.len(), 3);
     }
 
     #[test]
diff --git a/src/sort.rs b/src/sort.rs
index e5cc84c..4cc1595 100644
--- a/src/sort.rs
+++ b/src/sort.rs
@@ -21,6 +21,7 @@ pub fn parse_sort_spec(spec: &str) -> Option<(SortKey, SortDir)> {
         "resolution" => SortKey::Resolution,
         "codec" => SortKey::Codec,
         "bitrate" => SortKey::Bitrate,
+        "pages" | "page_count" => SortKey::Pages,
         _ => return None,
     };
     let dir = match dir_str {
@@ -100,6 +101,11 @@ fn compare_by_key(a: &MediaEntry, b: &MediaEntry, key: SortKey) -> std::cmp::Ord
             .media
             .overall_bitrate_bps
             .cmp(&b.media.overall_bitrate_bps),
+        SortKey::Pages => {
+            let pages_a = a.media.doc.as_ref().and_then(|d| d.page_count);
+            let pages_b = b.media.doc.as_ref().and_then(|d| d.page_count);
+            pages_a.cmp(&pages_b)
+        }
     }
 }
 
@@ -137,6 +143,7 @@ mod tests {
                 streams: vec![],
                 tags: MediaTags::default(),
                 exif: None,
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
@@ -231,6 +238,8 @@ mod tests {
             "resolution",
             "codec",
             "bitrate",
+            "pages",
+            "page_count",
         ];
         for key in keys {
             assert!(
diff --git a/src/tui/layout.rs b/src/tui/layout.rs
index 8fbcffa..7542858 100644
--- a/src/tui/layout.rs
+++ b/src/tui/layout.rs
@@ -186,6 +186,7 @@ fn render_file_list(frame: &mut Frame, app: &App, area: Rect) {
                     MediaKind::Video | MediaKind::Av => "V",
                     MediaKind::Audio => "A",
                     MediaKind::Image => "I",
+                    MediaKind::Document => "D",
                 };
 
                 let resolution = entry.media.video.as_ref().map_or_else(
@@ -513,6 +514,54 @@ fn render_metadata_text(frame: &mut Frame, entry: &crate::types::MediaEntry, are
         }
     }
 
+    if let Some(ref doc) = entry.media.doc {
+        lines.push(Line::from(""));
+        lines.push(Line::styled(
+            "── Document ──",
+            Style::default().fg(Color::Blue),
+        ));
+        lines.push(Line::from(vec![
+            Span::styled("Format: ", Style::default().fg(Color::DarkGray)),
+            Span::raw(&doc.format),
+        ]));
+        if let Some(pages) = doc.page_count {
+            lines.push(Line::from(vec![
+                Span::styled("Pages: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(format!("{pages}")),
+            ]));
+        }
+        if let Some(words) = doc.word_count {
+            lines.push(Line::from(vec![
+                Span::styled("Words: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(format!("{words}")),
+            ]));
+        }
+        if let Some(line_count) = doc.line_count {
+            lines.push(Line::from(vec![
+                Span::styled("Lines: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(format!("{line_count}")),
+            ]));
+        }
+        if let Some(sheets) = doc.sheet_count {
+            lines.push(Line::from(vec![
+                Span::styled("Sheets: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(format!("{sheets}")),
+            ]));
+        }
+        if let Some(ref author) = doc.author {
+            lines.push(Line::from(vec![
+                Span::styled("Author: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(author),
+            ]));
+        }
+        if let Some(ref title) = doc.title {
+            lines.push(Line::from(vec![
+                Span::styled("Title: ", Style::default().fg(Color::DarkGray)),
+                Span::raw(title),
+            ]));
+        }
+    }
+
     let preview = Paragraph::new(lines).wrap(Wrap { trim: true });
     frame.render_widget(preview, area);
 }
@@ -705,7 +754,7 @@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) {
     let keys = if app.triage.is_some() {
         "[y] keep  [n] delete  [m] move  [u] undo  [q] quit triage"
     } else {
-        "[j/k] nav  [Enter] open  [p] play  [/] filter  [1/2/3/4] kind  [s] sort  [t] triage  [?] help"
+        "[j/k] nav  [Enter] open  [p] play  [/] filter  [1-5] kind  [s] sort  [t] triage  [?] help"
     };
     let keybindings = Paragraph::new(Line::styled(keys, Style::default().fg(Color::DarkGray)));
     frame.render_widget(keybindings, footer_layout[1]);
@@ -732,7 +781,7 @@ fn render_help_overlay(frame: &mut Frame, area: Rect) {
         Line::from(""),
         Line::styled("Actions", Style::default().add_modifier(Modifier::BOLD)),
         Line::from("  /            Fuzzy filter (prefix = for structured)"),
-        Line::from("  1/2/3/4      Filter: All/Video/Audio/Image"),
+        Line::from("  1/2/3/4/5    Filter: All/Video/Audio/Image/Doc"),
         Line::from("  s/S          Cycle sort / reverse"),
         Line::from("  i            Toggle metadata panel"),
         Line::from("  Space        Mark/unmark file"),
diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 4cabf20..9546e64 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -48,6 +48,8 @@ pub enum KindFilter {
     Audio,
     /// Show only image files.
     Image,
+    /// Show only document files.
+    Document,
 }
 
 impl KindFilter {
@@ -59,6 +61,7 @@ impl KindFilter {
             Self::Video => "Video",
             Self::Audio => "Audio",
             Self::Image => "Image",
+            Self::Document => "Document",
         }
     }
 }
@@ -273,6 +276,7 @@ impl App {
             KindFilter::Video => matches!(entry.media.kind, MediaKind::Video | MediaKind::Av),
             KindFilter::Audio => matches!(entry.media.kind, MediaKind::Audio),
             KindFilter::Image => matches!(entry.media.kind, MediaKind::Image),
+            KindFilter::Document => matches!(entry.media.kind, MediaKind::Document),
         }
     }
 
@@ -811,6 +815,11 @@ async fn handle_key(app: &mut App, key: KeyEvent) {
             app.apply_filter();
             app.set_status("Filter: Image".to_string());
         }
+        (KeyCode::Char('5'), _) => {
+            app.kind_filter = KindFilter::Document;
+            app.apply_filter();
+            app.set_status("Filter: Document".to_string());
+        }
         // Playback
         (KeyCode::Char('p'), _) => handle_playback(app).await,
         (KeyCode::Char('P'), _) => {
@@ -994,6 +1003,7 @@ mod tests {
                 streams: vec![],
                 tags: MediaTags::default(),
                 exif: None,
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
diff --git a/src/tui/triage.rs b/src/tui/triage.rs
index 6ebd8d7..ef44f24 100644
--- a/src/tui/triage.rs
+++ b/src/tui/triage.rs
@@ -367,6 +367,7 @@ mod tests {
                 streams: vec![],
                 tags: MediaTags::default(),
                 exif: None,
+                doc: None,
             },
             probe: ProbeInfo {
                 backend: Cow::Borrowed("ffprobe"),
diff --git a/src/types.rs b/src/types.rs
index e32d2d4..7eee718 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -17,6 +17,8 @@ pub enum MediaKind {
     Av,
     /// Still image (JPEG, PNG, etc.).
     Image,
+    /// Document file (PDF, DOCX, TXT, etc.).
+    Document,
 }
 
 impl std::fmt::Display for MediaKind {
@@ -26,6 +28,7 @@ impl std::fmt::Display for MediaKind {
             Self::Audio => write!(f, "audio"),
             Self::Av => write!(f, "av"),
             Self::Image => write!(f, "image"),
+            Self::Document => write!(f, "document"),
         }
     }
 }
@@ -172,6 +175,32 @@ pub struct ExifInfo {
     pub orientation: Option<u32>,
 }
 
+/// Document metadata (PDF, DOCX, XLSX, TXT, etc.).
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct DocumentInfo {
+    pub format: String, // format label; the OLE2 and text probes store their `ext` argument
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub page_count: Option<u32>, // pages, when the document's own metadata reports them
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub word_count: Option<u64>, // from document statistics, or counted for plain text
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub line_count: Option<u64>, // filled by the text-based probes (CSV/TSV/TXT/MD)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sheet_count: Option<u32>, // presumably worksheets in a spreadsheet — TODO confirm
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub author: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub subject: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub creator_app: Option<String>, // producing application, e.g. the OLE2 `AppName` property
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub creation_date: Option<String>, // kept as a string; format presumably varies — confirm
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub modification_date: Option<String>, // kept as a string; format presumably varies — confirm
+}
+
 /// Aggregated media metadata.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct MediaInfo {
@@ -190,6 +219,8 @@ pub struct MediaInfo {
     pub tags: MediaTags,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub exif: Option<ExifInfo>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub doc: Option<DocumentInfo>,
 }
 
 /// File-system metadata.
@@ -269,6 +300,7 @@ pub enum SortKey {
     Resolution,
     Codec,
     Bitrate,
+    Pages,
 }
 
 impl SortKey {
@@ -283,6 +315,7 @@ impl SortKey {
             Self::Resolution => "resolution",
             Self::Codec => "codec",
             Self::Bitrate => "bitrate",
+            Self::Pages => "pages",
         }
     }
 
@@ -297,7 +330,8 @@ impl SortKey {
             Self::Duration => Self::Resolution,
             Self::Resolution => Self::Codec,
             Self::Codec => Self::Bitrate,
-            Self::Bitrate => Self::Path,
+            Self::Bitrate => Self::Pages,
+            Self::Pages => Self::Path,
         }
     }
 }
@@ -381,6 +415,11 @@ pub const AUDIO_EXTENSIONS: &[&str] = &[
 
 pub const IMAGE_EXTENSIONS: &[&str] = &["jpg", "jpeg", "png", "webp", "gif", "bmp", "tiff", "tif"];
 
+pub const DOCUMENT_EXTENSIONS: &[&str] = &[
+    "pdf", "docx", "doc", "odt", "xlsx", "xls", "ods", "pptx", "ppt", "odp", "csv", "tsv", "txt",
+    "md",
+];
+
 /// Check if a file extension is a recognized image type.
 #[must_use]
 pub fn is_image_extension(ext: &str) -> bool {
@@ -389,7 +428,15 @@ pub fn is_image_extension(ext: &str) -> bool {
         .any(|known| ext.eq_ignore_ascii_case(known))
 }
 
-/// Check if a file extension is a recognized media type (video, audio, or image).
+/// Check if a file extension is a recognized document type.
+#[must_use]
+pub fn is_document_extension(ext: &str) -> bool {
+    DOCUMENT_EXTENSIONS
+        .iter()
+        .any(|known| ext.eq_ignore_ascii_case(known))
+}
+
+/// Check if a file extension is a recognized media type (video, audio, image, or document).
 #[must_use]
 pub fn is_media_extension(ext: &str) -> bool {
     VIDEO_EXTENSIONS
@@ -397,6 +444,7 @@ pub fn is_media_extension(ext: &str) -> bool {
         .chain(AUDIO_EXTENSIONS.iter())
         .any(|known| ext.eq_ignore_ascii_case(known))
         || is_image_extension(ext)
+        || is_document_extension(ext)
 }
 
 /// Check if a file extension is a recognized video type.
@@ -627,13 +675,14 @@ mod tests {
         assert_eq!(SortKey::Resolution.label(), "resolution");
         assert_eq!(SortKey::Codec.label(), "codec");
         assert_eq!(SortKey::Bitrate.label(), "bitrate");
+        assert_eq!(SortKey::Pages.label(), "pages");
     }
 
     #[test]
     fn sort_key_cycle_returns_to_start() {
         let start = SortKey::Path;
         let mut current = start;
-        for _ in 0..8 {
+        for _ in 0..9 {
             current = current.next();
         }
         assert_eq!(current, start);
@@ -642,7 +691,8 @@ mod tests {
     #[test]
     fn sort_key_next_sequence() {
         assert_eq!(SortKey::Path.next(), SortKey::Name);
-        assert_eq!(SortKey::Bitrate.next(), SortKey::Path);
+        assert_eq!(SortKey::Bitrate.next(), SortKey::Pages);
+        assert_eq!(SortKey::Pages.next(), SortKey::Path);
     }
 
     // --- SortDir ---
@@ -661,6 +711,7 @@ mod tests {
         assert_eq!(MediaKind::Audio.to_string(), "audio");
         assert_eq!(MediaKind::Av.to_string(), "av");
         assert_eq!(MediaKind::Image.to_string(), "image");
+        assert_eq!(MediaKind::Document.to_string(), "document");
     }
 
     // --- Extension checks ---
@@ -695,11 +746,50 @@ mod tests {
 
     #[test]
     fn is_media_extension_rejects_unknown() {
-        assert!(!is_media_extension("txt"));
-        assert!(!is_media_extension("pdf"));
+        assert!(!is_media_extension("xyz"));
+        assert!(!is_media_extension("exe"));
         assert!(!is_media_extension(""));
     }
 
+    #[test]
+    fn is_media_extension_document() {
+        assert!(is_media_extension("pdf"));
+        assert!(is_media_extension("docx"));
+        assert!(is_media_extension("txt"));
+        assert!(is_media_extension("csv"));
+    }
+
+    #[test]
+    fn is_document_extension_accepts_documents() {
+        assert!(is_document_extension("pdf"));
+        assert!(is_document_extension("docx"));
+        assert!(is_document_extension("doc"));
+        assert!(is_document_extension("odt"));
+        assert!(is_document_extension("xlsx"));
+        assert!(is_document_extension("xls"));
+        assert!(is_document_extension("ods"));
+        assert!(is_document_extension("pptx"));
+        assert!(is_document_extension("ppt"));
+        assert!(is_document_extension("odp"));
+        assert!(is_document_extension("csv"));
+        assert!(is_document_extension("tsv"));
+        assert!(is_document_extension("txt"));
+        assert!(is_document_extension("md"));
+    }
+
+    #[test]
+    fn is_document_extension_case_insensitive() {
+        assert!(is_document_extension("PDF"));
+        assert!(is_document_extension("Docx"));
+    }
+
+    #[test]
+    fn is_document_extension_rejects_non_documents() {
+        assert!(!is_document_extension("mp4"));
+        assert!(!is_document_extension("jpg"));
+        assert!(!is_document_extension(""));
+    }
+
     #[test]
     fn is_video_extension_accepts_video() {
         assert!(is_video_extension("mkv"));
diff --git a/tests/cli.rs b/tests/cli.rs
index 37a9ba8..d8ace95 100644
--- a/tests/cli.rs
+++ b/tests/cli.rs
@@ -121,14 +121,14 @@ fn json_output_valid_schema() {
     let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
 
     assert_eq!(json["type"], "mls.list");
-    assert_eq!(json["schema_version"], "0.1.0");
+    assert_eq!(json["schema_version"], "0.2.0");
     assert!(json["entries"].is_array());
     assert!(json["summary"].is_object());
     assert!(json["summary"]["entries_total"].is_number());
 
     let entries = json["entries"].as_array().unwrap();
-    // Should find 5 media files (mp4, mkv, mp3, jpg, png) — not the .txt
-    assert_eq!(entries.len(), 5);
+    // 5 AV/image files (mp4, mkv, mp3, jpg, png) + 1 document (txt)
+    assert_eq!(entries.len(), 6);
 }
 
 #[test]
@@ -164,7 +164,7 @@ fn ndjson_has_header_and_footer() {
 
     let header: serde_json::Value = serde_json::from_str(lines[0]).unwrap();
     assert_eq!(header["type"], "mls.header");
-    assert_eq!(header["schema_version"], "0.1.0");
+    assert_eq!(header["schema_version"], "0.2.0");
 
     let footer: serde_json::Value = serde_json::from_str(lines.last().unwrap()).unwrap();
     assert_eq!(footer["type"], "mls.footer");
@@ -319,3 +319,82 @@ fn json_filter_kind_excludes_other_kinds() {
     assert_eq!(entries[0]["media"]["kind"], "audio");
     assert_eq!(entries[0]["extension"], "mp3");
 }
+
+// --- Document support ---
+
+#[test]
+fn json_documents_have_kind_document() {
+    let tmp = setup_media_dir();
+    let output = mls_cmd().arg("--json").arg(tmp.path()).output().unwrap();
+
+    assert!(output.status.success());
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+    let entries = json["entries"].as_array().unwrap();
+
+    let docs: Vec<&serde_json::Value> = entries
+        .iter()
+        .filter(|e| e["media"]["kind"] == "document")
+        .collect();
+
+    assert_eq!(docs.len(), 1, "expected 1 document entry (txt)");
+    assert_eq!(docs[0]["extension"], "txt");
+    assert_eq!(docs[0]["probe"]["backend"], "native");
+}
+
+#[test]
+fn json_filter_kind_document_returns_only_documents() {
+    let tmp = setup_media_dir();
+    let output = mls_cmd()
+        .arg("--json")
+        .arg("--filter")
+        .arg("kind == document")
+        .arg(tmp.path())
+        .output()
+        .unwrap();
+
+    assert!(output.status.success());
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+    let entries = json["entries"].as_array().unwrap();
+
+    assert_eq!(entries.len(), 1, "expected only 1 document entry");
+    assert_eq!(entries[0]["media"]["kind"], "document");
+}
+
+#[test]
+fn json_document_has_line_count() {
+    let tmp = tempfile::tempdir().unwrap();
+    fs::write(
+        tmp.path().join("notes.txt"),
+        b"line one\nline two\nline three\n",
+    )
+    .unwrap();
+
+    let output = mls_cmd().arg("--json").arg(tmp.path()).output().unwrap();
+
+    assert!(output.status.success());
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+    let entries = json["entries"].as_array().unwrap();
+
+    assert_eq!(entries.len(), 1);
+    let doc = &entries[0];
+    assert_eq!(doc["media"]["kind"], "document");
+    assert_eq!(doc["media"]["doc"]["format"], "txt");
+    assert_eq!(doc["media"]["doc"]["line_count"], 3);
+}
+
+#[test]
+fn json_sort_by_pages() {
+    let tmp = setup_media_dir();
+    let output = mls_cmd()
+        .arg("--json")
+        .arg("--sort")
+        .arg("pages")
+        .arg(tmp.path())
+        .output()
+        .unwrap();
+
+    assert!(output.status.success());
+    let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+    let entries = json["entries"].as_array().unwrap();
+    assert!(!entries.is_empty());
+}

From 63b90f57a1ceac0e34cfadb401d6cdcb6562a551 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Tue, 3 Mar 2026 15:12:36 +0530
Subject: [PATCH 02/10] feat(tui): make kind filter multi-select with toggle
 keys

Replace exclusive KindFilter enum with a struct of per-kind booleans.
Keys 2-5 now toggle individual kinds on/off; key 1 resets to show all.
Footer shows checkbox indicators for each kind's active state.
---
 src/tui/layout.rs |  46 ++++++++-----
 src/tui/mod.rs    | 163 ++++++++++++++++++++++++++++++++++------------
 2 files changed, 150 insertions(+), 59 deletions(-)

diff --git a/src/tui/layout.rs b/src/tui/layout.rs
index 7542858..ed0c929 100644
--- a/src/tui/layout.rs
+++ b/src/tui/layout.rs
@@ -732,21 +732,32 @@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) {
             Style::default().fg(Color::Red),
         )
     } else {
-        let kind_label = app.kind_filter.label();
-        Line::styled(
-            format!(
-                "{}/{} files │ Sort: {} │ [{}]",
-                if app.visible_count() == 0 {
-                    0
-                } else {
-                    app.selected + 1
-                },
-                app.visible_count(),
-                app.sort_key.label(),
-                kind_label,
-            ),
-            Style::default().fg(Color::DarkGray),
-        )
+        let prefix = format!(
+            "{}/{} files \u{2502} Sort: {} \u{2502} ",
+            if app.visible_count() == 0 {
+                0
+            } else {
+                app.selected + 1
+            },
+            app.visible_count(),
+            app.sort_key.label(),
+        );
+        let kf = &app.kind_filter;
+        let check = |on: bool| if on { "\u{2713}" } else { " " };
+        let dim = Style::default().fg(Color::DarkGray);
+        let spans = vec![
+            Span::styled(prefix, dim),
+            Span::styled("V[", dim),
+            Span::raw(check(kf.video)),
+            Span::styled("] A[", dim),
+            Span::raw(check(kf.audio)),
+            Span::styled("] I[", dim),
+            Span::raw(check(kf.image)),
+            Span::styled("] D[", dim),
+            Span::raw(check(kf.doc)),
+            Span::styled("]", dim),
+        ];
+        Line::from(spans)
     };
     frame.render_widget(Paragraph::new(status), footer_layout[0]);
 
@@ -754,7 +765,7 @@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) {
     let keys = if app.triage.is_some() {
         "[y] keep  [n] delete  [m] move  [u] undo  [q] quit triage"
     } else {
-        "[j/k] nav  [Enter] open  [p] play  [/] filter  [1-5] kind  [s] sort  [t] triage  [?] help"
+        "[j/k] nav  [Enter] open  [p] play  [/] filter  [1] all  [2-5] kind  [s] sort  [t] triage  [?] help"
     };
     let keybindings = Paragraph::new(Line::styled(keys, Style::default().fg(Color::DarkGray)));
     frame.render_widget(keybindings, footer_layout[1]);
@@ -781,7 +792,8 @@ fn render_help_overlay(frame: &mut Frame, area: Rect) {
         Line::from(""),
         Line::styled("Actions", Style::default().add_modifier(Modifier::BOLD)),
         Line::from("  /            Fuzzy filter (prefix = for structured)"),
-        Line::from("  1/2/3/4/5    Filter: All/Video/Audio/Image/Doc"),
+        Line::from("  1            Show all kinds"),
+        Line::from("  2/3/4/5      Toggle Video/Audio/Image/Doc"),
         Line::from("  s/S          Cycle sort / reverse"),
         Line::from("  i            Toggle metadata panel"),
         Line::from("  Space        Mark/unmark file"),
diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 9546e64..36b2fdd 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -37,32 +37,27 @@ pub enum FilterMode {
     Structured,
 }
 
-/// Media kind pre-filter (1/2/3/4 keys).
+/// Media kind multi-select filter (1=all, 2-5 toggle individual kinds).
+#[expect(clippy::struct_excessive_bools, reason = "one bool per UI checkbox")]
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum KindFilter {
-    /// Show all media types.
-    All,
-    /// Show only video/av files.
-    Video,
-    /// Show only audio-only files (no video stream).
-    Audio,
-    /// Show only image files.
-    Image,
-    /// Show only document files.
-    Document,
+pub struct KindFilter {
+    pub video: bool,
+    pub audio: bool,
+    pub image: bool,
+    pub doc: bool,
 }
 
 impl KindFilter {
-    /// Label for display in the footer.
+    pub const ALL: Self = Self {
+        video: true,
+        audio: true,
+        image: true,
+        doc: true,
+    };
+
     #[must_use]
-    pub fn label(self) -> &'static str {
-        match self {
-            Self::All => "All",
-            Self::Video => "Video",
-            Self::Audio => "Audio",
-            Self::Image => "Image",
-            Self::Document => "Document",
-        }
+    pub fn is_empty(self) -> bool {
+        !self.video && !self.audio && !self.image && !self.doc
     }
 }
 
@@ -206,7 +201,7 @@ impl App {
             dir_scanning: false,
             scan_concurrency,
             scan_timeout_ms,
-            kind_filter: KindFilter::All,
+            kind_filter: KindFilter::ALL,
             filter_mode: FilterMode::Fuzzy,
             filter_expr: None,
             playback_position: None,
@@ -271,12 +266,11 @@ impl App {
 
     /// Check if an entry matches the current kind filter.
     fn matches_kind(&self, entry: &MediaEntry) -> bool {
-        match self.kind_filter {
-            KindFilter::All => true,
-            KindFilter::Video => matches!(entry.media.kind, MediaKind::Video | MediaKind::Av),
-            KindFilter::Audio => matches!(entry.media.kind, MediaKind::Audio),
-            KindFilter::Image => matches!(entry.media.kind, MediaKind::Image),
-            KindFilter::Document => matches!(entry.media.kind, MediaKind::Document),
+        match entry.media.kind {
+            MediaKind::Video | MediaKind::Av => self.kind_filter.video,
+            MediaKind::Audio => self.kind_filter.audio,
+            MediaKind::Image => self.kind_filter.image,
+            MediaKind::Document => self.kind_filter.doc,
         }
     }
 
@@ -796,29 +790,36 @@ async fn handle_key(app: &mut App, key: KeyEvent) {
         }
         // Kind filter
         (KeyCode::Char('1'), _) => {
-            app.kind_filter = KindFilter::All;
+            app.kind_filter = KindFilter::ALL;
             app.apply_filter();
-            app.set_status("Filter: All".to_string());
         }
         (KeyCode::Char('2'), _) => {
-            app.kind_filter = KindFilter::Video;
+            app.kind_filter.video = !app.kind_filter.video;
             app.apply_filter();
-            app.set_status("Filter: Video".to_string());
+            if app.kind_filter.is_empty() {
+                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
+            }
         }
         (KeyCode::Char('3'), _) => {
-            app.kind_filter = KindFilter::Audio;
+            app.kind_filter.audio = !app.kind_filter.audio;
             app.apply_filter();
-            app.set_status("Filter: Audio".to_string());
+            if app.kind_filter.is_empty() {
+                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
+            }
         }
         (KeyCode::Char('4'), _) => {
-            app.kind_filter = KindFilter::Image;
+            app.kind_filter.image = !app.kind_filter.image;
             app.apply_filter();
-            app.set_status("Filter: Image".to_string());
+            if app.kind_filter.is_empty() {
+                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
+            }
         }
         (KeyCode::Char('5'), _) => {
-            app.kind_filter = KindFilter::Document;
+            app.kind_filter.doc = !app.kind_filter.doc;
             app.apply_filter();
-            app.set_status("Filter: Document".to_string());
+            if app.kind_filter.is_empty() {
+                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
+            }
         }
         // Playback
         (KeyCode::Char('p'), _) => handle_playback(app).await,
@@ -1182,7 +1183,12 @@ mod tests {
             5000,
         );
 
-        app.kind_filter = KindFilter::Video;
+        app.kind_filter = KindFilter {
+            video: true,
+            audio: false,
+            image: false,
+            doc: false,
+        };
         app.apply_filter();
 
         // Video filter matches MediaKind::Video and MediaKind::Av (2 entries).
@@ -1209,7 +1215,12 @@ mod tests {
         );
 
         // Audio filter matches only MediaKind::Audio (1 entry).
-        app.kind_filter = KindFilter::Audio;
+        app.kind_filter = KindFilter {
+            video: false,
+            audio: true,
+            image: false,
+            doc: false,
+        };
         app.apply_filter();
         assert_eq!(app.filtered_indices.len(), 1);
     }
@@ -1217,7 +1228,7 @@ mod tests {
     #[test]
     fn kind_filter_all_shows_everything() {
         let mut app = make_test_app(&["a.mp4", "b.mkv", "c.mp3"]);
-        app.kind_filter = KindFilter::All;
+        app.kind_filter = KindFilter::ALL;
         app.apply_filter();
         assert_eq!(app.filtered_indices.len(), 3);
     }
@@ -1243,13 +1254,81 @@ mod tests {
         );
 
         // Video kind filter + fuzzy "alpha" → only alpha.mp4 passes both predicates.
-        app.kind_filter = KindFilter::Video;
+        app.kind_filter = KindFilter {
+            video: true,
+            audio: false,
+            image: false,
+            doc: false,
+        };
         app.filter_text = "alpha".to_string();
         app.apply_filter();
         assert_eq!(app.filtered_indices.len(), 1);
         assert_eq!(app.entries[app.filtered_indices[0]].file_name, "alpha.mp4");
     }
 
+    #[test]
+    fn kind_filter_multi_select() {
+        let entries = vec![
+            make_entry_with_kind("video.mp4", MediaKind::Video),
+            make_entry_with_kind("song.mp3", MediaKind::Audio),
+            make_entry_with_kind("photo.jpg", MediaKind::Image),
+            make_entry_with_kind("doc.pdf", MediaKind::Document),
+        ];
+        let tmp = tempfile::tempdir().unwrap();
+        let thumb_cache = ThumbnailCache::new(10, tmp.path().to_path_buf()).unwrap();
+        let picker = Picker::halfblocks();
+        let mut app = App::new(
+            entries,
+            vec![],
+            PathBuf::from("/test"),
+            thumb_cache,
+            picker,
+            4,
+            5000,
+        );
+
+        // Video + audio enabled, image + doc disabled.
+        app.kind_filter = KindFilter {
+            video: true,
+            audio: true,
+            image: false,
+            doc: false,
+        };
+        app.apply_filter();
+        assert_eq!(app.filtered_indices.len(), 2);
+    }
+
+    #[test]
+    fn kind_filter_empty_shows_nothing() {
+        let entries = vec![
+            make_entry_with_kind("video.mp4", MediaKind::Video),
+            make_entry_with_kind("song.mp3", MediaKind::Audio),
+        ];
+        let tmp = tempfile::tempdir().unwrap();
+        let thumb_cache = ThumbnailCache::new(10, tmp.path().to_path_buf()).unwrap();
+        let picker = Picker::halfblocks();
+        let mut app = App::new(
+            entries,
+            vec![],
+            PathBuf::from("/test"),
+            thumb_cache,
+            picker,
+            4,
+            5000,
+        );
+
+        // All kinds disabled → empty list.
+        app.kind_filter = KindFilter {
+            video: false,
+            audio: false,
+            image: false,
+            doc: false,
+        };
+        app.apply_filter();
+        assert!(app.filtered_indices.is_empty());
+        assert!(app.kind_filter.is_empty());
+    }
+
     #[test]
     fn filter_mode_defaults_to_fuzzy() {
         let app = make_test_app(&["a.mp4"]);

From f029fc2e0930f3974b44b5ad227da5f23f64f36b Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Tue, 3 Mar 2026 15:57:18 +0530
Subject: [PATCH 03/10] fix(tui): preserve selection when navigating back to
 parent directory

Save current_dir before navigating to parent, then restore cursor
to the child directory in the new dir_items list. Also fix
apply_filter clamping to use visible_count() (dirs + media) instead
of filtered_indices.len() (media only).
---
 src/tui/mod.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 36b2fdd..0aeacb0 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -258,9 +258,9 @@ impl App {
                 FilterMode::Structured => self.apply_structured_filter(&kind_indices),
             }
         }
-        // Keep selected index in bounds
-        if self.selected >= self.filtered_indices.len() {
-            self.selected = self.filtered_indices.len().saturating_sub(1);
+        // Keep selected index in bounds (dirs + media)
+        if self.selected >= self.visible_count() {
+            self.selected = self.visible_count().saturating_sub(1);
         }
     }
 
@@ -850,7 +850,11 @@ async fn handle_key(app: &mut App, key: KeyEvent) {
             if app.current_dir != app.root_dir
                 && let Some(parent) = app.current_dir.parent().map(std::path::Path::to_path_buf)
             {
+                let child = app.current_dir.clone();
                 app.navigate_to_dir(parent);
+                if let Some(idx) = app.dir_items.iter().position(|d| *d == child) {
+                    app.selected = idx;
+                }
             }
         }
         _ => {}

From 87793a7a7f3bd3911c2fec3aac34d8d15abb3634 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Tue, 3 Mar 2026 16:34:34 +0530
Subject: [PATCH 04/10] feat(tui): sort folders by current sort key (dirs
 always on top)

Folders now respect the user's sort key (name, size, modified) instead
of always being alphabetical. Media-only keys (duration, resolution,
codec, bitrate, pages) fall back to name sort for directories.

Introduce DirItem struct to carry cached metadata (name, size,
modified_at) alongside directory paths, avoiding re-stat on every sort.
Decouple list_sibling_dirs from list_subdirs so the parent pane stays
alphabetical while the current pane sorts dynamically.
---
 src/sort.rs       | 112 +++++++++++++++++++++++++++++++++++++++++++++-
 src/tui/layout.rs |   5 +--
 src/tui/mod.rs    |  88 ++++++++++++++++++++++++++----------
 src/types.rs      |  21 +++++++++
 4 files changed, 196 insertions(+), 30 deletions(-)

diff --git a/src/sort.rs b/src/sort.rs
index 4cc1595..9c9aa48 100644
--- a/src/sort.rs
+++ b/src/sort.rs
@@ -1,7 +1,7 @@
-/// Sorting logic for media entries.
+/// Sorting logic for media entries and directory items.
 ///
 /// Supports sorting by all metadata fields with configurable direction.
-use crate::types::{MediaEntry, SortDir, SortKey};
+use crate::types::{DirItem, MediaEntry, SortDir, SortKey};
 
 /// Parse a sort specification string (e.g., "`duration_ms:desc`", "name:asc").
 ///
@@ -60,6 +60,29 @@ pub fn sort_entries(entries: &mut [MediaEntry], key: SortKey, dir: SortDir) {
     });
 }
 
+/// Sort directory items in place by the given key and direction.
+///
+/// Falls back to Name sort for media-only keys (Duration, Resolution, etc.).
+pub fn sort_dir_items(dirs: &mut [DirItem], key: SortKey, dir: SortDir) {
+    let effective_key = if key.applies_to_dirs() {
+        key
+    } else {
+        SortKey::Name
+    };
+    dirs.sort_by(|a, b| {
+        let cmp = match effective_key {
+            SortKey::Size => a.size_bytes.cmp(&b.size_bytes),
+            SortKey::Modified => a.modified_at.cmp(&b.modified_at),
+            // Name, Path, and any fallback: sort by lowercased name
+            _ => a.name_lower.cmp(&b.name_lower),
+        };
+        match dir {
+            SortDir::Asc => cmp,
+            SortDir::Desc => cmp.reverse(),
+        }
+    });
+}
+
 fn compare_by_key(a: &MediaEntry, b: &MediaEntry, key: SortKey) -> std::cmp::Ordering {
     match key {
         SortKey::Path => a.path.cmp(&b.path),
@@ -414,4 +437,89 @@ mod tests {
         assert_eq!(entries[0].file_name, "no_duration.mp4");
         assert_eq!(entries[1].file_name, "has_duration.mp4");
     }
+
+    // --- sort_dir_items ---
+
+    fn make_dir_item(name: &str, size: u64, modified: Option<std::time::SystemTime>) -> DirItem {
+        DirItem {
+            path: PathBuf::from(format!("/test/{name}")),
+            name: name.to_string(),
+            name_lower: name.to_lowercase(),
+            size_bytes: size,
+            modified_at: modified,
+        }
+    }
+
+    #[test]
+    fn sort_dir_items_by_name_asc() {
+        let mut dirs = vec![
+            make_dir_item("Zebra", 0, None),
+            make_dir_item("alpha", 0, None),
+            make_dir_item("middle", 0, None),
+        ];
+        sort_dir_items(&mut dirs, SortKey::Name, SortDir::Asc);
+        let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect();
+        assert_eq!(names, vec!["alpha", "middle", "Zebra"]);
+    }
+
+    #[test]
+    fn sort_dir_items_by_name_desc() {
+        let mut dirs = vec![
+            make_dir_item("alpha", 0, None),
+            make_dir_item("Zebra", 0, None),
+            make_dir_item("middle", 0, None),
+        ];
+        sort_dir_items(&mut dirs, SortKey::Name, SortDir::Desc);
+        let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect();
+        assert_eq!(names, vec!["Zebra", "middle", "alpha"]);
+    }
+
+    #[test]
+    fn sort_dir_items_by_size_asc() {
+        let mut dirs = vec![
+            make_dir_item("big", 300, None),
+            make_dir_item("small", 100, None),
+            make_dir_item("medium", 200, None),
+        ];
+        sort_dir_items(&mut dirs, SortKey::Size, SortDir::Asc);
+        let sizes: Vec<u64> = dirs.iter().map(|d| d.size_bytes).collect();
+        assert_eq!(sizes, vec![100, 200, 300]);
+    }
+
+    #[test]
+    fn sort_dir_items_by_modified() {
+        use std::time::{Duration, SystemTime};
+        let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(1000);
+        let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(2000);
+        let t3 = SystemTime::UNIX_EPOCH + Duration::from_secs(3000);
+        let mut dirs = vec![
+            make_dir_item("newest", 0, Some(t3)),
+            make_dir_item("oldest", 0, Some(t1)),
+            make_dir_item("middle", 0, Some(t2)),
+        ];
+        sort_dir_items(&mut dirs, SortKey::Modified, SortDir::Asc);
+        let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect();
+        assert_eq!(names, vec!["oldest", "middle", "newest"]);
+    }
+
+    #[test]
+    fn sort_dir_items_media_key_falls_back_to_name() {
+        let mut dirs = vec![
+            make_dir_item("Zebra", 0, None),
+            make_dir_item("alpha", 0, None),
+        ];
+        // Duration is media-only, should fall back to Name sort
+        sort_dir_items(&mut dirs, SortKey::Duration, SortDir::Asc);
+        let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect();
+        assert_eq!(names, vec!["alpha", "Zebra"]);
+
+        // Codec too
+        let mut dirs2 = vec![
+            make_dir_item("Zebra", 0, None),
+            make_dir_item("alpha", 0, None),
+        ];
+        sort_dir_items(&mut dirs2, SortKey::Codec, SortDir::Asc);
+        let names2: Vec<&str> = dirs2.iter().map(|d| d.name.as_str()).collect();
+        assert_eq!(names2, vec!["alpha", "Zebra"]);
+    }
 }
diff --git a/src/tui/layout.rs b/src/tui/layout.rs
index ed0c929..8b91e82 100644
--- a/src/tui/layout.rs
+++ b/src/tui/layout.rs
@@ -161,10 +161,7 @@ fn render_file_list(frame: &mut Frame, app: &App, area: Rect) {
             if vis_idx < dir_count {
                 // Directory item
                 let dir = &app.dir_items[vis_idx];
-                let name = dir
-                    .file_name()
-                    .map_or_else(|| ".".to_string(), |n| n.to_string_lossy().into_owned());
-                let line = format!("  D {name}/");
+                let line = format!("  D {}/", dir.name);
                 let style = if is_selected {
                     Style::default()
                         .bg(Color::DarkGray)
diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 0aeacb0..72eb7ae 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -9,9 +9,9 @@ pub mod triage;
 use crate::filter::Filter;
 use crate::playback::{MpvController, PlaybackState};
 use crate::scan;
-use crate::sort::sort_entries;
+use crate::sort::{sort_dir_items, sort_entries};
 use crate::thumbnail::ThumbnailCache;
-use crate::types::{MediaEntry, MediaKind, ProbeError, SortDir, SortKey};
+use crate::types::{DirItem, MediaEntry, MediaKind, ProbeError, SortDir, SortKey};
 use anyhow::{Context, Result};
 use crossterm::ExecutableCommand;
 use crossterm::event::{self, Event, KeyCode, KeyEvent, KeyModifiers};
@@ -132,8 +132,8 @@ pub struct App {
     filter_mode: FilterMode,
     /// Last successfully parsed structured filter expression.
     filter_expr: Option<Filter>,
-    /// Subdirectories of `current_dir`, sorted alphabetically.
-    dir_items: Vec<PathBuf>,
+    /// Subdirectories of `current_dir`, sorted by current sort key.
+    dir_items: Vec<DirItem>,
     /// Cached sibling directories (for parent pane rendering).
     sibling_dirs: Vec<PathBuf>,
     /// Receiver for async directory scan results.
@@ -226,11 +226,11 @@ impl App {
             .and_then(|&idx| self.entries.get(idx))
     }
 
-    /// Get the currently selected directory (if any).
+    /// Get the currently selected directory path (if any).
     #[must_use]
     pub fn selected_dir(&self) -> Option<&PathBuf> {
         if self.selected < self.dir_items.len() {
-            Some(&self.dir_items[self.selected])
+            Some(&self.dir_items[self.selected].path)
         } else {
             None
         }
@@ -328,8 +328,9 @@ impl App {
         }
     }
 
-    /// Apply current sort to entries and rebuild indices.
+    /// Apply current sort to entries and dir items, then rebuild indices.
     fn apply_sort(&mut self) {
+        sort_dir_items(&mut self.dir_items, self.sort_key, self.sort_dir);
         sort_entries(&mut self.entries, self.sort_key, self.sort_dir);
         self.apply_filter();
     }
@@ -513,6 +514,7 @@ impl App {
     /// Navigate to a directory: load subdirs, clear state, spawn async scan.
     fn navigate_to_dir(&mut self, path: PathBuf) {
         self.dir_items = list_subdirs(&path);
+        sort_dir_items(&mut self.dir_items, self.sort_key, self.sort_dir);
         self.sibling_dirs = list_sibling_dirs(&path);
 
         // Clear media state (but NOT mpv playback — per spec)
@@ -578,23 +580,51 @@ impl App {
     }
 }
 
-/// List subdirectories of a path, sorted alphabetically.
-fn list_subdirs(path: &std::path::Path) -> Vec<PathBuf> {
+/// List subdirectories of a path as `DirItem`s, sorted alphabetically by name.
+fn list_subdirs(path: &std::path::Path) -> Vec<DirItem> {
     let Ok(entries) = std::fs::read_dir(path) else {
         return vec![];
     };
-    let mut dirs: Vec<PathBuf> = entries
+    let mut dirs: Vec<DirItem> = entries
         .flatten()
-        .filter(|e| e.path().is_dir())
-        .map(|e| e.path())
+        .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir()))
+        .filter_map(|e| {
+            let path = e.path();
+            let name = path.file_name().map(|n| n.to_string_lossy().into_owned())?;
+            let name_lower = name.to_lowercase();
+            let meta = e.metadata().ok();
+            let size_bytes = meta.as_ref().map_or(0, std::fs::Metadata::len);
+            let modified_at = meta.as_ref().and_then(|m| m.modified().ok());
+            Some(DirItem {
+                path,
+                name,
+                name_lower,
+                size_bytes,
+                modified_at,
+            })
+        })
         .collect();
-    dirs.sort();
+    dirs.sort_by(|a, b| a.name_lower.cmp(&b.name_lower));
     dirs
 }
 
 /// List sibling directories (dirs in parent) for the parent pane.
+///
+/// Returns plain `PathBuf`s since the parent pane is always alphabetical.
 fn list_sibling_dirs(path: &std::path::Path) -> Vec<PathBuf> {
-    path.parent().map_or_else(Vec::new, list_subdirs)
+    let Some(parent) = path.parent() else {
+        return vec![];
+    };
+    let Ok(entries) = std::fs::read_dir(parent) else {
+        return vec![];
+    };
+    let mut dirs: Vec<PathBuf> = entries
+        .flatten()
+        .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir()))
+        .map(|e| e.path())
+        .collect();
+    dirs.sort();
+    dirs
 }
 
 /// Run the TUI application.
@@ -852,7 +882,7 @@ async fn handle_key(app: &mut App, key: KeyEvent) {
             {
                 let child = app.current_dir.clone();
                 app.navigate_to_dir(parent);
-                if let Some(idx) = app.dir_items.iter().position(|d| *d == child) {
+                if let Some(idx) = app.dir_items.iter().position(|d| d.path == child) {
                     app.selected = idx;
                 }
             }
@@ -1018,6 +1048,19 @@ mod tests {
         }
     }
 
+    fn make_dir_item(path: &str) -> DirItem {
+        let name = std::path::Path::new(path)
+            .file_name()
+            .map_or_else(|| ".".to_string(), |n| n.to_string_lossy().into_owned());
+        DirItem {
+            path: PathBuf::from(path),
+            name_lower: name.to_lowercase(),
+            name,
+            size_bytes: 0,
+            modified_at: None,
+        }
+    }
+
     fn make_test_app(names: &[&str]) -> App {
         let entries: Vec<MediaEntry> = names.iter().map(|n| make_entry(n)).collect();
         let tmp = tempfile::tempdir().unwrap();
@@ -1344,8 +1387,8 @@ mod tests {
     fn selected_dir_returns_path_when_dir_selected() {
         let mut app = make_test_app(&["a.mp4"]);
         app.dir_items = vec![
-            PathBuf::from("/test/subdir1"),
-            PathBuf::from("/test/subdir2"),
+            make_dir_item("/test/subdir1"),
+            make_dir_item("/test/subdir2"),
         ];
         app.selected = 0;
         assert_eq!(app.selected_dir(), Some(&PathBuf::from("/test/subdir1")));
@@ -1355,7 +1398,7 @@ mod tests {
     #[test]
     fn selected_entry_offsets_correctly() {
         let mut app = make_test_app(&["a.mp4", "b.mkv"]);
-        app.dir_items = vec![PathBuf::from("/test/subdir")];
+        app.dir_items = vec![make_dir_item("/test/subdir")];
         // selected=0 → directory
         app.selected = 0;
         assert!(app.selected_entry().is_none());
@@ -1378,7 +1421,7 @@ mod tests {
     #[test]
     fn visible_count_includes_dirs() {
         let mut app = make_test_app(&["a.mp4"]);
-        app.dir_items = vec![PathBuf::from("/test/d1"), PathBuf::from("/test/d2")];
+        app.dir_items = vec![make_dir_item("/test/d1"), make_dir_item("/test/d2")];
         assert_eq!(app.visible_count(), 3); // 2 dirs + 1 media
     }
 
@@ -1408,10 +1451,7 @@ mod tests {
         std::fs::create_dir(root.join("middle")).unwrap();
 
         let dirs = super::list_subdirs(root);
-        let names: Vec<String> = dirs
-            .iter()
-            .map(|d| d.file_name().unwrap().to_string_lossy().into_owned())
-            .collect();
+        let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect();
         assert_eq!(names, vec!["alpha", "middle", "zebra"]);
     }
 
@@ -1442,7 +1482,7 @@ mod tests {
     #[test]
     fn toggle_mark_on_dir_is_noop() {
         let mut app = make_test_app(&["a.mp4"]);
-        app.dir_items = vec![PathBuf::from("/d")];
+        app.dir_items = vec![make_dir_item("/d")];
         app.selected = 0;
         app.toggle_mark();
         assert!(app.marked.is_empty());
diff --git a/src/types.rs b/src/types.rs
index 7eee718..817024e 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -289,6 +289,18 @@ pub enum NdjsonRecord {
     },
 }
 
+/// Directory entry with cached metadata for sorting.
+#[derive(Debug, Clone)]
+pub struct DirItem {
+    pub path: PathBuf,
+    /// Display name (original case).
+    pub name: String,
+    /// Pre-lowercased name for case-insensitive sorting.
+    pub name_lower: String,
+    pub size_bytes: u64,
+    pub modified_at: Option<std::time::SystemTime>,
+}
+
 /// Sort key for media entries.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum SortKey {
@@ -319,6 +331,15 @@ impl SortKey {
         }
     }
 
+    /// Whether this sort key applies to directories.
+    ///
+    /// Media-only keys (Duration, Resolution, Codec, Bitrate, Pages) return
+    /// `false` — callers should fall back to Name sort for dirs.
+    #[must_use]
+    pub fn applies_to_dirs(self) -> bool {
+        matches!(self, Self::Path | Self::Name | Self::Size | Self::Modified)
+    }
+
     /// Cycle to next sort key.
     #[must_use]
     pub fn next(self) -> Self {

From 68a4b067c671322e8bbf88a8082c9ff6c4f2a099 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:02:23 +0530
Subject: [PATCH 05/10] fix(document): bound all file reads to prevent OOM on
 crafted inputs

- Cap zip XML reads at 4 MiB via `take(MAX_XML_BYTES)` in `read_xml_from_zip`
- Cap text/CSV scanning at 256 MiB via `take(MAX_TEXT_SCAN_BYTES)`
- Replace `BufReader::lines()` with byte-level `\n` counting in
  `probe_text_table`; a final line without a trailing newline is no
  longer counted
- Replace `BufReader::lines()` with a `read_line` loop over the capped
  reader in `probe_text`

Addresses security review comments about unbounded reads.
---
 src/document.rs | 46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/src/document.rs b/src/document.rs
index bb6a129..e720656 100644
--- a/src/document.rs
+++ b/src/document.rs
@@ -3,9 +3,17 @@
 /// All extractors are best-effort: failures are silently swallowed and
 /// logged at debug level. Follows the same pattern as `exif.rs`.
 use crate::types::DocumentInfo;
-use std::io::{BufRead, BufReader};
+use std::io::{BufRead, BufReader, Read};
 use std::path::Path;
 
+/// Maximum bytes to read from a single XML file inside a zip archive (4 MiB).
+/// Prevents OOM from crafted archives with enormous embedded XML.
+const MAX_XML_BYTES: u64 = 4 * 1024 * 1024;
+
+/// Maximum bytes to scan from a text/CSV/TSV file (256 MiB).
+/// Prevents scanning multi-GB log files or data dumps.
+const MAX_TEXT_SCAN_BYTES: u64 = 256 * 1024 * 1024;
+
 /// Extract metadata from a document file, dispatching by extension.
 ///
 /// Returns `None` on any failure (corrupt file, unsupported format, I/O error).
@@ -88,9 +96,9 @@ fn pdf_object_to_string(obj: &lopdf::Object) -> Option<String> {
 fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option<String> {
     let file = std::fs::File::open(path).ok()?;
     let mut archive = zip::ZipArchive::new(file).ok()?;
-    let mut entry = archive.by_name(inner_path).ok()?;
+    let entry = archive.by_name(inner_path).ok()?;
     let mut contents = String::new();
-    std::io::Read::read_to_string(&mut entry, &mut contents).ok()?;
+    entry.take(MAX_XML_BYTES).read_to_string(&mut contents).ok()?;
     Some(contents)
 }
 
@@ -507,15 +515,11 @@ fn read_u32_le(data: &[u8], offset: usize) -> u32 {
 
 fn probe_text_table(path: &Path, ext: &str) -> Option<DocumentInfo> {
     let file = std::fs::File::open(path).ok()?;
-    let reader = BufReader::new(file);
-    let mut line_count: u64 = 0;
-
-    for line in reader.lines() {
-        if line.is_err() {
-            break;
-        }
-        line_count += 1;
-    }
+    let reader = BufReader::new(file.take(MAX_TEXT_SCAN_BYTES));
+    let line_count = reader
+        .bytes()
+        .filter(|b| b.as_ref().is_ok_and(|&c| c == b'\n'))
+        .count() as u64;
 
     Some(DocumentInfo {
         format: ext.to_string(),
@@ -526,16 +530,20 @@ fn probe_text_table(path: &Path, ext: &str) -> Option<DocumentInfo> {
 
 fn probe_text(path: &Path, ext: &str) -> Option<DocumentInfo> {
     let file = std::fs::File::open(path).ok()?;
-    let reader = BufReader::new(file);
+    let mut reader = BufReader::new(file.take(MAX_TEXT_SCAN_BYTES));
     let mut line_count: u64 = 0;
     let mut word_count: u64 = 0;
+    let mut buf = String::new();
 
-    for line in reader.lines() {
-        let Ok(line) = line else {
-            break;
-        };
-        line_count += 1;
-        word_count += line.split_whitespace().count() as u64;
+    loop {
+        buf.clear();
+        match reader.read_line(&mut buf) {
+            Ok(0) | Err(_) => break,
+            Ok(_) => {
+                line_count += 1;
+                word_count += buf.split_whitespace().count() as u64;
+            }
+        }
     }
 
     Some(DocumentInfo {

From 408746e47fb3e79509465bfd742e8d0aa64741a9 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:02:43 +0530
Subject: [PATCH 06/10] fix(scan): wrap document probes in timeout to match
 media probe behavior

Document probes had no timeout, unlike ffprobe-based media probes.
A stuck document read (e.g., network-mounted PDF) would block
the scan task indefinitely.
---
 src/scan.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/scan.rs b/src/scan.rs
index a333d27..2d1eb82 100644
--- a/src/scan.rs
+++ b/src/scan.rs
@@ -125,7 +125,15 @@ pub async fn probe_files(
             .is_some_and(is_document_extension);
         tasks.spawn(async move {
             let result = if is_doc {
-                probe::probe_document_file(&file).await
+                tokio::time::timeout(
+                    std::time::Duration::from_millis(timeout_ms),
+                    probe::probe_document_file(&file),
+                )
+                .await
+                .unwrap_or_else(|_| {
+                    tracing::debug!(path = %file.display(), "document probe timed out");
+                    Err(anyhow::anyhow!("document probe timed out"))
+                })
             } else {
                 probe::probe_file(&file, timeout_ms).await
             };

From 72744d8a2dba09c17f22092bf55305cd28b3331e Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:03:10 +0530
Subject: [PATCH 07/10] fix(tui): follow symlinks in directory listings

Replace `DirEntry::file_type().is_dir()` with `Path::is_dir()` in both
`list_subdirs` and `list_sibling_dirs`. The former doesn't follow
symlinks, causing symlinked directories to disappear from the TUI.
---
 src/tui/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 72eb7ae..7984089 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -587,7 +587,7 @@ fn list_subdirs(path: &std::path::Path) -> Vec<DirItem> {
     };
     let mut dirs: Vec<DirItem> = entries
         .flatten()
-        .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir()))
+        .filter(|e| e.path().is_dir())
         .filter_map(|e| {
             let path = e.path();
             let name = path.file_name().map(|n| n.to_string_lossy().into_owned())?;
@@ -620,7 +620,7 @@ fn list_sibling_dirs(path: &std::path::Path) -> Vec<PathBuf> {
     };
     let mut dirs: Vec<PathBuf> = entries
         .flatten()
-        .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir()))
+        .filter(|e| e.path().is_dir())
         .map(|e| e.path())
         .collect();
     dirs.sort();

From 5459c06791c0288d1059080b9fb6ead55e8f6462 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:04:36 +0530
Subject: [PATCH 08/10] refactor(document): extract parse_xml_text_fields
 helper

Deduplicates the identical XML event-loop boilerplate shared by
parse_core_xml and parse_app_xml. Each is now a thin wrapper that
passes a closure for tag-to-field dispatch.
---
 src/document.rs | 109 ++++++++++++++++++++++--------------------------
 1 file changed, 50 insertions(+), 59 deletions(-)

diff --git a/src/document.rs b/src/document.rs
index e720656..71db5c3 100644
--- a/src/document.rs
+++ b/src/document.rs
@@ -136,36 +136,14 @@ fn parse_core_xml(xml: &str) -> OoxmlCoreProps {
         modified: None,
     };
 
-    let mut reader = quick_xml::Reader::from_str(xml);
-    let mut buf = Vec::new();
-    let mut current_tag = String::new();
-
-    loop {
-        match reader.read_event_into(&mut buf) {
-            Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => {
-                current_tag = local_name(e.name().as_ref());
-            }
-            Ok(quick_xml::events::Event::Text(ref e)) => {
-                let text = e.unescape().ok().map(|s| s.trim().to_owned());
-                if let Some(val) = text.filter(|s| !s.is_empty()) {
-                    match current_tag.as_str() {
-                        "creator" => props.author = Some(val),
-                        "title" => props.title = Some(val),
-                        "subject" => props.subject = Some(val),
-                        "created" => props.created = Some(val),
-                        "modified" => props.modified = Some(val),
-                        _ => {}
-                    }
-                }
-            }
-            Ok(quick_xml::events::Event::End(_)) => {
-                current_tag.clear();
-            }
-            Ok(quick_xml::events::Event::Eof) | Err(_) => break,
-            _ => {}
-        }
-        buf.clear();
-    }
+    parse_xml_text_fields(xml, |tag, val| match tag {
+        "creator" => props.author = Some(val),
+        "title" => props.title = Some(val),
+        "subject" => props.subject = Some(val),
+        "created" => props.created = Some(val),
+        "modified" => props.modified = Some(val),
+        _ => {}
+    });
 
     props
 }
@@ -178,35 +156,13 @@ fn parse_app_xml(xml: &str) -> OoxmlAppProps {
         app_name: None,
     };
 
-    let mut reader = quick_xml::Reader::from_str(xml);
-    let mut buf = Vec::new();
-    let mut current_tag = String::new();
-
-    loop {
-        match reader.read_event_into(&mut buf) {
-            Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => {
-                current_tag = local_name(e.name().as_ref());
-            }
-            Ok(quick_xml::events::Event::Text(ref e)) => {
-                let text = e.unescape().ok().map(|s| s.trim().to_owned());
-                if let Some(val) = text.filter(|s| !s.is_empty()) {
-                    match current_tag.as_str() {
-                        "Pages" => props.pages = val.parse().ok(),
-                        "Words" => props.words = val.parse().ok(),
-                        "Slides" => props.slides = val.parse().ok(),
-                        "Application" => props.app_name = Some(val),
-                        _ => {}
-                    }
-                }
-            }
-            Ok(quick_xml::events::Event::End(_)) => {
-                current_tag.clear();
-            }
-            Ok(quick_xml::events::Event::Eof) | Err(_) => break,
-            _ => {}
-        }
-        buf.clear();
-    }
+    parse_xml_text_fields(xml, |tag, val| match tag {
+        "Pages" => props.pages = val.parse().ok(),
+        "Words" => props.words = val.parse().ok(),
+        "Slides" => props.slides = val.parse().ok(),
+        "Application" => props.app_name = Some(val),
+        _ => {}
+    });
 
     props
 }
@@ -556,6 +512,41 @@ fn probe_text(path: &Path, ext: &str) -> Option<DocumentInfo> {
 
 // ─── XML helpers ────────────────────────────────────────────────────────
 
+/// Run the `quick_xml` event loop and call `on_field` for each tag-text pair.
+///
+/// Shared by `parse_core_xml` and `parse_app_xml` which differ only in
+/// which tags they care about. Not used by `parse_odf_meta`, which also
+/// reads attributes.
+fn parse_xml_text_fields(xml: &str, mut on_field: impl FnMut(&str, String)) {
+    use quick_xml::events::Event;
+
+    let mut reader = quick_xml::Reader::from_str(xml);
+    let mut buf = Vec::new();
+    let mut current_tag = String::new();
+
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Start(ref e) | Event::Empty(ref e)) => {
+                current_tag = local_name(e.name().as_ref());
+            }
+            Ok(Event::Text(ref e)) => {
+                if let Some(val) = e
+                    .unescape()
+                    .ok()
+                    .map(|s| s.trim().to_owned())
+                    .filter(|s| !s.is_empty())
+                {
+                    on_field(&current_tag, val);
+                }
+            }
+            Ok(Event::End(_)) => current_tag.clear(),
+            Ok(Event::Eof) | Err(_) => break,
+            _ => {}
+        }
+        buf.clear();
+    }
+}
+
 /// Extract the local part of a possibly namespaced XML name.
 ///
 /// Examples: `dc:creator` becomes `creator`, `meta:creation-date` becomes `creation-date`.

From 6fad9c507a8f449222bd317a22c48bf5e8ee6329 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:05:43 +0530
Subject: [PATCH 09/10] refactor(document): replace OLE2 magic numbers with
 named constants

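Adds OLEPS_*, VT_*, and PIDSI_* constants for the MS-OLEPS binary
format values used in parse_summary_info. Makes the binary format
parsing self-documenting.

Also folds the two-part `section_offset` guard into a single
header-size bounds check and drops the doc-comment list of property
IDs, which the PIDSI_* constant names now cover.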
---
 src/document.rs | 53 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 20 deletions(-)

diff --git a/src/document.rs b/src/document.rs
index 71db5c3..2c05a95 100644
--- a/src/document.rs
+++ b/src/document.rs
@@ -376,29 +376,44 @@ fn probe_ole2(path: &Path, ext: &str) -> Option<DocumentInfo> {
     Some(info)
 }
 
+// MS-OLEPS binary format constants
+const OLEPS_BYTE_ORDER_LE: u16 = 0xFFFE;
+const OLEPS_HEADER_MIN_LEN: usize = 48;
+const OLEPS_SECTION_OFFSET_POS: usize = 44;
+const OLEPS_MAX_PROPS: usize = 100;
+const OLEPS_SECTION_HEADER_SIZE: usize = 8;
+const OLEPS_PROP_ENTRY_SIZE: usize = 8;
+const VT_I4: u32 = 0x03;
+const VT_LPSTR: u32 = 0x1E;
+// Property IDs from the Summary Information property set
+const PIDSI_TITLE: u32 = 2;
+const PIDSI_AUTHOR: u32 = 4;
+const PIDSI_SUBJECT: u32 = 5;
+const PIDSI_PAGECOUNT: u32 = 14;
+const PIDSI_WORDCOUNT: u32 = 15;
+const PIDSI_APPNAME: u32 = 18;
+
 /// Best-effort extraction from OLE2 `SummaryInformation` stream.
 ///
 /// The stream uses MS-OLEPS binary format with property sets.
-/// We extract string properties by well-known IDs
-/// (2=Title, 4=Author, 5=Subject, 18=`AppName`).
 fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) {
-    if data.len() < 48 || read_u16_le(data, 0) != 0xFFFE {
+    if data.len() < OLEPS_HEADER_MIN_LEN || read_u16_le(data, 0) != OLEPS_BYTE_ORDER_LE {
         return;
     }
 
-    let section_offset = read_u32_le(data, 44) as usize;
-    if section_offset >= data.len() || section_offset + 8 > data.len() {
+    let section_offset = read_u32_le(data, OLEPS_SECTION_OFFSET_POS) as usize;
+    if section_offset + OLEPS_SECTION_HEADER_SIZE > data.len() {
         return;
     }
 
     let prop_count = read_u32_le(data, section_offset + 4) as usize;
-    if prop_count > 100 {
+    if prop_count > OLEPS_MAX_PROPS {
         return;
     }
 
     for i in 0..prop_count {
-        let entry_offset = section_offset + 8 + i * 8;
-        if entry_offset + 8 > data.len() {
+        let entry_offset = section_offset + OLEPS_SECTION_HEADER_SIZE + i * OLEPS_PROP_ENTRY_SIZE;
+        if entry_offset + OLEPS_PROP_ENTRY_SIZE > data.len() {
             break;
         }
 
@@ -406,16 +421,15 @@ fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) {
         let prop_offset = read_u32_le(data, entry_offset + 4) as usize;
         let abs_offset = section_offset + prop_offset;
 
-        if abs_offset + 8 > data.len() {
+        if abs_offset + OLEPS_SECTION_HEADER_SIZE > data.len() {
             continue;
         }
 
         let prop_type = read_u32_le(data, abs_offset);
 
-        // VT_LPSTR = 0x1E
-        if prop_type == 0x1E {
+        if prop_type == VT_LPSTR {
             let str_len = read_u32_le(data, abs_offset + 4) as usize;
-            let str_start = abs_offset + 8;
+            let str_start = abs_offset + OLEPS_SECTION_HEADER_SIZE;
             if str_start + str_len <= data.len() {
                 let raw = &data[str_start..str_start + str_len];
                 let s = String::from_utf8_lossy(raw)
@@ -424,23 +438,22 @@ fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) {
                     .to_owned();
                 if !s.is_empty() {
                     match prop_id {
-                        2 => info.title = Some(s),
-                        4 => info.author = Some(s),
-                        5 => info.subject = Some(s),
-                        18 => info.creator_app = Some(s),
+                        PIDSI_TITLE => info.title = Some(s),
+                        PIDSI_AUTHOR => info.author = Some(s),
+                        PIDSI_SUBJECT => info.subject = Some(s),
+                        PIDSI_APPNAME => info.creator_app = Some(s),
                         _ => {}
                     }
                 }
             }
         }
 
-        // VT_I4 = 0x03
-        if prop_type == 0x03 && abs_offset + 8 <= data.len() {
+        if prop_type == VT_I4 && abs_offset + OLEPS_SECTION_HEADER_SIZE <= data.len() {
             let val = read_u32_le(data, abs_offset + 4);
             if val > 0 {
                 match prop_id {
-                    14 => info.page_count = Some(val),
-                    15 => info.word_count = Some(u64::from(val)),
+                    PIDSI_PAGECOUNT => info.page_count = Some(val),
+                    PIDSI_WORDCOUNT => info.word_count = Some(u64::from(val)),
                     _ => {}
                 }
             }

From 35cfb2edd2fbb4b110b0aa6407d966ce654edc42 Mon Sep 17 00:00:00 2001
From: Pushkar Patel <git@thepushkarp.com>
Date: Wed, 4 Mar 2026 14:06:54 +0530
Subject: [PATCH 10/10] refactor(tui): extract toggle_kind_filter method

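Reduces the four kind-filter match arms (keys 2-5) to one-liners by
extracting the shared toggle-apply-check-status pattern into a method
on App.

Also reformats the `take(MAX_XML_BYTES)` read chain in
`read_xml_from_zip` onto separate lines (formatting only, no
behavior change).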
---
 src/document.rs |  5 ++++-
 src/tui/mod.rs  | 41 +++++++++++++----------------------------
 2 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/src/document.rs b/src/document.rs
index 2c05a95..ec68a78 100644
--- a/src/document.rs
+++ b/src/document.rs
@@ -98,7 +98,10 @@ fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option<String> {
     let mut archive = zip::ZipArchive::new(file).ok()?;
     let entry = archive.by_name(inner_path).ok()?;
     let mut contents = String::new();
-    entry.take(MAX_XML_BYTES).read_to_string(&mut contents).ok()?;
+    entry
+        .take(MAX_XML_BYTES)
+        .read_to_string(&mut contents)
+        .ok()?;
     Some(contents)
 }
 
diff --git a/src/tui/mod.rs b/src/tui/mod.rs
index 7984089..7f88a9d 100644
--- a/src/tui/mod.rs
+++ b/src/tui/mod.rs
@@ -441,6 +441,15 @@ impl App {
         self.set_status(format!("Sort: {} {dir_label}", self.sort_key.label()));
     }
 
+    /// Toggle one kind in the filter, re-apply, and warn if nothing selected.
+    fn toggle_kind_filter(&mut self, toggle: fn(&mut KindFilter)) {
+        toggle(&mut self.kind_filter);
+        self.apply_filter();
+        if self.kind_filter.is_empty() {
+            self.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
+        }
+    }
+
     /// Start background thumbnail fetch for the currently selected entry.
     /// Skips if already loading the same path or if the file has no video.
     fn kick_thumbnail_fetch(&mut self) {
@@ -823,34 +832,10 @@ async fn handle_key(app: &mut App, key: KeyEvent) {
             app.kind_filter = KindFilter::ALL;
             app.apply_filter();
         }
-        (KeyCode::Char('2'), _) => {
-            app.kind_filter.video = !app.kind_filter.video;
-            app.apply_filter();
-            if app.kind_filter.is_empty() {
-                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
-            }
-        }
-        (KeyCode::Char('3'), _) => {
-            app.kind_filter.audio = !app.kind_filter.audio;
-            app.apply_filter();
-            if app.kind_filter.is_empty() {
-                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
-            }
-        }
-        (KeyCode::Char('4'), _) => {
-            app.kind_filter.image = !app.kind_filter.image;
-            app.apply_filter();
-            if app.kind_filter.is_empty() {
-                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
-            }
-        }
-        (KeyCode::Char('5'), _) => {
-            app.kind_filter.doc = !app.kind_filter.doc;
-            app.apply_filter();
-            if app.kind_filter.is_empty() {
-                app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string());
-            }
-        }
+        (KeyCode::Char('2'), _) => app.toggle_kind_filter(|kf| kf.video = !kf.video),
+        (KeyCode::Char('3'), _) => app.toggle_kind_filter(|kf| kf.audio = !kf.audio),
+        (KeyCode::Char('4'), _) => app.toggle_kind_filter(|kf| kf.image = !kf.image),
+        (KeyCode::Char('5'), _) => app.toggle_kind_filter(|kf| kf.doc = !kf.doc),
         // Playback
         (KeyCode::Char('p'), _) => handle_playback(app).await,
         (KeyCode::Char('P'), _) => {