From 3858ddfe238a375f980abfa2cf5b6911b658bff6 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Tue, 3 Mar 2026 13:10:40 +0530 Subject: [PATCH 01/10] feat: add document support (PDF, Office, ODF, text) Add MediaKind::Document as a new file category that bypasses ffprobe entirely, using pure Rust crates for metadata extraction. Supported formats: PDF, DOCX/DOC, XLSX/XLS, PPTX/PPT, ODT/ODS/ODP, CSV, TSV, TXT, MD. - New document.rs module with format-specific extractors (lopdf, zip, quick-xml, cfb) - DocumentInfo struct with page_count, word_count, line_count, sheet_count, author, title, and more - Filter support: media.doc.* field paths + pages/author aliases - Sort support: pages/page_count sort key - TUI: KindFilter::Document (key 5), "D" icon, document metadata panel - Bump package version to 0.0.2, schema version to 0.2.0 --- CLAUDE.md | 17 +- Cargo.lock | 279 +++++++++++++++++- Cargo.toml | 6 +- src/document.rs | 729 ++++++++++++++++++++++++++++++++++++++++++++++ src/filter.rs | 70 +++++ src/main.rs | 12 +- src/output.rs | 5 +- src/probe.rs | 61 ++++ src/scan.rs | 19 +- src/sort.rs | 9 + src/tui/layout.rs | 53 +++- src/tui/mod.rs | 10 + src/tui/triage.rs | 1 + src/types.rs | 102 ++++++- tests/cli.rs | 87 +++++- 15 files changed, 1430 insertions(+), 30 deletions(-) create mode 100644 src/document.rs diff --git a/CLAUDE.md b/CLAUDE.md index 10c3c64..ac7a182 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ ## What is this -mls (Media LS) — terminal-native audio/video/image file browser. Dual-mode: TUI for humans, JSON/NDJSON for scripts/agents. Rust + Ratatui + Tokio. macOS-first. +mls (Media LS) — terminal-native audio/video/image/document file browser. Dual-mode: TUI for humans, JSON/NDJSON for scripts/agents. Rust + Ratatui + Tokio. macOS-first. PRD: `docs/plans/resilient-gliding-bear.md` @@ -25,9 +25,10 @@ src/ ├── cli.rs # clap derive CLI definitions ├── types.rs # ALL shared types (MediaEntry, MediaInfo, Fps, etc.) 
├── deps.rs # Startup dependency check (ffprobe, ffmpeg, mpv) -├── probe.rs # ffprobe subprocess + JSON parsing → MediaEntry (image detection + EXIF dispatch) +├── probe.rs # ffprobe subprocess + JSON parsing → MediaEntry (image detection + EXIF dispatch; document dispatch) ├── exif.rs # EXIF metadata extraction (kamadak-exif) -├── scan.rs # Directory walk + concurrent probing (JoinSet) +├── document.rs # Document metadata extraction (PDF, OOXML, ODF, OLE2, text) — pure Rust, no external tools +├── scan.rs # Directory walk + concurrent probing (JoinSet) — routes documents to native probe ├── filter.rs # Hand-rolled expression parser (lexer → parser → AST → eval) ├── sort.rs # Sort key parsing + comparison ├── output.rs # JSON/NDJSON serialization (borrowing, zero-clone) @@ -44,13 +45,13 @@ src/ 1. `cli.rs` parses args → `main.rs` routes to subcommand 2. `deps.rs` checks ffprobe/ffmpeg/mpv availability -3. `scan.rs` walks directories → `probe.rs` runs ffprobe per file → `MediaEntry` (images bypass ffprobe stream classification; EXIF extracted via `exif.rs`) +3. `scan.rs` walks directories → `probe.rs` runs ffprobe per file → `MediaEntry` (images bypass ffprobe stream classification; EXIF extracted via `exif.rs`; documents bypass ffprobe entirely → `document.rs` extracts metadata natively) 4. Filter (`filter.rs`) and sort (`sort.rs`) applied to entries 5. Output: `tui/` renders interactively, or `output.rs` emits JSON/NDJSON ### Key type: `MediaEntry` (in `types.rs`) -The central data type. Every module reads or produces it. It serializes to the JSON schema (version `0.1.0`). If you change `MediaEntry`, you affect JSON output, TUI rendering, filter evaluation, and sort comparison. `MediaKind` includes a `Image` variant. `MediaInfo` has an optional `exif: Option` field populated for image files. +The central data type. Every module reads or produces it. It serializes to the JSON schema (version `0.2.0`). 
If you change `MediaEntry`, you affect JSON output, TUI rendering, filter evaluation, and sort comparison. `MediaKind` variants: `Video`, `Audio`, `Av`, `Image`, `Document`. `MediaInfo` has optional `exif: Option` (images) and `doc: Option` (documents) fields. ## Conventions @@ -84,7 +85,7 @@ Unit tests are co-located `#[cfg(test)]` modules at the bottom of each source fi ### Output format -JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`) to avoid cloning `MediaEntry` vectors. Schema version `"0.1.0"` is embedded in output. +JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`) to avoid cloning `MediaEntry` vectors. Schema version `"0.2.0"` is embedded in output. ### External processes @@ -101,7 +102,9 @@ JSON output uses borrowing structs (`ListEnvelopeRef<'a>`, `NdjsonEntryRef<'a>`) - `triage.rs` Move (`m` key) works via text input; interactive directory picker not yet built. - `scan.rs` uses bounded `JoinSet` spawns (not a semaphore) for concurrency control, with `mpsc` channel for streaming results to the caller. - Images are loaded directly in `thumbnail.rs` (bypass ffmpeg + LRU cache). -- `filter.rs` supports `media.exif.*` field paths and `camera`/`iso` shorthand aliases. +- `filter.rs` supports `media.exif.*` field paths and `camera`/`iso` shorthand aliases, plus `media.doc.*` field paths and `pages`/`author` shorthand aliases. +- **Documents bypass ffprobe entirely** — probed by `document.rs` using pure Rust crates (`lopdf`, `zip`, `quick-xml`, `cfb`). Recognized extensions: pdf, docx, doc, odt, xlsx, xls, ods, pptx, ppt, odp, csv, tsv, txt, md. +- **TUI kind filter key `5`** filters to documents. Document entries show "D" icon in file list. 
## Dependencies (external) diff --git a/Cargo.lock b/Cargo.lock index 262fed0..8523749 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,6 +8,17 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -182,6 +193,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + [[package]] name = "bstr" version = "1.12.1" @@ -205,6 +225,12 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64fa3c856b712db6612c019f14756e64e4bcea13337a6b33b696333a9eaa2d06" +[[package]] +name = "bytecount" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" + [[package]] name = "bytemuck" version = "1.25.0" @@ -225,6 +251,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "byteorder-lite" version = "0.1.0" @@ -246,6 +278,15 @@ dependencies = [ "rustversion", ] +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + [[package]] name = 
"cc" version = "1.2.56" @@ -256,6 +297,17 @@ dependencies = [ "shlex", ] +[[package]] +name = "cfb" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a4f8e55be323b378facfcf1f06aa97f6ec17cf4ac84fb17325093aaf62da41" +dependencies = [ + "byteorder", + "fnv", + "uuid", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -282,6 +334,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + [[package]] name = "clap" version = "4.5.60" @@ -556,12 +618,30 @@ dependencies = [ "litrs", ] +[[package]] +name = "ecb" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7" +dependencies = [ + "cipher", +] + [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -675,6 +755,7 @@ checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", + "zlib-rs", ] [[package]] @@ -911,6 +992,16 @@ dependencies = [ "rustversion", ] +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + [[package]] name = "instability" version = "0.3.11" 
@@ -1047,6 +1138,34 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lopdf" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f560f57dfb9142a02d673e137622fd515d4231e51feb8b4af28d92647d83f35b" +dependencies = [ + "aes", + "bitflags 2.11.0", + "cbc", + "ecb", + "encoding_rs", + "flate2", + "getrandom 0.3.4", + "indexmap", + "itoa", + "log", + "md-5", + "nom 8.0.0", + "nom_locate", + "rand 0.9.2", + "rangemap", + "sha2", + "stringprep", + "thiserror 2.0.18", + "ttf-parser", + "weezl", +] + [[package]] name = "lru" version = "0.16.3" @@ -1075,20 +1194,33 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "media-ls" -version = "0.0.1" +version = "0.0.2" dependencies = [ "anyhow", "assert_cmd", + "cfb", "chrono", "clap", "crossterm", "image", "kamadak-exif", + "lopdf", "lru", "nucleo-matcher", "predicates", + "quick-xml", "ratatui", "ratatui-image", "serde", @@ -1098,6 +1230,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "zip", ] [[package]] @@ -1188,6 +1321,26 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom_locate" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" +dependencies = [ + "bytecount", + "memchr", + "nom 8.0.0", +] + [[package]] name = "normalize-line-endings" 
version = "0.3.0" @@ -1551,6 +1704,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + [[package]] name = "quote" version = "1.0.44" @@ -1579,7 +1741,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", + "rand_chacha 0.3.1", "rand_core 0.6.4", ] @@ -1589,6 +1751,7 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ + "rand_chacha 0.9.0", "rand_core 0.9.5", ] @@ -1602,6 +1765,16 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -1616,6 +1789,9 @@ name = "rand_core" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] [[package]] name = "rand_xoshiro" @@ -1626,6 +1802,12 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + [[package]] name = "ratatui" version = "0.30.0" @@ -2007,6 +2189,17 @@ version = "1.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + [[package]] name = "strsim" version = "0.11.1" @@ -2082,7 +2275,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4ea810f0692f9f51b382fff5893887bb4580f5fa246fde546e0b13e7fcee662" dependencies = [ "fnv", - "nom", + "nom 7.1.3", "phf", "phf_codegen", ] @@ -2228,6 +2421,21 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.49.0" @@ -2317,6 +2525,18 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "ttf-parser" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31" + +[[package]] +name = "typed-path" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e28f89b80c87b8fb0cf04ab448d5dd0dd0ade2f8891bae878de66a75a28600e" + [[package]] name = "typenum" version = "1.19.0" @@ -2329,12 +2549,33 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + [[package]] name = "unicode-ident" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -3027,12 +3268,44 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "zip" +version = "8.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b680f2a0cd479b4cff6e1233c483fdead418106eae419dc60200ae9850f6d004" +dependencies = [ + "crc32fast", + "flate2", + "indexmap", + "memchr", + "typed-path", + "zopfli", +] + +[[package]] +name = "zlib-rs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" + [[package]] name = "zmij" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +[[package]] +name = "zopfli" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05cd8797d63865425ff89b5c4a48804f35ba0ce8d125800027ad6017d2b5249" +dependencies = [ + "bumpalo", + "crc32fast", + "log", + "simd-adler32", 
+] + [[package]] name = "zune-core" version = "0.4.12" diff --git a/Cargo.toml b/Cargo.toml index 76038d1..0fdc284 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "media-ls" -version = "0.0.1" +version = "0.0.2" edition = "2024" description = "Media LS — terminal-native audio/video file browser with metadata columns, TUI preview, and structured JSON output" license = "MIT" @@ -32,6 +32,10 @@ image = { version = "0.25.9", default-features = false, features = [ kamadak-exif = "0.6.1" ratatui-image = { version = "10.0.6", default-features = false, features = ["crossterm"] } tokio = { version = "1.49.0", features = ["full"] } +lopdf = { version = "0.39.0", default-features = false } +zip = { version = "8.2.0", default-features = false, features = ["deflate"] } +quick-xml = "0.37.5" +cfb = "0.10.0" tracing = "0.1.44" tracing-subscriber = { version = "0.3.22", features = ["env-filter"] } diff --git a/src/document.rs b/src/document.rs new file mode 100644 index 0000000..bb6a129 --- /dev/null +++ b/src/document.rs @@ -0,0 +1,729 @@ +/// Document metadata extraction (PDF, Office, `OpenDocument`, plain text). +/// +/// All extractors are best-effort: failures are silently swallowed and +/// logged at debug level. Follows the same pattern as `exif.rs`. +use crate::types::DocumentInfo; +use std::io::{BufRead, BufReader}; +use std::path::Path; + +/// Extract metadata from a document file, dispatching by extension. +/// +/// Returns `None` on any failure (corrupt file, unsupported format, I/O error). 
+pub fn probe_document(path: &Path) -> Option { + let ext = path + .extension() + .and_then(|e| e.to_str()) + .map(str::to_ascii_lowercase)?; + + let result = match ext.as_str() { + "pdf" => probe_pdf(path), + "docx" => probe_ooxml_doc(path), + "xlsx" => probe_ooxml_spreadsheet(path), + "pptx" => probe_ooxml_presentation(path), + "odt" | "ods" | "odp" => probe_odf(path, &ext), + "doc" | "xls" | "ppt" => probe_ole2(path, &ext), + "csv" | "tsv" => probe_text_table(path, &ext), + "txt" | "md" => probe_text(path, &ext), + _ => None, + }; + + if result.is_none() { + tracing::debug!(path = %path.display(), ext = %ext, "document probe returned no metadata"); + } + + result +} + +// ─── PDF ──────────────────────────────────────────────────────────────── + +fn probe_pdf(path: &Path) -> Option { + let doc = lopdf::Document::load(path).ok()?; + + let page_count = doc.get_pages().len(); + + let trailer_info = doc + .trailer + .get(b"Info") + .ok() + .and_then(|obj| obj.as_reference().ok()) + .and_then(|r| doc.get_object(r).ok()); + + let get_info_str = |key: &[u8]| -> Option { + trailer_info? + .as_dict() + .ok()? 
+ .get(key) + .ok() + .and_then(pdf_object_to_string) + .map(|s| s.trim().to_owned()) + .filter(|s| !s.is_empty()) + }; + + #[expect(clippy::cast_possible_truncation)] + Some(DocumentInfo { + format: "pdf".to_string(), + page_count: Some(page_count as u32), + word_count: None, + line_count: None, + sheet_count: None, + author: get_info_str(b"Author"), + title: get_info_str(b"Title"), + subject: get_info_str(b"Subject"), + creator_app: get_info_str(b"Creator"), + creation_date: get_info_str(b"CreationDate"), + modification_date: get_info_str(b"ModDate"), + }) +} + +fn pdf_object_to_string(obj: &lopdf::Object) -> Option { + match obj { + lopdf::Object::String(bytes, _) => String::from_utf8(bytes.clone()).ok(), + lopdf::Object::Name(name) => String::from_utf8(name.clone()).ok(), + _ => None, + } +} + +// ─── OOXML (DOCX/XLSX/PPTX) ──────────────────────────────────────────── + +fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option { + let file = std::fs::File::open(path).ok()?; + let mut archive = zip::ZipArchive::new(file).ok()?; + let mut entry = archive.by_name(inner_path).ok()?; + let mut contents = String::new(); + std::io::Read::read_to_string(&mut entry, &mut contents).ok()?; + Some(contents) +} + +fn parse_ooxml_core(path: &Path) -> OoxmlCoreProps { + let xml = read_xml_from_zip(path, "docProps/core.xml").unwrap_or_default(); + parse_core_xml(&xml) +} + +fn parse_ooxml_app(path: &Path) -> OoxmlAppProps { + let xml = read_xml_from_zip(path, "docProps/app.xml").unwrap_or_default(); + parse_app_xml(&xml) +} + +struct OoxmlCoreProps { + author: Option, + title: Option, + subject: Option, + created: Option, + modified: Option, +} + +struct OoxmlAppProps { + pages: Option, + words: Option, + slides: Option, + app_name: Option, +} + +fn parse_core_xml(xml: &str) -> OoxmlCoreProps { + let mut props = OoxmlCoreProps { + author: None, + title: None, + subject: None, + created: None, + modified: None, + }; + + let mut reader = quick_xml::Reader::from_str(xml); + 
let mut buf = Vec::new(); + let mut current_tag = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { + current_tag = local_name(e.name().as_ref()); + } + Ok(quick_xml::events::Event::Text(ref e)) => { + let text = e.unescape().ok().map(|s| s.trim().to_owned()); + if let Some(val) = text.filter(|s| !s.is_empty()) { + match current_tag.as_str() { + "creator" => props.author = Some(val), + "title" => props.title = Some(val), + "subject" => props.subject = Some(val), + "created" => props.created = Some(val), + "modified" => props.modified = Some(val), + _ => {} + } + } + } + Ok(quick_xml::events::Event::End(_)) => { + current_tag.clear(); + } + Ok(quick_xml::events::Event::Eof) | Err(_) => break, + _ => {} + } + buf.clear(); + } + + props +} + +fn parse_app_xml(xml: &str) -> OoxmlAppProps { + let mut props = OoxmlAppProps { + pages: None, + words: None, + slides: None, + app_name: None, + }; + + let mut reader = quick_xml::Reader::from_str(xml); + let mut buf = Vec::new(); + let mut current_tag = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { + current_tag = local_name(e.name().as_ref()); + } + Ok(quick_xml::events::Event::Text(ref e)) => { + let text = e.unescape().ok().map(|s| s.trim().to_owned()); + if let Some(val) = text.filter(|s| !s.is_empty()) { + match current_tag.as_str() { + "Pages" => props.pages = val.parse().ok(), + "Words" => props.words = val.parse().ok(), + "Slides" => props.slides = val.parse().ok(), + "Application" => props.app_name = Some(val), + _ => {} + } + } + } + Ok(quick_xml::events::Event::End(_)) => { + current_tag.clear(); + } + Ok(quick_xml::events::Event::Eof) | Err(_) => break, + _ => {} + } + buf.clear(); + } + + props +} + +fn probe_ooxml_doc(path: &Path) -> Option { + let file = std::fs::File::open(path).ok()?; + let 
_archive = zip::ZipArchive::new(file).ok()?; + + let core = parse_ooxml_core(path); + let app = parse_ooxml_app(path); + + Some(DocumentInfo { + format: "docx".to_string(), + page_count: app.pages, + word_count: app.words, + line_count: None, + sheet_count: None, + author: core.author, + title: core.title, + subject: core.subject, + creator_app: app.app_name, + creation_date: core.created, + modification_date: core.modified, + }) +} + +fn probe_ooxml_spreadsheet(path: &Path) -> Option { + let file = std::fs::File::open(path).ok()?; + let _archive = zip::ZipArchive::new(file).ok()?; + + let core = parse_ooxml_core(path); + let app = parse_ooxml_app(path); + + let sheet_count = read_xml_from_zip(path, "xl/workbook.xml") + .map(|xml| count_xml_elements(&xml, "sheet")) + .filter(|&c| c > 0); + + Some(DocumentInfo { + format: "xlsx".to_string(), + page_count: None, + word_count: None, + line_count: None, + sheet_count, + author: core.author, + title: core.title, + subject: core.subject, + creator_app: app.app_name, + creation_date: core.created, + modification_date: core.modified, + }) +} + +fn probe_ooxml_presentation(path: &Path) -> Option { + let file = std::fs::File::open(path).ok()?; + let _archive = zip::ZipArchive::new(file).ok()?; + + let core = parse_ooxml_core(path); + let app = parse_ooxml_app(path); + + let slide_count = app.slides.or_else(|| { + read_xml_from_zip(path, "ppt/presentation.xml") + .map(|xml| count_xml_elements(&xml, "sldId")) + .filter(|&c| c > 0) + }); + + Some(DocumentInfo { + format: "pptx".to_string(), + page_count: slide_count, + word_count: None, + line_count: None, + sheet_count: None, + author: core.author, + title: core.title, + subject: core.subject, + creator_app: app.app_name, + creation_date: core.created, + modification_date: core.modified, + }) +} + +// ─── ODF (ODT/ODS/ODP) ───────────────────────────────────────────────── + +fn probe_odf(path: &Path, ext: &str) -> Option { + let file = std::fs::File::open(path).ok()?; + let 
_archive = zip::ZipArchive::new(file).ok()?; + + let meta_xml = read_xml_from_zip(path, "meta.xml").unwrap_or_default(); + let meta = parse_odf_meta(&meta_xml); + + let (page_count, sheet_count) = match ext { + "ods" => { + let content = read_xml_from_zip(path, "content.xml").unwrap_or_default(); + let sheets = count_xml_elements(&content, "table"); + (None, if sheets > 0 { Some(sheets) } else { None }) + } + "odp" => { + let content = read_xml_from_zip(path, "content.xml").unwrap_or_default(); + let slides = count_xml_elements(&content, "page"); + (if slides > 0 { Some(slides) } else { None }, None) + } + _ => (meta.page_count, None), + }; + + Some(DocumentInfo { + format: ext.to_string(), + page_count, + word_count: meta.word_count, + line_count: None, + sheet_count, + author: meta.author, + title: meta.title, + subject: meta.subject, + creator_app: meta.generator, + creation_date: meta.creation_date, + modification_date: meta.modification_date, + }) +} + +struct OdfMeta { + author: Option, + title: Option, + subject: Option, + generator: Option, + creation_date: Option, + modification_date: Option, + page_count: Option, + word_count: Option, +} + +fn parse_odf_meta(xml: &str) -> OdfMeta { + let mut meta = OdfMeta { + author: None, + title: None, + subject: None, + generator: None, + creation_date: None, + modification_date: None, + page_count: None, + word_count: None, + }; + + let mut reader = quick_xml::Reader::from_str(xml); + let mut buf = Vec::new(); + let mut current_tag = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { + let local = local_name(e.name().as_ref()); + + // ODF stores statistics as attributes on `meta:document-statistic` + if local == "document-statistic" { + for attr in e.attributes().flatten() { + let key = local_name(attr.key.as_ref()); + let val = String::from_utf8_lossy(&attr.value).to_string(); + match key.as_str() { + 
"page-count" => meta.page_count = val.parse().ok(), + "word-count" => meta.word_count = val.parse().ok(), + _ => {} + } + } + } + + current_tag = local; + } + Ok(quick_xml::events::Event::Text(ref e)) => { + let text = e.unescape().ok().map(|s| s.trim().to_owned()); + if let Some(val) = text.filter(|s| !s.is_empty()) { + match current_tag.as_str() { + "initial-creator" | "creator" => meta.author = Some(val), + "title" => meta.title = Some(val), + "subject" => meta.subject = Some(val), + "generator" => meta.generator = Some(val), + "creation-date" => meta.creation_date = Some(val), + "date" => meta.modification_date = Some(val), + _ => {} + } + } + } + Ok(quick_xml::events::Event::End(_)) => { + current_tag.clear(); + } + Ok(quick_xml::events::Event::Eof) | Err(_) => break, + _ => {} + } + buf.clear(); + } + + meta +} + +// ─── OLE2 (legacy DOC/XLS/PPT) ───────────────────────────────────────── + +fn probe_ole2(path: &Path, ext: &str) -> Option { + let mut comp = cfb::open(path).ok()?; + + let mut info = DocumentInfo { + format: ext.to_string(), + ..DocumentInfo::default() + }; + + // Try to read the SummaryInformation stream + if let Ok(stream) = comp.open_stream("/\x05SummaryInformation") { + let data: Vec = std::io::Read::bytes(stream) + .take(4096) + .filter_map(Result::ok) + .collect(); + parse_summary_info(&data, &mut info); + } + + Some(info) +} + +/// Best-effort extraction from OLE2 `SummaryInformation` stream. +/// +/// The stream uses MS-OLEPS binary format with property sets. +/// We extract string properties by well-known IDs +/// (2=Title, 4=Author, 5=Subject, 18=`AppName`). 
+fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) { + if data.len() < 48 || read_u16_le(data, 0) != 0xFFFE { + return; + } + + let section_offset = read_u32_le(data, 44) as usize; + if section_offset >= data.len() || section_offset + 8 > data.len() { + return; + } + + let prop_count = read_u32_le(data, section_offset + 4) as usize; + if prop_count > 100 { + return; + } + + for i in 0..prop_count { + let entry_offset = section_offset + 8 + i * 8; + if entry_offset + 8 > data.len() { + break; + } + + let prop_id = read_u32_le(data, entry_offset); + let prop_offset = read_u32_le(data, entry_offset + 4) as usize; + let abs_offset = section_offset + prop_offset; + + if abs_offset + 8 > data.len() { + continue; + } + + let prop_type = read_u32_le(data, abs_offset); + + // VT_LPSTR = 0x1E + if prop_type == 0x1E { + let str_len = read_u32_le(data, abs_offset + 4) as usize; + let str_start = abs_offset + 8; + if str_start + str_len <= data.len() { + let raw = &data[str_start..str_start + str_len]; + let s = String::from_utf8_lossy(raw) + .trim_end_matches('\0') + .trim() + .to_owned(); + if !s.is_empty() { + match prop_id { + 2 => info.title = Some(s), + 4 => info.author = Some(s), + 5 => info.subject = Some(s), + 18 => info.creator_app = Some(s), + _ => {} + } + } + } + } + + // VT_I4 = 0x03 + if prop_type == 0x03 && abs_offset + 8 <= data.len() { + let val = read_u32_le(data, abs_offset + 4); + if val > 0 { + match prop_id { + 14 => info.page_count = Some(val), + 15 => info.word_count = Some(u64::from(val)), + _ => {} + } + } + } + } +} + +fn read_u16_le(data: &[u8], offset: usize) -> u16 { + if offset + 2 > data.len() { + return 0; + } + u16::from_le_bytes([data[offset], data[offset + 1]]) +} + +fn read_u32_le(data: &[u8], offset: usize) -> u32 { + if offset + 4 > data.len() { + return 0; + } + u32::from_le_bytes([ + data[offset], + data[offset + 1], + data[offset + 2], + data[offset + 3], + ]) +} + +// ─── Text-based formats (CSV/TSV/TXT/MD) 
───────────────────────────────
+
+/// Count the lines of a delimited text table (`ext` is `csv` or `tsv`).
+///
+/// Lines are counted at the byte level (`\n`-terminated, plus a final
+/// unterminated line), so files that are not valid UTF-8 are counted in full
+/// instead of stopping at the first undecodable line. An I/O error ends the
+/// scan and the lines seen so far are reported. Returns `None` only when the
+/// file cannot be opened.
+fn probe_text_table(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let file = std::fs::File::open(path).ok()?;
+    let mut reader = BufReader::new(file);
+    // One buffer reused for every line.
+    let mut buf = Vec::new();
+    let mut line_count: u64 = 0;
+
+    // `read_until` returns Ok(0) at end of file.
+    while reader.read_until(b'\n', &mut buf).is_ok_and(|n| n > 0) {
+        line_count += 1;
+        buf.clear();
+    }
+
+    Some(DocumentInfo {
+        format: ext.to_string(),
+        line_count: Some(line_count),
+        ..DocumentInfo::default()
+    })
+}
+
+/// Count lines and whitespace-separated words of a plain-text file
+/// (`ext` is `txt` or `md`).
+///
+/// Lines are split at the byte level and decoded lossily for word counting,
+/// so a stray non-UTF-8 byte no longer truncates both counts. An I/O error
+/// ends the scan and the totals so far are reported. Returns `None` only when
+/// the file cannot be opened.
+fn probe_text(path: &Path, ext: &str) -> Option<DocumentInfo> {
+    let file = std::fs::File::open(path).ok()?;
+    let mut reader = BufReader::new(file);
+    let mut buf = Vec::new();
+    let mut line_count: u64 = 0;
+    let mut word_count: u64 = 0;
+
+    while reader.read_until(b'\n', &mut buf).is_ok_and(|n| n > 0) {
+        line_count += 1;
+        // Invalid sequences decode to U+FFFD, which is not whitespace, so
+        // word boundaries are still determined by real whitespace only.
+        word_count += String::from_utf8_lossy(&buf).split_whitespace().count() as u64;
+        buf.clear();
+    }
+
+    Some(DocumentInfo {
+        format: ext.to_string(),
+        word_count: Some(word_count),
+        line_count: Some(line_count),
+        ..DocumentInfo::default()
+    })
+}
+
+// ─── XML helpers ────────────────────────────────────────────────────────
+
+/// Extract the local part of a possibly namespaced XML name.
+///
+/// Examples: `dc:creator` becomes `creator`, `meta:creation-date` becomes `creation-date`.
+fn local_name(name: &[u8]) -> String { + let full = String::from_utf8_lossy(name); + full.rsplit_once(':') + .map_or(full.to_string(), |(_, local)| local.to_string()) +} + +fn count_xml_elements(xml: &str, element_local_name: &str) -> u32 { + let mut reader = quick_xml::Reader::from_str(xml); + let mut buf = Vec::new(); + let mut count: u32 = 0; + + loop { + match reader.read_event_into(&mut buf) { + Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { + let local = local_name(e.name().as_ref()); + if local == element_local_name { + count = count.saturating_add(1); + } + } + Ok(quick_xml::events::Event::Eof) | Err(_) => break, + _ => {} + } + buf.clear(); + } + + count +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use super::*; + + #[test] + fn local_name_strips_namespace() { + assert_eq!(local_name(b"dc:creator"), "creator"); + assert_eq!(local_name(b"meta:creation-date"), "creation-date"); + assert_eq!(local_name(b"title"), "title"); + } + + #[test] + fn probe_text_counts_lines_and_words() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("test.txt"); + std::fs::write(&path, "hello world\nfoo bar baz\n").unwrap(); + + let info = probe_text(&path, "txt").unwrap(); + assert_eq!(info.format, "txt"); + assert_eq!(info.line_count, Some(2)); + assert_eq!(info.word_count, Some(5)); + } + + #[test] + fn probe_text_empty_file() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("empty.txt"); + std::fs::write(&path, "").unwrap(); + + let info = probe_text(&path, "txt").unwrap(); + assert_eq!(info.line_count, Some(0)); + assert_eq!(info.word_count, Some(0)); + } + + #[test] + fn probe_text_table_counts_lines() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("data.csv"); + std::fs::write(&path, "a,b,c\n1,2,3\n4,5,6\n").unwrap(); + + let info = probe_text_table(&path, "csv").unwrap(); + assert_eq!(info.format, "csv"); + assert_eq!(info.line_count, 
Some(3)); + } + + #[test] + fn probe_nonexistent_file_returns_none() { + let path = Path::new("/nonexistent/file.pdf"); + assert!(probe_document(path).is_none()); + } + + #[test] + fn probe_corrupt_pdf_returns_none() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("bad.pdf"); + std::fs::write(&path, b"this is not a pdf").unwrap(); + + assert!(probe_pdf(&path).is_none()); + } + + #[test] + fn probe_corrupt_zip_returns_none() { + let tmp = tempfile::tempdir().unwrap(); + let path = tmp.path().join("bad.docx"); + std::fs::write(&path, b"not a zip file").unwrap(); + + assert!(probe_ooxml_doc(&path).is_none()); + } + + #[test] + fn parse_core_xml_extracts_fields() { + let xml = r#" + + Jane Doe + My Document + Testing + 2024-01-15T10:30:00Z + 2024-06-20T14:00:00Z + "#; + + let props = parse_core_xml(xml); + assert_eq!(props.author.as_deref(), Some("Jane Doe")); + assert_eq!(props.title.as_deref(), Some("My Document")); + assert_eq!(props.subject.as_deref(), Some("Testing")); + assert_eq!(props.created.as_deref(), Some("2024-01-15T10:30:00Z")); + assert_eq!(props.modified.as_deref(), Some("2024-06-20T14:00:00Z")); + } + + #[test] + fn parse_app_xml_extracts_fields() { + let xml = r#" + + Microsoft Word + 42 + 12500 + "#; + + let props = parse_app_xml(xml); + assert_eq!(props.app_name.as_deref(), Some("Microsoft Word")); + assert_eq!(props.pages, Some(42)); + assert_eq!(props.words, Some(12500)); + } + + #[test] + fn parse_odf_meta_extracts_fields() { + let xml = r#" + + + John Smith + ODF Doc + Testing ODF + LibreOffice/7.5 + 2024-03-01T09:00:00 + 2024-03-15T12:00:00 + + + "#; + + let meta = parse_odf_meta(xml); + assert_eq!(meta.author.as_deref(), Some("John Smith")); + assert_eq!(meta.title.as_deref(), Some("ODF Doc")); + assert_eq!(meta.subject.as_deref(), Some("Testing ODF")); + assert_eq!(meta.generator.as_deref(), Some("LibreOffice/7.5")); + assert_eq!(meta.page_count, Some(10)); + assert_eq!(meta.word_count, Some(2500)); + } + + #[test] + 
fn count_xml_elements_counts_correctly() { + let xml = r#""#; + assert_eq!(count_xml_elements(xml, "sheet"), 3); + assert_eq!(count_xml_elements(xml, "other"), 1); + assert_eq!(count_xml_elements(xml, "missing"), 0); + } + + #[test] + fn summary_info_empty_data() { + let mut info = DocumentInfo::default(); + parse_summary_info(&[], &mut info); + assert!(info.title.is_none()); + assert!(info.author.is_none()); + } +} diff --git a/src/filter.rs b/src/filter.rs index 81329f1..0c9f5ae 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -641,6 +641,72 @@ fn resolve_field_typed<'a>(entry: &'a MediaEntry, path: &str) -> FieldValue<'a> .and_then(|e| e.orientation) .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))), + // media.doc.* — string fields + "media.doc.format" => entry.media.doc.as_ref().map_or(FieldValue::Null, |d| { + FieldValue::Str(Cow::Borrowed(&d.format)) + }), + "media.doc.author" => entry + .media + .doc + .as_ref() + .and_then(|d| d.author.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + "media.doc.title" => entry + .media + .doc + .as_ref() + .and_then(|d| d.title.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + "media.doc.subject" => entry + .media + .doc + .as_ref() + .and_then(|d| d.subject.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + "media.doc.creator_app" => entry + .media + .doc + .as_ref() + .and_then(|d| d.creator_app.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + "media.doc.creation_date" => entry + .media + .doc + .as_ref() + .and_then(|d| d.creation_date.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + "media.doc.modification_date" => entry + .media + .doc + .as_ref() + .and_then(|d| d.modification_date.as_ref()) + .map_or(FieldValue::Null, |v| FieldValue::Str(Cow::Borrowed(v))), + // media.doc.* — numeric fields + "media.doc.page_count" => entry + .media + .doc + .as_ref() + 
.and_then(|d| d.page_count) + .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))), + "media.doc.word_count" => entry + .media + .doc + .as_ref() + .and_then(|d| d.word_count) + .map_or(FieldValue::Null, |v| FieldValue::Num(v as f64)), + "media.doc.line_count" => entry + .media + .doc + .as_ref() + .and_then(|d| d.line_count) + .map_or(FieldValue::Null, |v| FieldValue::Num(v as f64)), + "media.doc.sheet_count" => entry + .media + .doc + .as_ref() + .and_then(|d| d.sheet_count) + .map_or(FieldValue::Null, |v| FieldValue::Num(f64::from(v))), + // Convenience aliases (top-level shortcuts for common fields) "duration_ms" => resolve_field_typed(entry, "media.duration_ms"), "size_bytes" => resolve_field_typed(entry, "fs.size_bytes"), @@ -650,6 +716,8 @@ fn resolve_field_typed<'a>(entry: &'a MediaEntry, path: &str) -> FieldValue<'a> "bitrate_bps" | "bitrate" => resolve_field_typed(entry, "media.overall_bitrate_bps"), "camera" => resolve_field_typed(entry, "media.exif.camera_model"), "iso" => resolve_field_typed(entry, "media.exif.iso"), + "pages" => resolve_field_typed(entry, "media.doc.page_count"), + "author" => resolve_field_typed(entry, "media.doc.author"), // Unknown field _ => { @@ -783,6 +851,7 @@ mod tests { streams: vec![], tags: MediaTags::default(), exif: None, + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), @@ -1292,6 +1361,7 @@ mod tests { gps_longitude: Some(139.767_125), orientation: Some(1), }), + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), diff --git a/src/main.rs b/src/main.rs index 0a1c1da..2f59808 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,7 @@ /// output for scripts and AI agents. Think `fd` meets `ffprobe` meets `lazygit`. 
mod cli; mod deps; +mod document; mod exif; mod filter; mod output; @@ -218,7 +219,16 @@ async fn run_ndjson(cli: &Cli, paths: &[std::path::PathBuf]) -> Result<()> { async fn run_info(cli: &Cli, files: &[std::path::PathBuf]) -> Result<()> { let mut entries = Vec::new(); for file in files { - match probe::probe_file(file, cli.timeout_ms).await { + let is_doc = file + .extension() + .and_then(|e| e.to_str()) + .is_some_and(types::is_document_extension); + let result = if is_doc { + probe::probe_document_file(file).await + } else { + probe::probe_file(file, cli.timeout_ms).await + }; + match result { Ok(entry) => entries.push(entry), Err(e) => { tracing::error!(path = %file.display(), "error probing file: {e}"); diff --git a/src/output.rs b/src/output.rs index 92d34b8..f44187f 100644 --- a/src/output.rs +++ b/src/output.rs @@ -7,7 +7,7 @@ use chrono::{DateTime, Utc}; use serde::Serialize; use std::io::Write; -const SCHEMA_VERSION: &str = "0.1.0"; +const SCHEMA_VERSION: &str = "0.2.0"; const MLS_VERSION: &str = env!("CARGO_PKG_VERSION"); /// Borrowing envelope for JSON serialization (avoids cloning entries). @@ -167,6 +167,7 @@ mod tests { streams: vec![], tags: MediaTags::default(), exif: None, + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), @@ -199,7 +200,7 @@ mod tests { write_json(&mut buf, &[], &[]).unwrap(); let val: serde_json::Value = serde_json::from_slice(&buf).unwrap(); assert_eq!(val["type"], "mls.list"); - assert_eq!(val["schema_version"], "0.1.0"); + assert_eq!(val["schema_version"], "0.2.0"); assert_eq!(val["summary"]["entries_total"], 0); } diff --git a/src/probe.rs b/src/probe.rs index e4153d8..f512abc 100644 --- a/src/probe.rs +++ b/src/probe.rs @@ -126,6 +126,66 @@ pub async fn probe_file(path: &Path, timeout_ms: u64) -> Result { }) } +/// Probe a document file using native Rust extractors (no ffprobe). +/// +/// # Errors +/// Returns an error if file metadata cannot be read. 
+pub async fn probe_document_file(path: &Path) -> Result { + let start = Instant::now(); + + let fs_meta = tokio::fs::metadata(path) + .await + .context("failed to read file metadata")?; + let fs = build_fs_info(&fs_meta); + + let extension = path + .extension() + .map_or_else(String::new, |e| e.to_string_lossy().into_owned()); + + let doc_path = path.to_path_buf(); + let doc_info = tokio::task::spawn_blocking(move || crate::document::probe_document(&doc_path)) + .await + .unwrap_or(None); + + let format_name = doc_info + .as_ref() + .map_or_else(|| extension.clone(), |d| d.format.clone()); + + let file_name = path + .file_name() + .map_or_else(String::new, |n| n.to_string_lossy().into_owned()); + + #[expect(clippy::cast_possible_truncation)] + let took_ms = start.elapsed().as_millis() as u64; + + Ok(MediaEntry { + path: path.to_path_buf(), + file_name, + extension, + fs, + media: MediaInfo { + kind: MediaKind::Document, + container: ContainerInfo { + format_name: format_name.clone(), + format_primary: format_name, + }, + duration_ms: None, + overall_bitrate_bps: None, + video: None, + audio: None, + streams: vec![], + tags: MediaTags::default(), + exif: None, + doc: doc_info, + }, + probe: ProbeInfo { + backend: Cow::Borrowed("native"), + took_ms, + error: None, + }, + }) +} + #[expect( clippy::cast_possible_wrap, reason = "Unix timestamp seconds fit i64 until year 2262" @@ -250,6 +310,7 @@ fn build_media_info(raw: &FfprobeOutput, ext: &str) -> MediaInfo { streams, tags, exif: None, + doc: None, } } diff --git a/src/scan.rs b/src/scan.rs index ac6a3ef..a333d27 100644 --- a/src/scan.rs +++ b/src/scan.rs @@ -3,7 +3,7 @@ /// Filters by recognized media file extensions. Uses tokio for concurrent /// metadata probing with configurable concurrency. 
use crate::probe; -use crate::types::{MediaEntry, ProbeError, is_media_extension}; +use crate::types::{MediaEntry, ProbeError, is_document_extension, is_media_extension}; use anyhow::{Context, Result}; use std::collections::HashSet; use std::os::unix::fs::MetadataExt; @@ -119,8 +119,17 @@ pub async fn probe_files( } let tx = tx.clone(); + let is_doc = file + .extension() + .and_then(|e| e.to_str()) + .is_some_and(is_document_extension); tasks.spawn(async move { - match probe::probe_file(&file, timeout_ms).await { + let result = if is_doc { + probe::probe_document_file(&file).await + } else { + probe::probe_file(&file, timeout_ms).await + }; + match result { Ok(entry) => { let _ = tx.send(ScanResult::Entry(Box::new(entry))).await; } @@ -183,10 +192,12 @@ mod tests { fs::write(root.join("a.mp4"), b"fake").unwrap(); fs::write(root.join("b.mp3"), b"fake").unwrap(); - fs::write(root.join("c.txt"), b"not media").unwrap(); + fs::write(root.join("c.txt"), b"text file").unwrap(); + fs::write(root.join("d.xyz"), b"not media").unwrap(); let files = discover_media_files(&[root.to_path_buf()], None); - assert_eq!(files.len(), 2); + // mp4, mp3, txt are recognized (3 files); xyz is not + assert_eq!(files.len(), 3); } #[test] diff --git a/src/sort.rs b/src/sort.rs index e5cc84c..4cc1595 100644 --- a/src/sort.rs +++ b/src/sort.rs @@ -21,6 +21,7 @@ pub fn parse_sort_spec(spec: &str) -> Option<(SortKey, SortDir)> { "resolution" => SortKey::Resolution, "codec" => SortKey::Codec, "bitrate" => SortKey::Bitrate, + "pages" | "page_count" => SortKey::Pages, _ => return None, }; let dir = match dir_str { @@ -100,6 +101,11 @@ fn compare_by_key(a: &MediaEntry, b: &MediaEntry, key: SortKey) -> std::cmp::Ord .media .overall_bitrate_bps .cmp(&b.media.overall_bitrate_bps), + SortKey::Pages => { + let pages_a = a.media.doc.as_ref().and_then(|d| d.page_count); + let pages_b = b.media.doc.as_ref().and_then(|d| d.page_count); + pages_a.cmp(&pages_b) + } } } @@ -137,6 +143,7 @@ mod tests { streams: 
vec![], tags: MediaTags::default(), exif: None, + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), @@ -231,6 +238,8 @@ mod tests { "resolution", "codec", "bitrate", + "pages", + "page_count", ]; for key in keys { assert!( diff --git a/src/tui/layout.rs b/src/tui/layout.rs index 8fbcffa..7542858 100644 --- a/src/tui/layout.rs +++ b/src/tui/layout.rs @@ -186,6 +186,7 @@ fn render_file_list(frame: &mut Frame, app: &App, area: Rect) { MediaKind::Video | MediaKind::Av => "V", MediaKind::Audio => "A", MediaKind::Image => "I", + MediaKind::Document => "D", }; let resolution = entry.media.video.as_ref().map_or_else( @@ -513,6 +514,54 @@ fn render_metadata_text(frame: &mut Frame, entry: &crate::types::MediaEntry, are } } + if let Some(ref doc) = entry.media.doc { + lines.push(Line::from("")); + lines.push(Line::styled( + "── Document ──", + Style::default().fg(Color::Blue), + )); + lines.push(Line::from(vec![ + Span::styled("Format: ", Style::default().fg(Color::DarkGray)), + Span::raw(&doc.format), + ])); + if let Some(pages) = doc.page_count { + lines.push(Line::from(vec![ + Span::styled("Pages: ", Style::default().fg(Color::DarkGray)), + Span::raw(format!("{pages}")), + ])); + } + if let Some(words) = doc.word_count { + lines.push(Line::from(vec![ + Span::styled("Words: ", Style::default().fg(Color::DarkGray)), + Span::raw(format!("{words}")), + ])); + } + if let Some(line_count) = doc.line_count { + lines.push(Line::from(vec![ + Span::styled("Lines: ", Style::default().fg(Color::DarkGray)), + Span::raw(format!("{line_count}")), + ])); + } + if let Some(sheets) = doc.sheet_count { + lines.push(Line::from(vec![ + Span::styled("Sheets: ", Style::default().fg(Color::DarkGray)), + Span::raw(format!("{sheets}")), + ])); + } + if let Some(ref author) = doc.author { + lines.push(Line::from(vec![ + Span::styled("Author: ", Style::default().fg(Color::DarkGray)), + Span::raw(author), + ])); + } + if let Some(ref title) = doc.title { + lines.push(Line::from(vec![ 
+ Span::styled("Title: ", Style::default().fg(Color::DarkGray)), + Span::raw(title), + ])); + } + } + let preview = Paragraph::new(lines).wrap(Wrap { trim: true }); frame.render_widget(preview, area); } @@ -705,7 +754,7 @@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) { let keys = if app.triage.is_some() { "[y] keep [n] delete [m] move [u] undo [q] quit triage" } else { - "[j/k] nav [Enter] open [p] play [/] filter [1/2/3/4] kind [s] sort [t] triage [?] help" + "[j/k] nav [Enter] open [p] play [/] filter [1-5] kind [s] sort [t] triage [?] help" }; let keybindings = Paragraph::new(Line::styled(keys, Style::default().fg(Color::DarkGray))); frame.render_widget(keybindings, footer_layout[1]); @@ -732,7 +781,7 @@ fn render_help_overlay(frame: &mut Frame, area: Rect) { Line::from(""), Line::styled("Actions", Style::default().add_modifier(Modifier::BOLD)), Line::from(" / Fuzzy filter (prefix = for structured)"), - Line::from(" 1/2/3/4 Filter: All/Video/Audio/Image"), + Line::from(" 1/2/3/4/5 Filter: All/Video/Audio/Image/Doc"), Line::from(" s/S Cycle sort / reverse"), Line::from(" i Toggle metadata panel"), Line::from(" Space Mark/unmark file"), diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 4cabf20..9546e64 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -48,6 +48,8 @@ pub enum KindFilter { Audio, /// Show only image files. Image, + /// Show only document files. 
+ Document, } impl KindFilter { @@ -59,6 +61,7 @@ impl KindFilter { Self::Video => "Video", Self::Audio => "Audio", Self::Image => "Image", + Self::Document => "Document", } } } @@ -273,6 +276,7 @@ impl App { KindFilter::Video => matches!(entry.media.kind, MediaKind::Video | MediaKind::Av), KindFilter::Audio => matches!(entry.media.kind, MediaKind::Audio), KindFilter::Image => matches!(entry.media.kind, MediaKind::Image), + KindFilter::Document => matches!(entry.media.kind, MediaKind::Document), } } @@ -811,6 +815,11 @@ async fn handle_key(app: &mut App, key: KeyEvent) { app.apply_filter(); app.set_status("Filter: Image".to_string()); } + (KeyCode::Char('5'), _) => { + app.kind_filter = KindFilter::Document; + app.apply_filter(); + app.set_status("Filter: Document".to_string()); + } // Playback (KeyCode::Char('p'), _) => handle_playback(app).await, (KeyCode::Char('P'), _) => { @@ -994,6 +1003,7 @@ mod tests { streams: vec![], tags: MediaTags::default(), exif: None, + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), diff --git a/src/tui/triage.rs b/src/tui/triage.rs index 6ebd8d7..ef44f24 100644 --- a/src/tui/triage.rs +++ b/src/tui/triage.rs @@ -367,6 +367,7 @@ mod tests { streams: vec![], tags: MediaTags::default(), exif: None, + doc: None, }, probe: ProbeInfo { backend: Cow::Borrowed("ffprobe"), diff --git a/src/types.rs b/src/types.rs index e32d2d4..7eee718 100644 --- a/src/types.rs +++ b/src/types.rs @@ -17,6 +17,8 @@ pub enum MediaKind { Av, /// Still image (JPEG, PNG, etc.). Image, + /// Document file (PDF, DOCX, TXT, etc.). + Document, } impl std::fmt::Display for MediaKind { @@ -26,6 +28,7 @@ impl std::fmt::Display for MediaKind { Self::Audio => write!(f, "audio"), Self::Av => write!(f, "av"), Self::Image => write!(f, "image"), + Self::Document => write!(f, "document"), } } } @@ -172,6 +175,32 @@ pub struct ExifInfo { pub orientation: Option, } +/// Document metadata (PDF, DOCX, XLSX, TXT, etc.). 
+#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct DocumentInfo { + pub format: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub page_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub word_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub line_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub sheet_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub author: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub subject: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub creator_app: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub creation_date: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub modification_date: Option, +} + /// Aggregated media metadata. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MediaInfo { @@ -190,6 +219,8 @@ pub struct MediaInfo { pub tags: MediaTags, #[serde(skip_serializing_if = "Option::is_none")] pub exif: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub doc: Option, } /// File-system metadata. 
@@ -269,6 +300,7 @@ pub enum SortKey { Resolution, Codec, Bitrate, + Pages, } impl SortKey { @@ -283,6 +315,7 @@ impl SortKey { Self::Resolution => "resolution", Self::Codec => "codec", Self::Bitrate => "bitrate", + Self::Pages => "pages", } } @@ -297,7 +330,8 @@ impl SortKey { Self::Duration => Self::Resolution, Self::Resolution => Self::Codec, Self::Codec => Self::Bitrate, - Self::Bitrate => Self::Path, + Self::Bitrate => Self::Pages, + Self::Pages => Self::Path, } } } @@ -381,6 +415,11 @@ pub const AUDIO_EXTENSIONS: &[&str] = &[ pub const IMAGE_EXTENSIONS: &[&str] = &["jpg", "jpeg", "png", "webp", "gif", "bmp", "tiff", "tif"]; +pub const DOCUMENT_EXTENSIONS: &[&str] = &[ + "pdf", "docx", "doc", "odt", "xlsx", "xls", "ods", "pptx", "ppt", "odp", "csv", "tsv", "txt", + "md", +]; + /// Check if a file extension is a recognized image type. #[must_use] pub fn is_image_extension(ext: &str) -> bool { @@ -389,7 +428,15 @@ pub fn is_image_extension(ext: &str) -> bool { .any(|known| ext.eq_ignore_ascii_case(known)) } -/// Check if a file extension is a recognized media type (video, audio, or image). +/// Check if a file extension is a recognized document type. +#[must_use] +pub fn is_document_extension(ext: &str) -> bool { + DOCUMENT_EXTENSIONS + .iter() + .any(|known| ext.eq_ignore_ascii_case(known)) +} + +/// Check if a file extension is a recognized media type (video, audio, image, or document). #[must_use] pub fn is_media_extension(ext: &str) -> bool { VIDEO_EXTENSIONS @@ -397,6 +444,7 @@ pub fn is_media_extension(ext: &str) -> bool { .chain(AUDIO_EXTENSIONS.iter()) .any(|known| ext.eq_ignore_ascii_case(known)) || is_image_extension(ext) + || is_document_extension(ext) } /// Check if a file extension is a recognized video type. 
@@ -627,13 +675,14 @@ mod tests { assert_eq!(SortKey::Resolution.label(), "resolution"); assert_eq!(SortKey::Codec.label(), "codec"); assert_eq!(SortKey::Bitrate.label(), "bitrate"); + assert_eq!(SortKey::Pages.label(), "pages"); } #[test] fn sort_key_cycle_returns_to_start() { let start = SortKey::Path; let mut current = start; - for _ in 0..8 { + for _ in 0..9 { current = current.next(); } assert_eq!(current, start); @@ -642,7 +691,8 @@ mod tests { #[test] fn sort_key_next_sequence() { assert_eq!(SortKey::Path.next(), SortKey::Name); - assert_eq!(SortKey::Bitrate.next(), SortKey::Path); + assert_eq!(SortKey::Bitrate.next(), SortKey::Pages); + assert_eq!(SortKey::Pages.next(), SortKey::Path); } // --- SortDir --- @@ -661,6 +711,7 @@ mod tests { assert_eq!(MediaKind::Audio.to_string(), "audio"); assert_eq!(MediaKind::Av.to_string(), "av"); assert_eq!(MediaKind::Image.to_string(), "image"); + assert_eq!(MediaKind::Document.to_string(), "document"); } // --- Extension checks --- @@ -695,11 +746,50 @@ mod tests { #[test] fn is_media_extension_rejects_unknown() { - assert!(!is_media_extension("txt")); - assert!(!is_media_extension("pdf")); + assert!(!is_media_extension("xyz")); + assert!(!is_media_extension("exe")); assert!(!is_media_extension("")); } + #[test] + fn is_media_extension_document() { + assert!(is_media_extension("pdf")); + assert!(is_media_extension("docx")); + assert!(is_media_extension("txt")); + assert!(is_media_extension("csv")); + } + + #[test] + fn is_document_extension_accepts_documents() { + assert!(is_document_extension("pdf")); + assert!(is_document_extension("docx")); + assert!(is_document_extension("doc")); + assert!(is_document_extension("odt")); + assert!(is_document_extension("xlsx")); + assert!(is_document_extension("xls")); + assert!(is_document_extension("ods")); + assert!(is_document_extension("pptx")); + assert!(is_document_extension("ppt")); + assert!(is_document_extension("odp")); + assert!(is_document_extension("csv")); + 
assert!(is_document_extension("tsv")); + assert!(is_document_extension("txt")); + assert!(is_document_extension("md")); + } + + #[test] + fn is_document_extension_case_insensitive() { + assert!(is_document_extension("PDF")); + assert!(is_document_extension("Docx")); + } + + #[test] + fn is_document_extension_rejects_non_documents() { + assert!(!is_document_extension("mp4")); + assert!(!is_document_extension("jpg")); + assert!(!is_document_extension("")); + } + #[test] fn is_video_extension_accepts_video() { assert!(is_video_extension("mkv")); diff --git a/tests/cli.rs b/tests/cli.rs index 37a9ba8..d8ace95 100644 --- a/tests/cli.rs +++ b/tests/cli.rs @@ -121,14 +121,14 @@ fn json_output_valid_schema() { let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap(); assert_eq!(json["type"], "mls.list"); - assert_eq!(json["schema_version"], "0.1.0"); + assert_eq!(json["schema_version"], "0.2.0"); assert!(json["entries"].is_array()); assert!(json["summary"].is_object()); assert!(json["summary"]["entries_total"].is_number()); let entries = json["entries"].as_array().unwrap(); - // Should find 5 media files (mp4, mkv, mp3, jpg, png) — not the .txt - assert_eq!(entries.len(), 5); + // 5 AV/image files (mp4, mkv, mp3, jpg, png) + 1 document (txt) + assert_eq!(entries.len(), 6); } #[test] @@ -164,7 +164,7 @@ fn ndjson_has_header_and_footer() { let header: serde_json::Value = serde_json::from_str(lines[0]).unwrap(); assert_eq!(header["type"], "mls.header"); - assert_eq!(header["schema_version"], "0.1.0"); + assert_eq!(header["schema_version"], "0.2.0"); let footer: serde_json::Value = serde_json::from_str(lines.last().unwrap()).unwrap(); assert_eq!(footer["type"], "mls.footer"); @@ -319,3 +319,82 @@ fn json_filter_kind_excludes_other_kinds() { assert_eq!(entries[0]["media"]["kind"], "audio"); assert_eq!(entries[0]["extension"], "mp3"); } + +// --- Document support --- + +#[test] +fn json_documents_have_kind_document() { + let tmp = setup_media_dir(); + let 
output = mls_cmd().arg("--json").arg(tmp.path()).output().unwrap(); + + assert!(output.status.success()); + let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap(); + let entries = json["entries"].as_array().unwrap(); + + let docs: Vec<&serde_json::Value> = entries + .iter() + .filter(|e| e["media"]["kind"] == "document") + .collect(); + + assert_eq!(docs.len(), 1, "expected 1 document entry (txt)"); + assert_eq!(docs[0]["extension"], "txt"); + assert_eq!(docs[0]["probe"]["backend"], "native"); +} + +#[test] +fn json_filter_kind_document_returns_only_documents() { + let tmp = setup_media_dir(); + let output = mls_cmd() + .arg("--json") + .arg("--filter") + .arg("kind == document") + .arg(tmp.path()) + .output() + .unwrap(); + + assert!(output.status.success()); + let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap(); + let entries = json["entries"].as_array().unwrap(); + + assert_eq!(entries.len(), 1, "expected only 1 document entry"); + assert_eq!(entries[0]["media"]["kind"], "document"); +} + +#[test] +fn json_document_has_line_count() { + let tmp = tempfile::tempdir().unwrap(); + fs::write( + tmp.path().join("notes.txt"), + b"line one\nline two\nline three\n", + ) + .unwrap(); + + let output = mls_cmd().arg("--json").arg(tmp.path()).output().unwrap(); + + assert!(output.status.success()); + let json: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap(); + let entries = json["entries"].as_array().unwrap(); + + assert_eq!(entries.len(), 1); + let doc = &entries[0]; + assert_eq!(doc["media"]["kind"], "document"); + assert_eq!(doc["media"]["doc"]["format"], "txt"); + assert_eq!(doc["media"]["doc"]["line_count"], 3); +} + +#[test] +fn json_sort_by_pages() { + let tmp = setup_media_dir(); + let output = mls_cmd() + .arg("--json") + .arg("--sort") + .arg("pages") + .arg(tmp.path()) + .output() + .unwrap(); + + assert!(output.status.success()); + let json: serde_json::Value = 
serde_json::from_slice(&output.stdout).unwrap(); + let entries = json["entries"].as_array().unwrap(); + assert!(!entries.is_empty()); +} From 63b90f57a1ceac0e34cfadb401d6cdcb6562a551 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Tue, 3 Mar 2026 15:12:36 +0530 Subject: [PATCH 02/10] feat(tui): make kind filter multi-select with toggle keys Replace exclusive KindFilter enum with a struct of per-kind booleans. Keys 2-5 now toggle individual kinds on/off; key 1 resets to show all. Footer shows checkbox indicators for each kind's active state. --- src/tui/layout.rs | 46 ++++++++----- src/tui/mod.rs | 163 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 150 insertions(+), 59 deletions(-) diff --git a/src/tui/layout.rs b/src/tui/layout.rs index 7542858..ed0c929 100644 --- a/src/tui/layout.rs +++ b/src/tui/layout.rs @@ -732,21 +732,32 @@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) { Style::default().fg(Color::Red), ) } else { - let kind_label = app.kind_filter.label(); - Line::styled( - format!( - "{}/{} files │ Sort: {} │ [{}]", - if app.visible_count() == 0 { - 0 - } else { - app.selected + 1 - }, - app.visible_count(), - app.sort_key.label(), - kind_label, - ), - Style::default().fg(Color::DarkGray), - ) + let prefix = format!( + "{}/{} files \u{2502} Sort: {} \u{2502} ", + if app.visible_count() == 0 { + 0 + } else { + app.selected + 1 + }, + app.visible_count(), + app.sort_key.label(), + ); + let kf = &app.kind_filter; + let check = |on: bool| if on { "\u{2713}" } else { " " }; + let dim = Style::default().fg(Color::DarkGray); + let spans = vec![ + Span::styled(prefix, dim), + Span::styled("V[", dim), + Span::raw(check(kf.video)), + Span::styled("] A[", dim), + Span::raw(check(kf.audio)), + Span::styled("] I[", dim), + Span::raw(check(kf.image)), + Span::styled("] D[", dim), + Span::raw(check(kf.doc)), + Span::styled("]", dim), + ]; + Line::from(spans) }; frame.render_widget(Paragraph::new(status), footer_layout[0]); @@ -754,7 +765,7 
@@ fn render_footer(frame: &mut Frame, app: &App, area: Rect) { let keys = if app.triage.is_some() { "[y] keep [n] delete [m] move [u] undo [q] quit triage" } else { - "[j/k] nav [Enter] open [p] play [/] filter [1-5] kind [s] sort [t] triage [?] help" + "[j/k] nav [Enter] open [p] play [/] filter [1] all [2-5] kind [s] sort [t] triage [?] help" }; let keybindings = Paragraph::new(Line::styled(keys, Style::default().fg(Color::DarkGray))); frame.render_widget(keybindings, footer_layout[1]); @@ -781,7 +792,8 @@ fn render_help_overlay(frame: &mut Frame, area: Rect) { Line::from(""), Line::styled("Actions", Style::default().add_modifier(Modifier::BOLD)), Line::from(" / Fuzzy filter (prefix = for structured)"), - Line::from(" 1/2/3/4/5 Filter: All/Video/Audio/Image/Doc"), + Line::from(" 1 Show all kinds"), + Line::from(" 2/3/4/5 Toggle Video/Audio/Image/Doc"), Line::from(" s/S Cycle sort / reverse"), Line::from(" i Toggle metadata panel"), Line::from(" Space Mark/unmark file"), diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 9546e64..36b2fdd 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -37,32 +37,27 @@ pub enum FilterMode { Structured, } -/// Media kind pre-filter (1/2/3/4 keys). +/// Media kind multi-select filter (1=all, 2-5 toggle individual kinds). +#[expect(clippy::struct_excessive_bools, reason = "one bool per UI checkbox")] #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum KindFilter { - /// Show all media types. - All, - /// Show only video/av files. - Video, - /// Show only audio-only files (no video stream). - Audio, - /// Show only image files. - Image, - /// Show only document files. - Document, +pub struct KindFilter { + pub video: bool, + pub audio: bool, + pub image: bool, + pub doc: bool, } impl KindFilter { - /// Label for display in the footer. 
+ pub const ALL: Self = Self { + video: true, + audio: true, + image: true, + doc: true, + }; + #[must_use] - pub fn label(self) -> &'static str { - match self { - Self::All => "All", - Self::Video => "Video", - Self::Audio => "Audio", - Self::Image => "Image", - Self::Document => "Document", - } + pub fn is_empty(self) -> bool { + !self.video && !self.audio && !self.image && !self.doc } } @@ -206,7 +201,7 @@ impl App { dir_scanning: false, scan_concurrency, scan_timeout_ms, - kind_filter: KindFilter::All, + kind_filter: KindFilter::ALL, filter_mode: FilterMode::Fuzzy, filter_expr: None, playback_position: None, @@ -271,12 +266,11 @@ impl App { /// Check if an entry matches the current kind filter. fn matches_kind(&self, entry: &MediaEntry) -> bool { - match self.kind_filter { - KindFilter::All => true, - KindFilter::Video => matches!(entry.media.kind, MediaKind::Video | MediaKind::Av), - KindFilter::Audio => matches!(entry.media.kind, MediaKind::Audio), - KindFilter::Image => matches!(entry.media.kind, MediaKind::Image), - KindFilter::Document => matches!(entry.media.kind, MediaKind::Document), + match entry.media.kind { + MediaKind::Video | MediaKind::Av => self.kind_filter.video, + MediaKind::Audio => self.kind_filter.audio, + MediaKind::Image => self.kind_filter.image, + MediaKind::Document => self.kind_filter.doc, } } @@ -796,29 +790,36 @@ async fn handle_key(app: &mut App, key: KeyEvent) { } // Kind filter (KeyCode::Char('1'), _) => { - app.kind_filter = KindFilter::All; + app.kind_filter = KindFilter::ALL; app.apply_filter(); - app.set_status("Filter: All".to_string()); } (KeyCode::Char('2'), _) => { - app.kind_filter = KindFilter::Video; + app.kind_filter.video = !app.kind_filter.video; app.apply_filter(); - app.set_status("Filter: Video".to_string()); + if app.kind_filter.is_empty() { + app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); + } } (KeyCode::Char('3'), _) => { - app.kind_filter = KindFilter::Audio; + 
app.kind_filter.audio = !app.kind_filter.audio; app.apply_filter(); - app.set_status("Filter: Audio".to_string()); + if app.kind_filter.is_empty() { + app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); + } } (KeyCode::Char('4'), _) => { - app.kind_filter = KindFilter::Image; + app.kind_filter.image = !app.kind_filter.image; app.apply_filter(); - app.set_status("Filter: Image".to_string()); + if app.kind_filter.is_empty() { + app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); + } } (KeyCode::Char('5'), _) => { - app.kind_filter = KindFilter::Document; + app.kind_filter.doc = !app.kind_filter.doc; app.apply_filter(); - app.set_status("Filter: Document".to_string()); + if app.kind_filter.is_empty() { + app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); + } } // Playback (KeyCode::Char('p'), _) => handle_playback(app).await, @@ -1182,7 +1183,12 @@ mod tests { 5000, ); - app.kind_filter = KindFilter::Video; + app.kind_filter = KindFilter { + video: true, + audio: false, + image: false, + doc: false, + }; app.apply_filter(); // Video filter matches MediaKind::Video and MediaKind::Av (2 entries). @@ -1209,7 +1215,12 @@ mod tests { ); // Audio filter matches only MediaKind::Audio (1 entry). - app.kind_filter = KindFilter::Audio; + app.kind_filter = KindFilter { + video: false, + audio: true, + image: false, + doc: false, + }; app.apply_filter(); assert_eq!(app.filtered_indices.len(), 1); } @@ -1217,7 +1228,7 @@ mod tests { #[test] fn kind_filter_all_shows_everything() { let mut app = make_test_app(&["a.mp4", "b.mkv", "c.mp3"]); - app.kind_filter = KindFilter::All; + app.kind_filter = KindFilter::ALL; app.apply_filter(); assert_eq!(app.filtered_indices.len(), 3); } @@ -1243,13 +1254,81 @@ mod tests { ); // Video kind filter + fuzzy "alpha" → only alpha.mp4 passes both predicates. 
- app.kind_filter = KindFilter::Video; + app.kind_filter = KindFilter { + video: true, + audio: false, + image: false, + doc: false, + }; app.filter_text = "alpha".to_string(); app.apply_filter(); assert_eq!(app.filtered_indices.len(), 1); assert_eq!(app.entries[app.filtered_indices[0]].file_name, "alpha.mp4"); } + #[test] + fn kind_filter_multi_select() { + let entries = vec![ + make_entry_with_kind("video.mp4", MediaKind::Video), + make_entry_with_kind("song.mp3", MediaKind::Audio), + make_entry_with_kind("photo.jpg", MediaKind::Image), + make_entry_with_kind("doc.pdf", MediaKind::Document), + ]; + let tmp = tempfile::tempdir().unwrap(); + let thumb_cache = ThumbnailCache::new(10, tmp.path().to_path_buf()).unwrap(); + let picker = Picker::halfblocks(); + let mut app = App::new( + entries, + vec![], + PathBuf::from("/test"), + thumb_cache, + picker, + 4, + 5000, + ); + + // Video + audio enabled, image + doc disabled. + app.kind_filter = KindFilter { + video: true, + audio: true, + image: false, + doc: false, + }; + app.apply_filter(); + assert_eq!(app.filtered_indices.len(), 2); + } + + #[test] + fn kind_filter_empty_shows_nothing() { + let entries = vec![ + make_entry_with_kind("video.mp4", MediaKind::Video), + make_entry_with_kind("song.mp3", MediaKind::Audio), + ]; + let tmp = tempfile::tempdir().unwrap(); + let thumb_cache = ThumbnailCache::new(10, tmp.path().to_path_buf()).unwrap(); + let picker = Picker::halfblocks(); + let mut app = App::new( + entries, + vec![], + PathBuf::from("/test"), + thumb_cache, + picker, + 4, + 5000, + ); + + // All kinds disabled → empty list. 
+ app.kind_filter = KindFilter { + video: false, + audio: false, + image: false, + doc: false, + }; + app.apply_filter(); + assert!(app.filtered_indices.is_empty()); + assert!(app.kind_filter.is_empty()); + } + #[test] fn filter_mode_defaults_to_fuzzy() { let app = make_test_app(&["a.mp4"]); From f029fc2e0930f3974b44b5ad227da5f23f64f36b Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Tue, 3 Mar 2026 15:57:18 +0530 Subject: [PATCH 03/10] fix(tui): preserve selection when navigating back to parent directory Save current_dir before navigating to parent, then restore cursor to the child directory in the new dir_items list. Also fix apply_filter clamping to use visible_count() (dirs + media) instead of filtered_indices.len() (media only). --- src/tui/mod.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 36b2fdd..0aeacb0 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -258,9 +258,9 @@ impl App { FilterMode::Structured => self.apply_structured_filter(&kind_indices), } } - // Keep selected index in bounds - if self.selected >= self.filtered_indices.len() { - self.selected = self.filtered_indices.len().saturating_sub(1); + // Keep selected index in bounds (dirs + media) + if self.selected >= self.visible_count() { + self.selected = self.visible_count().saturating_sub(1); } } @@ -850,7 +850,11 @@ async fn handle_key(app: &mut App, key: KeyEvent) { if app.current_dir != app.root_dir && let Some(parent) = app.current_dir.parent().map(std::path::Path::to_path_buf) { + let child = app.current_dir.clone(); app.navigate_to_dir(parent); + if let Some(idx) = app.dir_items.iter().position(|d| *d == child) { + app.selected = idx; + } } } _ => {} From 87793a7a7f3bd3911c2fec3aac34d8d15abb3634 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Tue, 3 Mar 2026 16:34:34 +0530 Subject: [PATCH 04/10] feat(tui): sort folders by current sort key (dirs always on top) Folders now respect the user's sort key (name, size, 
modified) instead of always being alphabetical. Media-only keys (duration, resolution, codec, bitrate, pages) fall back to name sort for directories. Introduce DirItem struct to carry cached metadata (name, size, modified_at) alongside directory paths, avoiding re-stat on every sort. Decouple list_sibling_dirs from list_subdirs so the parent pane stays alphabetical while the current pane sorts dynamically. --- src/sort.rs | 112 +++++++++++++++++++++++++++++++++++++++++++++- src/tui/layout.rs | 5 +-- src/tui/mod.rs | 88 ++++++++++++++++++++++++++---------- src/types.rs | 21 +++++++++ 4 files changed, 196 insertions(+), 30 deletions(-) diff --git a/src/sort.rs b/src/sort.rs index 4cc1595..9c9aa48 100644 --- a/src/sort.rs +++ b/src/sort.rs @@ -1,7 +1,7 @@ -/// Sorting logic for media entries. +/// Sorting logic for media entries and directory items. /// /// Supports sorting by all metadata fields with configurable direction. -use crate::types::{MediaEntry, SortDir, SortKey}; +use crate::types::{DirItem, MediaEntry, SortDir, SortKey}; /// Parse a sort specification string (e.g., "`duration_ms:desc`", "name:asc"). /// @@ -60,6 +60,29 @@ pub fn sort_entries(entries: &mut [MediaEntry], key: SortKey, dir: SortDir) { }); } +/// Sort directory items in place by the given key and direction. +/// +/// Falls back to Name sort for media-only keys (Duration, Resolution, etc.). 
+pub fn sort_dir_items(dirs: &mut [DirItem], key: SortKey, dir: SortDir) { + let effective_key = if key.applies_to_dirs() { + key + } else { + SortKey::Name + }; + dirs.sort_by(|a, b| { + let cmp = match effective_key { + SortKey::Size => a.size_bytes.cmp(&b.size_bytes), + SortKey::Modified => a.modified_at.cmp(&b.modified_at), + // Name, Path, and any fallback: sort by lowercased name + _ => a.name_lower.cmp(&b.name_lower), + }; + match dir { + SortDir::Asc => cmp, + SortDir::Desc => cmp.reverse(), + } + }); +} + fn compare_by_key(a: &MediaEntry, b: &MediaEntry, key: SortKey) -> std::cmp::Ordering { match key { SortKey::Path => a.path.cmp(&b.path), @@ -414,4 +437,89 @@ mod tests { assert_eq!(entries[0].file_name, "no_duration.mp4"); assert_eq!(entries[1].file_name, "has_duration.mp4"); } + + // --- sort_dir_items --- + + fn make_dir_item(name: &str, size: u64, modified: Option) -> DirItem { + DirItem { + path: PathBuf::from(format!("/test/{name}")), + name: name.to_string(), + name_lower: name.to_lowercase(), + size_bytes: size, + modified_at: modified, + } + } + + #[test] + fn sort_dir_items_by_name_asc() { + let mut dirs = vec![ + make_dir_item("Zebra", 0, None), + make_dir_item("alpha", 0, None), + make_dir_item("middle", 0, None), + ]; + sort_dir_items(&mut dirs, SortKey::Name, SortDir::Asc); + let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect(); + assert_eq!(names, vec!["alpha", "middle", "Zebra"]); + } + + #[test] + fn sort_dir_items_by_name_desc() { + let mut dirs = vec![ + make_dir_item("alpha", 0, None), + make_dir_item("Zebra", 0, None), + make_dir_item("middle", 0, None), + ]; + sort_dir_items(&mut dirs, SortKey::Name, SortDir::Desc); + let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect(); + assert_eq!(names, vec!["Zebra", "middle", "alpha"]); + } + + #[test] + fn sort_dir_items_by_size_asc() { + let mut dirs = vec![ + make_dir_item("big", 300, None), + make_dir_item("small", 100, None), + make_dir_item("medium", 200, 
None), + ]; + sort_dir_items(&mut dirs, SortKey::Size, SortDir::Asc); + let sizes: Vec<u64> = dirs.iter().map(|d| d.size_bytes).collect(); + assert_eq!(sizes, vec![100, 200, 300]); + } + + #[test] + fn sort_dir_items_by_modified() { + use std::time::{Duration, SystemTime}; + let t1 = SystemTime::UNIX_EPOCH + Duration::from_secs(1000); + let t2 = SystemTime::UNIX_EPOCH + Duration::from_secs(2000); + let t3 = SystemTime::UNIX_EPOCH + Duration::from_secs(3000); + let mut dirs = vec![ + make_dir_item("newest", 0, Some(t3)), + make_dir_item("oldest", 0, Some(t1)), + make_dir_item("middle", 0, Some(t2)), + ]; + sort_dir_items(&mut dirs, SortKey::Modified, SortDir::Asc); + let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect(); + assert_eq!(names, vec!["oldest", "middle", "newest"]); + } + + #[test] + fn sort_dir_items_media_key_falls_back_to_name() { + let mut dirs = vec![ + make_dir_item("Zebra", 0, None), + make_dir_item("alpha", 0, None), + ]; + // Duration is media-only, should fall back to Name sort + sort_dir_items(&mut dirs, SortKey::Duration, SortDir::Asc); + let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect(); + assert_eq!(names, vec!["alpha", "Zebra"]); + + // Codec too + let mut dirs2 = vec![ + make_dir_item("Zebra", 0, None), + make_dir_item("alpha", 0, None), + ]; + sort_dir_items(&mut dirs2, SortKey::Codec, SortDir::Asc); + let names2: Vec<&str> = dirs2.iter().map(|d| d.name.as_str()).collect(); + assert_eq!(names2, vec!["alpha", "Zebra"]); + } } diff --git a/src/tui/layout.rs b/src/tui/layout.rs index ed0c929..8b91e82 100644 --- a/src/tui/layout.rs +++ b/src/tui/layout.rs @@ -161,10 +161,7 @@ fn render_file_list(frame: &mut Frame, app: &App, area: Rect) { if vis_idx < dir_count { // Directory item let dir = &app.dir_items[vis_idx]; - let name = dir - .file_name() - .map_or_else(|| ".".to_string(), |n| n.to_string_lossy().into_owned()); - let line = format!(" D {name}/"); + let line = format!(" D {}/", dir.name); let style = if 
is_selected { Style::default() .bg(Color::DarkGray) diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 0aeacb0..72eb7ae 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -9,9 +9,9 @@ pub mod triage; use crate::filter::Filter; use crate::playback::{MpvController, PlaybackState}; use crate::scan; -use crate::sort::sort_entries; +use crate::sort::{sort_dir_items, sort_entries}; use crate::thumbnail::ThumbnailCache; -use crate::types::{MediaEntry, MediaKind, ProbeError, SortDir, SortKey}; +use crate::types::{DirItem, MediaEntry, MediaKind, ProbeError, SortDir, SortKey}; use anyhow::{Context, Result}; use crossterm::ExecutableCommand; use crossterm::event::{self, Event, KeyCode, KeyEvent, KeyModifiers}; @@ -132,8 +132,8 @@ pub struct App { filter_mode: FilterMode, /// Last successfully parsed structured filter expression. filter_expr: Option, - /// Subdirectories of `current_dir`, sorted alphabetically. - dir_items: Vec, + /// Subdirectories of `current_dir`, sorted by current sort key. + dir_items: Vec, /// Cached sibling directories (for parent pane rendering). sibling_dirs: Vec, /// Receiver for async directory scan results. @@ -226,11 +226,11 @@ impl App { .and_then(|&idx| self.entries.get(idx)) } - /// Get the currently selected directory (if any). + /// Get the currently selected directory path (if any). #[must_use] pub fn selected_dir(&self) -> Option<&PathBuf> { if self.selected < self.dir_items.len() { - Some(&self.dir_items[self.selected]) + Some(&self.dir_items[self.selected].path) } else { None } @@ -328,8 +328,9 @@ impl App { } } - /// Apply current sort to entries and rebuild indices. + /// Apply current sort to entries and dir items, then rebuild indices. fn apply_sort(&mut self) { + sort_dir_items(&mut self.dir_items, self.sort_key, self.sort_dir); sort_entries(&mut self.entries, self.sort_key, self.sort_dir); self.apply_filter(); } @@ -513,6 +514,7 @@ impl App { /// Navigate to a directory: load subdirs, clear state, spawn async scan. 
fn navigate_to_dir(&mut self, path: PathBuf) { self.dir_items = list_subdirs(&path); + sort_dir_items(&mut self.dir_items, self.sort_key, self.sort_dir); self.sibling_dirs = list_sibling_dirs(&path); // Clear media state (but NOT mpv playback — per spec) @@ -578,23 +580,51 @@ impl App { } } -/// List subdirectories of a path, sorted alphabetically. -fn list_subdirs(path: &std::path::Path) -> Vec<PathBuf> { +/// List subdirectories of a path as `DirItem`s, sorted alphabetically by name. +fn list_subdirs(path: &std::path::Path) -> Vec<DirItem> { let Ok(entries) = std::fs::read_dir(path) else { return vec![]; }; - let mut dirs: Vec<PathBuf> = entries + let mut dirs: Vec<DirItem> = entries .flatten() - .filter(|e| e.path().is_dir()) - .map(|e| e.path()) + .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir())) + .filter_map(|e| { + let path = e.path(); + let name = path.file_name().map(|n| n.to_string_lossy().into_owned())?; + let name_lower = name.to_lowercase(); + let meta = e.metadata().ok(); + let size_bytes = meta.as_ref().map_or(0, std::fs::Metadata::len); + let modified_at = meta.as_ref().and_then(|m| m.modified().ok()); + Some(DirItem { + path, + name, + name_lower, + size_bytes, + modified_at, + }) + }) .collect(); - dirs.sort(); + dirs.sort_by(|a, b| a.name_lower.cmp(&b.name_lower)); dirs } /// List sibling directories (dirs in parent) for the parent pane. +/// +/// Returns plain `PathBuf`s since the parent pane is always alphabetical. fn list_sibling_dirs(path: &std::path::Path) -> Vec<PathBuf> { - path.parent().map_or_else(Vec::new, list_subdirs) + let Some(parent) = path.parent() else { + return vec![]; + }; + let Ok(entries) = std::fs::read_dir(parent) else { + return vec![]; + }; + let mut dirs: Vec<PathBuf> = entries + .flatten() + .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir())) + .map(|e| e.path()) + .collect(); + dirs.sort(); + dirs } /// Run the TUI application. 
@@ -852,7 +882,7 @@ async fn handle_key(app: &mut App, key: KeyEvent) { { let child = app.current_dir.clone(); app.navigate_to_dir(parent); - if let Some(idx) = app.dir_items.iter().position(|d| *d == child) { + if let Some(idx) = app.dir_items.iter().position(|d| d.path == child) { app.selected = idx; } } @@ -1018,6 +1048,19 @@ mod tests { } } + fn make_dir_item(path: &str) -> DirItem { + let name = std::path::Path::new(path) + .file_name() + .map_or_else(|| ".".to_string(), |n| n.to_string_lossy().into_owned()); + DirItem { + path: PathBuf::from(path), + name_lower: name.to_lowercase(), + name, + size_bytes: 0, + modified_at: None, + } + } + fn make_test_app(names: &[&str]) -> App { let entries: Vec = names.iter().map(|n| make_entry(n)).collect(); let tmp = tempfile::tempdir().unwrap(); @@ -1344,8 +1387,8 @@ mod tests { fn selected_dir_returns_path_when_dir_selected() { let mut app = make_test_app(&["a.mp4"]); app.dir_items = vec![ - PathBuf::from("/test/subdir1"), - PathBuf::from("/test/subdir2"), + make_dir_item("/test/subdir1"), + make_dir_item("/test/subdir2"), ]; app.selected = 0; assert_eq!(app.selected_dir(), Some(&PathBuf::from("/test/subdir1"))); @@ -1355,7 +1398,7 @@ mod tests { #[test] fn selected_entry_offsets_correctly() { let mut app = make_test_app(&["a.mp4", "b.mkv"]); - app.dir_items = vec![PathBuf::from("/test/subdir")]; + app.dir_items = vec![make_dir_item("/test/subdir")]; // selected=0 → directory app.selected = 0; assert!(app.selected_entry().is_none()); @@ -1378,7 +1421,7 @@ mod tests { #[test] fn visible_count_includes_dirs() { let mut app = make_test_app(&["a.mp4"]); - app.dir_items = vec![PathBuf::from("/test/d1"), PathBuf::from("/test/d2")]; + app.dir_items = vec![make_dir_item("/test/d1"), make_dir_item("/test/d2")]; assert_eq!(app.visible_count(), 3); // 2 dirs + 1 media } @@ -1408,10 +1451,7 @@ mod tests { std::fs::create_dir(root.join("middle")).unwrap(); let dirs = super::list_subdirs(root); - let names: Vec = dirs - .iter() - 
.map(|d| d.file_name().unwrap().to_string_lossy().into_owned()) - .collect(); + let names: Vec<&str> = dirs.iter().map(|d| d.name.as_str()).collect(); assert_eq!(names, vec!["alpha", "middle", "zebra"]); } @@ -1442,7 +1482,7 @@ mod tests { #[test] fn toggle_mark_on_dir_is_noop() { let mut app = make_test_app(&["a.mp4"]); - app.dir_items = vec![PathBuf::from("/d")]; + app.dir_items = vec![make_dir_item("/d")]; app.selected = 0; app.toggle_mark(); assert!(app.marked.is_empty()); diff --git a/src/types.rs b/src/types.rs index 7eee718..817024e 100644 --- a/src/types.rs +++ b/src/types.rs @@ -289,6 +289,18 @@ pub enum NdjsonRecord { }, } +/// Directory entry with cached metadata for sorting. +#[derive(Debug, Clone)] +pub struct DirItem { + pub path: PathBuf, + /// Display name (original case). + pub name: String, + /// Pre-lowercased name for case-insensitive sorting. + pub name_lower: String, + pub size_bytes: u64, + pub modified_at: Option, +} + /// Sort key for media entries. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SortKey { @@ -319,6 +331,15 @@ impl SortKey { } } + /// Whether this sort key applies to directories. + /// + /// Media-only keys (Duration, Resolution, Codec, Bitrate, Pages) return + /// `false` — callers should fall back to Name sort for dirs. + #[must_use] + pub fn applies_to_dirs(self) -> bool { + matches!(self, Self::Path | Self::Name | Self::Size | Self::Modified) + } + /// Cycle to next sort key. 
#[must_use] pub fn next(self) -> Self { From 68a4b067c671322e8bbf88a8082c9ff6c4f2a099 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:02:23 +0530 Subject: [PATCH 05/10] fix(document): bound all file reads to prevent OOM on crafted inputs - Cap zip XML reads at 4 MiB via `take(MAX_XML_BYTES)` in `read_xml_from_zip` - Cap text/CSV scanning at 256 MiB via `take(MAX_TEXT_SCAN_BYTES)` - Replace `BufReader::lines()` with byte-level counting in `probe_text_table` - Replace `BufReader::lines()` with capped `read_line` in `probe_text` Addresses security review comments about unbounded reads. --- src/document.rs | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/document.rs b/src/document.rs index bb6a129..e720656 100644 --- a/src/document.rs +++ b/src/document.rs @@ -3,9 +3,17 @@ /// All extractors are best-effort: failures are silently swallowed and /// logged at debug level. Follows the same pattern as `exif.rs`. use crate::types::DocumentInfo; -use std::io::{BufRead, BufReader}; +use std::io::{BufRead, BufReader, Read}; use std::path::Path; +/// Maximum bytes to read from a single XML file inside a zip archive (4 MiB). +/// Prevents OOM from crafted archives with enormous embedded XML. +const MAX_XML_BYTES: u64 = 4 * 1024 * 1024; + +/// Maximum bytes to scan from a text/CSV/TSV file (256 MiB). +/// Prevents scanning multi-GB log files or data dumps. +const MAX_TEXT_SCAN_BYTES: u64 = 256 * 1024 * 1024; + /// Extract metadata from a document file, dispatching by extension. /// /// Returns `None` on any failure (corrupt file, unsupported format, I/O error). 
@@ -88,9 +96,9 @@ fn pdf_object_to_string(obj: &lopdf::Object) -> Option { fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option { let file = std::fs::File::open(path).ok()?; let mut archive = zip::ZipArchive::new(file).ok()?; - let mut entry = archive.by_name(inner_path).ok()?; + let entry = archive.by_name(inner_path).ok()?; let mut contents = String::new(); - std::io::Read::read_to_string(&mut entry, &mut contents).ok()?; + entry.take(MAX_XML_BYTES).read_to_string(&mut contents).ok()?; Some(contents) } @@ -507,15 +515,11 @@ fn read_u32_le(data: &[u8], offset: usize) -> u32 { fn probe_text_table(path: &Path, ext: &str) -> Option { let file = std::fs::File::open(path).ok()?; - let reader = BufReader::new(file); - let mut line_count: u64 = 0; - - for line in reader.lines() { - if line.is_err() { - break; - } - line_count += 1; - } + let reader = BufReader::new(file.take(MAX_TEXT_SCAN_BYTES)); + let line_count = reader + .bytes() + .filter(|b| b.as_ref().is_ok_and(|&c| c == b'\n')) + .count() as u64; Some(DocumentInfo { format: ext.to_string(), @@ -526,16 +530,20 @@ fn probe_text_table(path: &Path, ext: &str) -> Option { fn probe_text(path: &Path, ext: &str) -> Option { let file = std::fs::File::open(path).ok()?; - let reader = BufReader::new(file); + let mut reader = BufReader::new(file.take(MAX_TEXT_SCAN_BYTES)); let mut line_count: u64 = 0; let mut word_count: u64 = 0; + let mut buf = String::new(); - for line in reader.lines() { - let Ok(line) = line else { - break; - }; - line_count += 1; - word_count += line.split_whitespace().count() as u64; + loop { + buf.clear(); + match reader.read_line(&mut buf) { + Ok(0) | Err(_) => break, + Ok(_) => { + line_count += 1; + word_count += buf.split_whitespace().count() as u64; + } + } } Some(DocumentInfo { From 408746e47fb3e79509465bfd742e8d0aa64741a9 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:02:43 +0530 Subject: [PATCH 06/10] fix(scan): wrap document probes in timeout to match media 
probe behavior Document probes had no timeout, unlike ffprobe-based media probes. A stuck document read (e.g., network-mounted PDF) would block the scan task indefinitely. --- src/scan.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/scan.rs b/src/scan.rs index a333d27..2d1eb82 100644 --- a/src/scan.rs +++ b/src/scan.rs @@ -125,7 +125,15 @@ pub async fn probe_files( .is_some_and(is_document_extension); tasks.spawn(async move { let result = if is_doc { - probe::probe_document_file(&file).await + tokio::time::timeout( + std::time::Duration::from_millis(timeout_ms), + probe::probe_document_file(&file), + ) + .await + .unwrap_or_else(|_| { + tracing::debug!(path = %file.display(), "document probe timed out"); + Err(anyhow::anyhow!("document probe timed out")) + }) } else { probe::probe_file(&file, timeout_ms).await }; From 72744d8a2dba09c17f22092bf55305cd28b3331e Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:03:10 +0530 Subject: [PATCH 07/10] fix(tui): follow symlinks in directory listings Replace `DirEntry::file_type().is_dir()` with `Path::is_dir()` in both `list_subdirs` and `list_sibling_dirs`. The former doesn't follow symlinks, causing symlinked directories to disappear from the TUI. 
--- src/tui/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 72eb7ae..7984089 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -587,7 +587,7 @@ fn list_subdirs(path: &std::path::Path) -> Vec<DirItem> { }; let mut dirs: Vec<DirItem> = entries .flatten() - .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir())) + .filter(|e| e.path().is_dir()) .filter_map(|e| { let path = e.path(); let name = path.file_name().map(|n| n.to_string_lossy().into_owned())?; @@ -620,7 +620,7 @@ fn list_sibling_dirs(path: &std::path::Path) -> Vec<PathBuf> { }; let mut dirs: Vec<PathBuf> = entries .flatten() - .filter(|e| e.file_type().is_ok_and(|ft| ft.is_dir())) + .filter(|e| e.path().is_dir()) .map(|e| e.path()) .collect(); dirs.sort(); From 5459c06791c0288d1059080b9fb6ead55e8f6462 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:04:36 +0530 Subject: [PATCH 08/10] refactor(document): extract parse_xml_text_fields helper Deduplicates the identical XML event-loop boilerplate shared by parse_core_xml and parse_app_xml. Each is now a thin wrapper that passes a closure for tag-to-field dispatch. 
--- src/document.rs | 109 ++++++++++++++++++++++-------------------------- 1 file changed, 50 insertions(+), 59 deletions(-) diff --git a/src/document.rs b/src/document.rs index e720656..71db5c3 100644 --- a/src/document.rs +++ b/src/document.rs @@ -136,36 +136,14 @@ fn parse_core_xml(xml: &str) -> OoxmlCoreProps { modified: None, }; - let mut reader = quick_xml::Reader::from_str(xml); - let mut buf = Vec::new(); - let mut current_tag = String::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { - current_tag = local_name(e.name().as_ref()); - } - Ok(quick_xml::events::Event::Text(ref e)) => { - let text = e.unescape().ok().map(|s| s.trim().to_owned()); - if let Some(val) = text.filter(|s| !s.is_empty()) { - match current_tag.as_str() { - "creator" => props.author = Some(val), - "title" => props.title = Some(val), - "subject" => props.subject = Some(val), - "created" => props.created = Some(val), - "modified" => props.modified = Some(val), - _ => {} - } - } - } - Ok(quick_xml::events::Event::End(_)) => { - current_tag.clear(); - } - Ok(quick_xml::events::Event::Eof) | Err(_) => break, - _ => {} - } - buf.clear(); - } + parse_xml_text_fields(xml, |tag, val| match tag { + "creator" => props.author = Some(val), + "title" => props.title = Some(val), + "subject" => props.subject = Some(val), + "created" => props.created = Some(val), + "modified" => props.modified = Some(val), + _ => {} + }); props } @@ -178,35 +156,13 @@ fn parse_app_xml(xml: &str) -> OoxmlAppProps { app_name: None, }; - let mut reader = quick_xml::Reader::from_str(xml); - let mut buf = Vec::new(); - let mut current_tag = String::new(); - - loop { - match reader.read_event_into(&mut buf) { - Ok(quick_xml::events::Event::Start(ref e) | quick_xml::events::Event::Empty(ref e)) => { - current_tag = local_name(e.name().as_ref()); - } - Ok(quick_xml::events::Event::Text(ref e)) => { - let text = 
e.unescape().ok().map(|s| s.trim().to_owned()); - if let Some(val) = text.filter(|s| !s.is_empty()) { - match current_tag.as_str() { - "Pages" => props.pages = val.parse().ok(), - "Words" => props.words = val.parse().ok(), - "Slides" => props.slides = val.parse().ok(), - "Application" => props.app_name = Some(val), - _ => {} - } - } - } - Ok(quick_xml::events::Event::End(_)) => { - current_tag.clear(); - } - Ok(quick_xml::events::Event::Eof) | Err(_) => break, - _ => {} - } - buf.clear(); - } + parse_xml_text_fields(xml, |tag, val| match tag { + "Pages" => props.pages = val.parse().ok(), + "Words" => props.words = val.parse().ok(), + "Slides" => props.slides = val.parse().ok(), + "Application" => props.app_name = Some(val), + _ => {} + }); props } @@ -556,6 +512,41 @@ fn probe_text(path: &Path, ext: &str) -> Option<DocumentInfo> { // ─── XML helpers ──────────────────────────────────────────────────────── +/// Run the `quick_xml` event loop and call `on_field` for each tag-text pair. +/// +/// Shared by `parse_core_xml` and `parse_app_xml` which differ only in +/// which tags they care about. Not used by `parse_odf_meta` which also +/// reads attributes. +fn parse_xml_text_fields(xml: &str, mut on_field: impl FnMut(&str, String)) { + use quick_xml::events::Event; + + let mut reader = quick_xml::Reader::from_str(xml); + let mut buf = Vec::new(); + let mut current_tag = String::new(); + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e) | Event::Empty(ref e)) => { + current_tag = local_name(e.name().as_ref()); + } + Ok(Event::Text(ref e)) => { + if let Some(val) = e + .unescape() + .ok() + .map(|s| s.trim().to_owned()) + .filter(|s| !s.is_empty()) + { + on_field(&current_tag, val); + } + } + Ok(Event::End(_)) => current_tag.clear(), + Ok(Event::Eof) | Err(_) => break, + _ => {} + } + buf.clear(); + } +} + /// Extract the local part of a possibly namespaced XML name. /// /// Examples: `dc:creator` becomes `creator`, `meta:creation-date` becomes `creation-date`. 
From 6fad9c507a8f449222bd317a22c48bf5e8ee6329 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:05:43 +0530 Subject: [PATCH 09/10] refactor(document): replace OLE2 magic numbers with named constants Adds OLEPS_*, VT_*, and PIDSI_* constants for the MS-OLEPS binary format values used in parse_summary_info. Makes the binary format parsing self-documenting. --- src/document.rs | 53 ++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/src/document.rs b/src/document.rs index 71db5c3..2c05a95 100644 --- a/src/document.rs +++ b/src/document.rs @@ -376,29 +376,44 @@ fn probe_ole2(path: &Path, ext: &str) -> Option { Some(info) } +// MS-OLEPS binary format constants +const OLEPS_BYTE_ORDER_LE: u16 = 0xFFFE; +const OLEPS_HEADER_MIN_LEN: usize = 48; +const OLEPS_SECTION_OFFSET_POS: usize = 44; +const OLEPS_MAX_PROPS: usize = 100; +const OLEPS_SECTION_HEADER_SIZE: usize = 8; +const OLEPS_PROP_ENTRY_SIZE: usize = 8; +const VT_I4: u32 = 0x03; +const VT_LPSTR: u32 = 0x1E; +// Property IDs from the Summary Information property set +const PIDSI_TITLE: u32 = 2; +const PIDSI_AUTHOR: u32 = 4; +const PIDSI_SUBJECT: u32 = 5; +const PIDSI_PAGECOUNT: u32 = 14; +const PIDSI_WORDCOUNT: u32 = 15; +const PIDSI_APPNAME: u32 = 18; + /// Best-effort extraction from OLE2 `SummaryInformation` stream. /// /// The stream uses MS-OLEPS binary format with property sets. -/// We extract string properties by well-known IDs -/// (2=Title, 4=Author, 5=Subject, 18=`AppName`). 
fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) { - if data.len() < 48 || read_u16_le(data, 0) != 0xFFFE { + if data.len() < OLEPS_HEADER_MIN_LEN || read_u16_le(data, 0) != OLEPS_BYTE_ORDER_LE { return; } - let section_offset = read_u32_le(data, 44) as usize; - if section_offset >= data.len() || section_offset + 8 > data.len() { + let section_offset = read_u32_le(data, OLEPS_SECTION_OFFSET_POS) as usize; + if section_offset + OLEPS_SECTION_HEADER_SIZE > data.len() { return; } let prop_count = read_u32_le(data, section_offset + 4) as usize; - if prop_count > 100 { + if prop_count > OLEPS_MAX_PROPS { return; } for i in 0..prop_count { - let entry_offset = section_offset + 8 + i * 8; - if entry_offset + 8 > data.len() { + let entry_offset = section_offset + OLEPS_SECTION_HEADER_SIZE + i * OLEPS_PROP_ENTRY_SIZE; + if entry_offset + OLEPS_PROP_ENTRY_SIZE > data.len() { break; } @@ -406,16 +421,15 @@ fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) { let prop_offset = read_u32_le(data, entry_offset + 4) as usize; let abs_offset = section_offset + prop_offset; - if abs_offset + 8 > data.len() { + if abs_offset + OLEPS_SECTION_HEADER_SIZE > data.len() { continue; } let prop_type = read_u32_le(data, abs_offset); - // VT_LPSTR = 0x1E - if prop_type == 0x1E { + if prop_type == VT_LPSTR { let str_len = read_u32_le(data, abs_offset + 4) as usize; - let str_start = abs_offset + 8; + let str_start = abs_offset + OLEPS_SECTION_HEADER_SIZE; if str_start + str_len <= data.len() { let raw = &data[str_start..str_start + str_len]; let s = String::from_utf8_lossy(raw) @@ -424,23 +438,22 @@ fn parse_summary_info(data: &[u8], info: &mut DocumentInfo) { .to_owned(); if !s.is_empty() { match prop_id { - 2 => info.title = Some(s), - 4 => info.author = Some(s), - 5 => info.subject = Some(s), - 18 => info.creator_app = Some(s), + PIDSI_TITLE => info.title = Some(s), + PIDSI_AUTHOR => info.author = Some(s), + PIDSI_SUBJECT => info.subject = Some(s), + PIDSI_APPNAME => 
info.creator_app = Some(s), _ => {} } } } } - // VT_I4 = 0x03 - if prop_type == 0x03 && abs_offset + 8 <= data.len() { + if prop_type == VT_I4 && abs_offset + OLEPS_SECTION_HEADER_SIZE <= data.len() { let val = read_u32_le(data, abs_offset + 4); if val > 0 { match prop_id { - 14 => info.page_count = Some(val), - 15 => info.word_count = Some(u64::from(val)), + PIDSI_PAGECOUNT => info.page_count = Some(val), + PIDSI_WORDCOUNT => info.word_count = Some(u64::from(val)), _ => {} } } From 35cfb2edd2fbb4b110b0aa6407d966ce654edc42 Mon Sep 17 00:00:00 2001 From: Pushkar Patel Date: Wed, 4 Mar 2026 14:06:54 +0530 Subject: [PATCH 10/10] refactor(tui): extract toggle_kind_filter method Reduces the four kind-filter match arms (keys 2-5) to one-liners by extracting the shared toggle-apply-check-status pattern into a method on App. --- src/document.rs | 5 ++++- src/tui/mod.rs | 41 +++++++++++++---------------------------- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/src/document.rs b/src/document.rs index 2c05a95..ec68a78 100644 --- a/src/document.rs +++ b/src/document.rs @@ -98,7 +98,10 @@ fn read_xml_from_zip(path: &Path, inner_path: &str) -> Option { let mut archive = zip::ZipArchive::new(file).ok()?; let entry = archive.by_name(inner_path).ok()?; let mut contents = String::new(); - entry.take(MAX_XML_BYTES).read_to_string(&mut contents).ok()?; + entry + .take(MAX_XML_BYTES) + .read_to_string(&mut contents) + .ok()?; Some(contents) } diff --git a/src/tui/mod.rs b/src/tui/mod.rs index 7984089..7f88a9d 100644 --- a/src/tui/mod.rs +++ b/src/tui/mod.rs @@ -441,6 +441,15 @@ impl App { self.set_status(format!("Sort: {} {dir_label}", self.sort_key.label())); } + /// Toggle one kind in the filter, re-apply, and warn if nothing selected. 
+ fn toggle_kind_filter(&mut self, toggle: fn(&mut KindFilter)) { + toggle(&mut self.kind_filter); + self.apply_filter(); + if self.kind_filter.is_empty() { + self.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); + } + } + /// Start background thumbnail fetch for the currently selected entry. /// Skips if already loading the same path or if the file has no video. fn kick_thumbnail_fetch(&mut self) { @@ -823,34 +832,10 @@ async fn handle_key(app: &mut App, key: KeyEvent) { app.kind_filter = KindFilter::ALL; app.apply_filter(); } - (KeyCode::Char('2'), _) => { - app.kind_filter.video = !app.kind_filter.video; - app.apply_filter(); - if app.kind_filter.is_empty() { - app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); - } - } - (KeyCode::Char('3'), _) => { - app.kind_filter.audio = !app.kind_filter.audio; - app.apply_filter(); - if app.kind_filter.is_empty() { - app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); - } - } - (KeyCode::Char('4'), _) => { - app.kind_filter.image = !app.kind_filter.image; - app.apply_filter(); - if app.kind_filter.is_empty() { - app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); - } - } - (KeyCode::Char('5'), _) => { - app.kind_filter.doc = !app.kind_filter.doc; - app.apply_filter(); - if app.kind_filter.is_empty() { - app.set_status("Kind: nothing selected \u{2014} 1 to show all".to_string()); - } - } + (KeyCode::Char('2'), _) => app.toggle_kind_filter(|kf| kf.video = !kf.video), + (KeyCode::Char('3'), _) => app.toggle_kind_filter(|kf| kf.audio = !kf.audio), + (KeyCode::Char('4'), _) => app.toggle_kind_filter(|kf| kf.image = !kf.image), + (KeyCode::Char('5'), _) => app.toggle_kind_filter(|kf| kf.doc = !kf.doc), // Playback (KeyCode::Char('p'), _) => handle_playback(app).await, (KeyCode::Char('P'), _) => {