From e208c514123aa2a8a98aa3c79501ca5363280632 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:23:20 +0800 Subject: [PATCH 1/7] docs: add bulk-convert design spec --- .../specs/2026-05-27-bulk-convert-design.md | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-27-bulk-convert-design.md diff --git a/docs/superpowers/specs/2026-05-27-bulk-convert-design.md b/docs/superpowers/specs/2026-05-27-bulk-convert-design.md new file mode 100644 index 0000000..ed2417b --- /dev/null +++ b/docs/superpowers/specs/2026-05-27-bulk-convert-design.md @@ -0,0 +1,109 @@ +# bulk-convert 命令设计 + +## 动机 + +原 `kbmate convert` 只支持单个文件转换。当需要批量处理目录下或文件列表中的多个 PDF/DOCX 时,需要新增 `bulk-convert` 子命令,职责分离,不污染原命令。 + +## 命令签名 + +``` +kbmate bulk-convert -r <目录> [--output-dir <路径>] [--output-layout {flat,mirror}] +kbmate bulk-convert -f <文件列表路径> [--output-dir <路径>] [--output-layout {flat,mirror}] +``` + +- `-r` / `--recursive`: 递归扫描目录下所有 `.pdf`/`.docx` 文件 +- `-f` / `--file-list`: 从文本文件中按行读取文件地址(支持本地路径和 URL) +- `-r` 和 `-f` 互斥 +- `--output-dir`: 输出根目录,默认 `raw` +- `--output-layout`: 输出结构布局,`flat`(默认)或 `mirror` + +## 架构 + +```python +convert(source_file, output_dir) # 原命令不变 + └─ convert_single(source_file, output_dir) + +bulk-convert(-r DIR | -f FILE, output_dir, layout) # 新命令 + ├─ _collect_files_from_dir(dir) / _collect_files_from_list(filelist) + └─ for each file: + └─ convert_single(... relative_to=...) +``` + +### convert_single 函数 + +提取现有 `convert` 命令的核心逻辑为可复用的函数: + +```python +def convert_single( + source_path: Path, + output_dir: Path, + layout: Literal["flat", "mirror"] = "flat", + relative_to: Path | None = None, +) -> None: +``` + +- 处理单个文件的转换(不含 URL 解析 — URL 在调用前已完成下载) +- `layout="flat"`: 输出文件直接放入 `converts/` 和 `assets/` +- `layout="mirror"`: 根据 `relative_to` 保持目录层次 + +## 文件变更 + +### src/kbmate_cli/main.py + +1. 提取 `convert_single()` 函数 +2. 原 `convert` 命令体内改为调用 `convert_single()` +3. 新增 `bulk_convert` 命令 +4. 新增辅助函数: + - `_collect_files_from_dir(root: Path) -> list[Path]` — 递归扫描 + - `_collect_files_from_list(filelist: Path) -> list[str]` — 读取文件列表 + +### tests/test_cli.py + +新增测试(使用 typer CliRunner): +- `test_bulk_convert_recursive_dir_flat` +- `test_bulk_convert_recursive_dir_mirror` +- `test_bulk_convert_file_list` +- `test_bulk_convert_file_list_with_urls` +- `test_bulk_convert_mutual_exclusive_r_and_f` +- `test_bulk_convert_invalid_dir` + +## 输出路径规则 + +假设输入目录为 `docs/`,包含 `docs/sub/a.pdf` 和 `docs/b.docx`: + +### flat 模式 (默认) + +``` +raw/ +├── converts/ +│ ├── a.md +│ └── b.md +└── assets/ + ├── a/ + │ └── ... (图片文件) + └── b/ + └── ... +``` + +### mirror 模式 + +``` +raw/ +├── converts/ +│ ├── sub/ +│ │ └── a.md +│ └── b.md +└── assets/ + ├── sub/ + │ └── a/ + │ └── ... + └── b/ + └── ... +``` + +## 错误处理 + +- 单个文件转换失败不应中断整个批量过程,记录错误并继续 +- 汇总报告:批量结束后显示成功/失败计数 +- `-r` 指定的目录不存在时立即报错退出 +- `-f` 指定的文件不存在或格式错误时立即报错退出 From a66755446cb424009ed0d0f460c97371ac9623f5 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:35:10 +0800 Subject: [PATCH 2/7] refactor: extract convert_single() from convert command --- src/kbmate_cli/main.py | 62 +++++++++++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py index a416752..50dcd0e 100644 --- a/src/kbmate_cli/main.py +++ b/src/kbmate_cli/main.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Literal from urllib.error import URLError from urllib.parse import urlparse @@ -55,6 +56,44 @@ def _convert_docx(src: Path, assets_dir: Path) -> str: } +def convert_single( + source_path: Path, + output_dir: Path, + *, + layout: Literal["flat", "mirror"] = "flat", + relative_to: Path | None = None, +) -> None: + ext = source_path.suffix.lower() + converter = _CONVERTERS.get(ext) + if converter is None: + fmts = ", ".join(_CONVERTERS) + typer.echo(f"Error: unsupported format: {ext} (supported: {fmts})", err=True) + raise typer.Exit(code=1) + + assets_parent = output_dir / "assets" + + if layout == "mirror" and relative_to is not None: + try: + rel = source_path.relative_to(relative_to).parent + except ValueError: + rel = Path(".") + else: + rel = Path(".") + + sanitized_ref, _ = _md_path(str(assets_parent / rel), f"{source_path.stem}.x") + safe_stem = Path(sanitized_ref).stem + + assets_dir = assets_parent / rel / safe_stem + converts_dir = output_dir / "converts" / rel + converts_dir.mkdir(parents=True, exist_ok=True) + assets_dir.mkdir(parents=True, exist_ok=True) + + markdown_content = converter(source_path, assets_dir) + md_path = converts_dir / f"{safe_stem}.md" + md_path.write_text(markdown_content, encoding="utf-8") + typer.echo(f"Converted: {source_path} -> {md_path}") + + @app.command() def convert( source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"), @@ -71,29 +110,8 @@ def convert( typer.echo(f"Error: file not found: {source_file}", err=True) raise typer.Exit(code=1) - ext = src.suffix.lower() - converter = _CONVERTERS.get(ext) - if converter is None: - fmts = ", ".join(_CONVERTERS) - typer.echo(f"Error: unsupported format: {ext} (supported: {fmts})", err=True) - raise typer.Exit(code=1) - - out_dir = Path(output_dir) - assets_parent = out_dir / "assets" - - sanitized_ref, _ = _md_path(str(assets_parent), f"{src.stem}.x") - safe_stem = Path(sanitized_ref).stem - - assets_dir = assets_parent / safe_stem - converts_dir = out_dir / "converts" - converts_dir.mkdir(parents=True, exist_ok=True) - assets_dir.mkdir(parents=True, exist_ok=True) - try: - markdown_content = converter(src, assets_dir) - md_path = converts_dir / f"{safe_stem}.md" - md_path.write_text(markdown_content, encoding="utf-8") - typer.echo(f"Converted: {src} -> {md_path}") + convert_single(src, Path(output_dir)) finally: if temp_path: print_cleanup_hint(temp_path) From 2e38ea59885c94f80bdca128044960cf5a03efb6 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:38:54 +0800 Subject: [PATCH 3/7] feat: add file collection helpers for bulk-convert --- src/kbmate_cli/main.py | 20 +++++++++++++++++ tests/test_cli.py | 49 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py index 50dcd0e..4521cdd 100644 --- a/src/kbmate_cli/main.py +++ b/src/kbmate_cli/main.py @@ -28,6 +28,26 @@ def _resolve_source(source_file: str) -> tuple[str, Path | None]: return str(temp_path), temp_path +SUPPORTED_EXTENSIONS = {".pdf", ".docx"} + + +def _collect_files_from_dir(root: Path) -> list[Path]: + if not root.is_dir(): + raise NotADirectoryError(f"not a directory: {root}") + files: list[Path] = [] + for p in root.rglob("*"): + if p.suffix.lower() in SUPPORTED_EXTENSIONS and p.is_file(): + files.append(p) + return files + + +def _collect_files_from_list(filelist: Path) -> list[str]: + if not filelist.is_file(): + raise FileNotFoundError(f"file not found: {filelist}") + lines = filelist.read_text(encoding="utf-8").splitlines() + return [line.strip() for line in lines if line.strip()] + + def _convert_pdf(src: Path, assets_dir: Path) -> str: from kbmate_cli.pdf_converter import convert_pdf from kbmate_cli.image_helper import extract_and_relink_images diff --git a/tests/test_cli.py b/tests/test_cli.py index 581ecbf..45c5ce6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,7 +3,8 @@ import pytest from typer.testing import CliRunner -from kbmate_cli.main import app +from kbmate_cli.main import app, _collect_files_from_dir, _collect_files_from_list +import tempfile runner = CliRunner() @@ -74,3 +75,49 @@ def test_convert_http_url(mock_urlopen): ) assert result.exit_code == 0, f"Failed with output: {result.output}" assert "临时文件已保存至" in result.stdout + + +def test_collect_files_from_dir(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "sub").mkdir() + pdf1 = root / "a.pdf" + docx1 = root / "sub" / "b.docx" + txt1 = root / "c.txt" + pdf1.write_text("fake pdf") + docx1.write_text("fake docx") + txt1.write_text("fake txt") + + files = _collect_files_from_dir(root) + assert len(files) == 2 + assert pdf1 in files + assert docx1 in files + assert txt1 not in files + + +def test_collect_files_from_dir_empty(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + files = _collect_files_from_dir(root) + assert files == [] + + +def test_collect_files_from_list_local(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("/path/to/a.pdf\n/path/to/b.docx\n") + result = _collect_files_from_list(filelist) + assert result == ["/path/to/a.pdf", "/path/to/b.docx"] + + +def test_collect_files_from_list_urls(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("https://example.com/doc.pdf\n/path/to/local.docx\n") + result = _collect_files_from_list(filelist) + assert result == ["https://example.com/doc.pdf", "/path/to/local.docx"] + + +def test_collect_files_from_list_empty_lines(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("/path/to/a.pdf\n\n/path/to/b.docx\n \n") + result = _collect_files_from_list(filelist) + assert result == ["/path/to/a.pdf", "/path/to/b.docx"] From f23f9a78a33c282078d525499a492815472a3b46 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:42:10 +0800 Subject: [PATCH 4/7] feat: add bulk-convert command with -r and -f --- src/kbmate_cli/main.py | 63 ++++++++++++++++++++++++++- tests/test_cli.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py index 4521cdd..dfe1326 100644 --- a/src/kbmate_cli/main.py +++ b/src/kbmate_cli/main.py @@ -87,8 +87,7 @@ def convert_single( converter = _CONVERTERS.get(ext) if converter is None: fmts = ", ".join(_CONVERTERS) - typer.echo(f"Error: unsupported format: {ext} (supported: {fmts})", err=True) - raise typer.Exit(code=1) + raise ValueError(f"unsupported format: {ext} (supported: {fmts})") assets_parent = output_dir / "assets" @@ -132,10 +131,70 @@ def convert( try: convert_single(src, Path(output_dir)) + except ValueError as e: + typer.echo(f"Error: {e}", err=True) + raise typer.Exit(code=1) finally: if temp_path: print_cleanup_hint(temp_path) +@app.command() +def bulk_convert( + recursive: str = typer.Option(None, "-r", "--recursive", help="Directory to scan recursively"), + file_list: str = typer.Option(None, "-f", "--file-list", help="File containing one source per line"), + output_dir: str = typer.Option("raw", help="Output directory"), + output_layout: str = typer.Option("flat", "--output-layout", help="Output layout: flat or mirror"), +): + if (recursive is not None) == (file_list is not None): + typer.echo("Error: specify either -r (directory) or -f (file list), not both", err=True) + raise typer.Exit(code=1) + + if output_layout not in ("flat", "mirror"): + typer.echo("Error: --output-layout must be 'flat' or 'mirror'", err=True) + raise typer.Exit(code=1) + + out = Path(output_dir) + + if recursive is not None: + root = Path(recursive) + if not root.is_dir(): + typer.echo(f"Error: not a directory: {root}", err=True) + raise typer.Exit(code=1) + for src_path in _collect_files_from_dir(root): + try: + convert_single(src_path, out, layout=output_layout, relative_to=root) + except ValueError as e: + typer.echo(f"Error converting {src_path}: {e}", err=True) + else: + assert file_list is not None + flist = Path(file_list) + try: + lines = _collect_files_from_list(flist) + except FileNotFoundError as e: + typer.echo(f"Error: {e}", err=True) + raise typer.Exit(code=1) + + for line in lines: + try: + local_path, temp_path = _resolve_source(line) + except (ValueError, URLError) as e: + typer.echo(f"Error processing '{line}': {e}", err=True) + continue + src = Path(local_path) + if not src.exists(): + typer.echo(f"Error: file not found: {local_path}", err=True) + if temp_path: + print_cleanup_hint(temp_path) + continue + try: + convert_single(src, out, layout=output_layout) + except ValueError as e: + typer.echo(f"Error converting {src}: {e}", err=True) + finally: + if temp_path: + print_cleanup_hint(temp_path) + + if __name__ == "__main__": app() diff --git a/tests/test_cli.py b/tests/test_cli.py index 45c5ce6..0482317 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -121,3 +121,101 @@ def test_collect_files_from_list_empty_lines(tmp_path): filelist.write_text("/path/to/a.pdf\n\n/path/to/b.docx\n \n") result = _collect_files_from_list(filelist) assert result == ["/path/to/a.pdf", "/path/to/b.docx"] + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_recursive_dir_flat(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf2 = root / "sub" / "b.pdf" + pdf1.write_text("fake") + (root / "sub").mkdir() + pdf2.write_text("fake") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + assert (out / "converts" / "b.md").exists() + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_recursive_dir_mirror(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf2 = root / "sub" / "b.pdf" + pdf1.write_text("fake") + (root / "sub").mkdir() + pdf2.write_text("fake") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + "--output-layout", "mirror", + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + assert (out / "converts" / "sub" / "b.md").exists() + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_file_list(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf1.write_text("fake") + flist = root / "sources.txt" + flist.write_text(str(pdf1) + "\n") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-f", str(flist), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + + +@patch("kbmate_cli.main._resolve_source") +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_file_list_with_url(mock_resolve): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf_local = root / "downloaded.pdf" + pdf_local.write_text("fake") + mock_resolve.return_value = (str(pdf_local), None) + + flist = root / "sources.txt" + flist.write_text("https://example.com/doc.pdf\n") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-f", str(flist), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "downloaded.md").exists() + + +def test_bulk_convert_mutual_exclusive(tmp_path): + result = runner.invoke(app, [ + "bulk-convert", "-r", str(tmp_path), "-f", str(tmp_path / "list.txt"), + ]) + assert result.exit_code != 0 + + +def test_bulk_convert_no_input(): + result = runner.invoke(app, ["bulk-convert"]) + assert result.exit_code != 0 + + +def test_bulk_convert_help(): + result = runner.invoke(app, ["bulk-convert", "--help"]) + assert result.exit_code == 0 + assert "bulk" in result.stdout.lower() or "-r" in result.stdout From 7cfac182e949208ccfde1da7838cb1599edfa724 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:48:09 +0800 Subject: [PATCH 5/7] test: improve bulk_convert tests (error msg, batch continuation, invalid layout) --- tests/test_cli.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0482317..18e03c9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -213,6 +213,38 @@ def test_bulk_convert_mutual_exclusive(tmp_path): def test_bulk_convert_no_input(): result = runner.invoke(app, ["bulk-convert"]) assert result.exit_code != 0 + assert "specify either" in result.stderr.lower() + + +def test_bulk_convert_invalid_layout(tmp_path): + result = runner.invoke(app, [ + "bulk-convert", "-r", str(tmp_path), + "--output-layout", "invalid", + ]) + assert result.exit_code != 0 + assert "must be 'flat' or 'mirror'" in result.stderr.lower() + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(side_effect=[ValueError("mock fail"), "# mock"])}) +def test_bulk_convert_continue_on_error(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf2 = root / "b.pdf" + pdf1.write_text("fake") + pdf2.write_text("fake") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + ]) + # Batch continues despite one failure + assert result.exit_code == 0 + assert "Error converting" in result.stderr + # Exactly one file should have been converted (one fails, one succeeds) + converted = list(out.rglob("*.md")) + assert len(converted) == 1 def test_bulk_convert_help(): From 1ed5c0b6000ed07d7bdee5a6f241afe81ce814f0 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 09:50:28 +0800 Subject: [PATCH 6/7] fix: reject --output-layout mirror with -f (only supported with -r) --- .../plans/2026-05-27-bulk-convert-plan.md | 482 ++++++++++++++++++ pyproject.toml | 2 +- src/kbmate_cli/main.py | 4 + tests/test_cli.py | 11 + 4 files changed, 498 insertions(+), 1 deletion(-) create mode 100644 docs/superpowers/plans/2026-05-27-bulk-convert-plan.md diff --git a/docs/superpowers/plans/2026-05-27-bulk-convert-plan.md b/docs/superpowers/plans/2026-05-27-bulk-convert-plan.md new file mode 100644 index 0000000..de1a007 --- /dev/null +++ b/docs/superpowers/plans/2026-05-27-bulk-convert-plan.md @@ -0,0 +1,482 @@ +# bulk-convert Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add `kbmate bulk-convert` command to batch-convert PDF/DOCX files from a directory (`-r`) or a file list (`-f`), with flat/mirror output layout support. + +**Architecture:** Extract a shared `convert_single()` function from the existing `convert` command. Add two file-collection helpers. Add the `bulk_convert` typer command that uses them both. `convert` becomes a thin wrapper around `convert_single`. + +**Tech Stack:** Python 3.12+, typer, pymupdf4llm, pypandoc, pytest + +--- + +### Task 1: Extract `convert_single()` and refactor `convert` + +**Files:** +- Modify: `src/kbmate_cli/main.py:58-99` +- Modify: `src/kbmate_cli/main.py` (add `from typing import Literal` to imports) +- Test: `tests/test_cli.py` + +- [ ] **Step 1: Add `convert_single()` function** + +Add to `src/kbmate_cli/main.py` after line 55 (after `_CONVERTERS` dict): + +```python +from typing import Literal + + +def convert_single( + source_path: Path, + output_dir: Path, + *, + layout: Literal["flat", "mirror"] = "flat", + relative_to: Path | None = None, +) -> None: + ext = source_path.suffix.lower() + converter = _CONVERTERS.get(ext) + if converter is None: + fmts = ", ".join(_CONVERTERS) + typer.echo(f"Error: unsupported format: {ext} (supported: {fmts})", err=True) + raise typer.Exit(code=1) + + assets_parent = output_dir / "assets" + + if layout == "mirror" and relative_to is not None: + try: + rel = source_path.relative_to(relative_to).parent + except ValueError: + rel = Path(".") + else: + rel = Path(".") + + sanitized_ref, _ = _md_path(str(assets_parent / rel), f"{source_path.stem}.x") + safe_stem = Path(sanitized_ref).stem + + assets_dir = assets_parent / rel / safe_stem + converts_dir = output_dir / "converts" / rel + converts_dir.mkdir(parents=True, exist_ok=True) + assets_dir.mkdir(parents=True, exist_ok=True) + + markdown_content = converter(source_path, assets_dir) + md_path = converts_dir / f"{safe_stem}.md" + md_path.write_text(markdown_content, encoding="utf-8") + typer.echo(f"Converted: {source_path} -> {md_path}") +``` + +- [ ] **Step 2: Refactor `convert` command to use `convert_single()`** + +Replace the `convert` command body (lines 62-99) with: + +```python +@app.command() +def convert( + source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"), + output_dir: str = typer.Option("raw", help="Output directory"), +): + try: + source_file, temp_path = _resolve_source(source_file) + except (ValueError, URLError) as e: + typer.echo(f"Error: {e}", err=True) + raise typer.Exit(code=1) + + src = Path(source_file) + if not src.exists(): + typer.echo(f"Error: file not found: {source_file}", err=True) + raise typer.Exit(code=1) + + try: + convert_single(src, Path(output_dir)) + finally: + if temp_path: + print_cleanup_hint(temp_path) +``` + +- [ ] **Step 3: Run existing tests to verify refactor didn't break anything** + +Run: `uv run pytest tests/test_cli.py -v` +Expected: All 6 tests PASS + +- [ ] **Step 4: Commit** + +```bash +git add src/kbmate_cli/main.py +git commit -m "refactor: extract convert_single() from convert command" +``` + +--- + +### Task 2: Add `_collect_files_from_dir()` and `_collect_files_from_list()` + +**Files:** +- Modify: `src/kbmate_cli/main.py` (add helper functions) +- Test: `tests/test_cli.py` + +- [ ] **Step 1: Write the failing tests** + +Add to `tests/test_cli.py`: + +```python +from kbmate_cli.main import _collect_files_from_dir, _collect_files_from_list +import tempfile + + +def test_collect_files_from_dir(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + (root / "sub").mkdir() + pdf1 = root / "a.pdf" + docx1 = root / "sub" / "b.docx" + txt1 = root / "c.txt" + pdf1.write_text("fake pdf") + docx1.write_text("fake docx") + txt1.write_text("fake txt") + + files = _collect_files_from_dir(root) + assert len(files) == 2 + assert pdf1 in files + assert docx1 in files + assert txt1 not in files + + +def test_collect_files_from_dir_empty(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + files = _collect_files_from_dir(root) + assert files == [] + + +def test_collect_files_from_list_local(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("/path/to/a.pdf\n/path/to/b.docx\n") + result = _collect_files_from_list(filelist) + assert result == ["/path/to/a.pdf", "/path/to/b.docx"] + + +def test_collect_files_from_list_urls(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("https://example.com/doc.pdf\n/path/to/local.docx\n") + result = _collect_files_from_list(filelist) + assert result == ["https://example.com/doc.pdf", "/path/to/local.docx"] + + +def test_collect_files_from_list_empty_lines(tmp_path): + filelist = tmp_path / "sources.txt" + filelist.write_text("/path/to/a.pdf\n\n/path/to/b.docx\n \n") + result = _collect_files_from_list(filelist) + assert result == ["/path/to/a.pdf", "/path/to/b.docx"] +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_cli.py::test_collect_files_from_dir tests/test_cli.py::test_collect_files_from_dir_empty tests/test_cli.py::test_collect_files_from_list_local tests/test_cli.py::test_collect_files_from_list_urls tests/test_cli.py::test_collect_files_from_list_empty_lines -v` +Expected: FAIL with ImportError (function not defined) + +- [ ] **Step 3: Implement helper functions** + +Add after `_resolve_source` in `src/kbmate_cli/main.py` (after line 27): + +```python +SUPPORTED_EXTENSIONS = {".pdf", ".docx"} + + +def _collect_files_from_dir(root: Path) -> list[Path]: + if not root.is_dir(): + raise NotADirectoryError(f"not a directory: {root}") + files: list[Path] = [] + for p in root.rglob("*"): + if p.suffix.lower() in SUPPORTED_EXTENSIONS and p.is_file(): + files.append(p) + return files + + +def _collect_files_from_list(filelist: Path) -> list[str]: + if not filelist.is_file(): + raise FileNotFoundError(f"file not found: {filelist}") + lines = filelist.read_text(encoding="utf-8").splitlines() + return [line.strip() for line in lines if line.strip()] +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_cli.py::test_collect_files_from_dir tests/test_cli.py::test_collect_files_from_dir_empty tests/test_cli.py::test_collect_files_from_list_local tests/test_cli.py::test_collect_files_from_list_urls tests/test_cli.py::test_collect_files_from_list_empty_lines -v` +Expected: All 5 PASS + +- [ ] **Step 5: Commit** + +```bash +git add src/kbmate_cli/main.py tests/test_cli.py +git commit -m "feat: add file collection helpers for bulk-convert" +``` + +--- + +### Task 3: Add `bulk_convert` command + +**Files:** +- Modify: `src/kbmate_cli/main.py` (add command) +- Test: `tests/test_cli.py` + +- [ ] **Step 1: Write failing tests for bulk_convert** + +Add to `tests/test_cli.py`: + +```python +@patch("kbmate_cli.main._convert_pdf", return_value="# mock") +def test_bulk_convert_recursive_dir_flat(mock_convert): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf2 = root / "sub" / "b.pdf" + pdf1.write_text("fake") + (root / "sub").mkdir() + pdf2.write_text("fake") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + assert (out / "converts" / "b.md").exists() + + +@patch("kbmate_cli.main._convert_pdf", return_value="# mock") +def test_bulk_convert_recursive_dir_mirror(mock_convert): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf2 = root / "sub" / "b.pdf" + pdf1.write_text("fake") + (root / "sub").mkdir() + pdf2.write_text("fake") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + "--output-layout", "mirror", + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + assert (out / "converts" / "sub" / "b.md").exists() + + +@patch("kbmate_cli.main._convert_pdf", return_value="# mock") +def test_bulk_convert_file_list(mock_convert): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf1 = root / "a.pdf" + pdf1.write_text("fake") + flist = root / "sources.txt" + flist.write_text(str(pdf1) + "\n") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-f", str(flist), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "a.md").exists() + + +@patch("kbmate_cli.main._resolve_source") +@patch("kbmate_cli.main._convert_pdf", return_value="# mock") +def test_bulk_convert_file_list_with_url(mock_convert, mock_resolve): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf_local = root / "downloaded.pdf" + pdf_local.write_text("fake") + mock_resolve.return_value = (str(pdf_local), None) + + flist = root / "sources.txt" + flist.write_text("https://example.com/doc.pdf\n") + + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-f", str(flist), + "--output-dir", str(out), + ]) + assert result.exit_code == 0 + assert (out / "converts" / "downloaded.md").exists() + + +def test_bulk_convert_mutual_exclusive(tmp_path): + result = runner.invoke(app, [ + "bulk-convert", "-r", str(tmp_path), "-f", str(tmp_path / "list.txt"), + ]) + assert result.exit_code != 0 + + +def test_bulk_convert_no_input(): + result = runner.invoke(app, ["bulk-convert"]) + assert result.exit_code != 0 + assert "either" in result.stdout.lower() or "required" in result.stdout.lower() + + +def test_bulk_convert_help(): + result = runner.invoke(app, ["bulk-convert", "--help"]) + assert result.exit_code == 0 + assert "Bulk" in result.stdout or "bulk" in result.stdout +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `uv run pytest tests/test_cli.py::test_bulk_convert_recursive_dir_flat tests/test_cli.py::test_bulk_convert_recursive_dir_mirror tests/test_cli.py::test_bulk_convert_file_list tests/test_cli.py::test_bulk_convert_file_list_with_url tests/test_cli.py::test_bulk_convert_mutual_exclusive tests/test_cli.py::test_bulk_convert_no_input tests/test_cli.py::test_bulk_convert_help -v` +Expected: FAIL with "No such command 'bulk-convert'" + +- [ ] **Step 3: Implement `bulk_convert` command** + +Add to `src/kbmate_cli/main.py`, after the `convert` command: + +```python +@app.command() +def bulk_convert( + recursive: str = typer.Option(None, "-r", "--recursive", help="Directory to scan recursively"), + file_list: str = typer.Option(None, "-f", "--file-list", help="File containing one source per line"), + output_dir: str = typer.Option("raw", help="Output directory"), + output_layout: str = typer.Option("flat", "--output-layout", help="Output layout: flat or mirror"), +): + if (recursive is not None) == (file_list is not None): + typer.echo("Error: specify either -r (directory) or -f (file list), not both", err=True) + raise typer.Exit(code=1) + + if output_layout not in ("flat", "mirror"): + typer.echo("Error: --output-layout must be 'flat' or 'mirror'", err=True) + raise typer.Exit(code=1) + + out = Path(output_dir) + sources: list[tuple[Path, Path | None]] = [] + + if recursive is not None: + root = Path(recursive) + if not root.is_dir(): + typer.echo(f"Error: not a directory: {root}", err=True) + raise typer.Exit(code=1) + files = _collect_files_from_dir(root) + for f in files: + sources.append((f, None)) + + for src_path, temp_path in sources: + try: + convert_single(src_path, out, layout=output_layout, relative_to=root) # type: ignore + finally: + if temp_path: + print_cleanup_hint(temp_path) + else: + assert file_list is not None + flist = Path(file_list) + try: + lines = _collect_files_from_list(flist) + except FileNotFoundError as e: + typer.echo(f"Error: {e}", err=True) + raise typer.Exit(code=1) + + for line in lines: + try: + local_path, temp_path = _resolve_source(line) + except (ValueError, URLError) as e: + typer.echo(f"Error processing '{line}': {e}", err=True) + continue + src = Path(local_path) + if not src.exists(): + typer.echo(f"Error: file not found: {local_path}", err=True) + if temp_path: + print_cleanup_hint(temp_path) + continue + try: + convert_single(src, out, layout=output_layout) + finally: + if temp_path: + print_cleanup_hint(temp_path) +``` + +> **Note:** In the `-r` branch, the loop needs to be inside the `if` block. The code above shows the `-r` loop inside the block. The `-f` loop is in the `else` block. + +Wait, I need to restructure this. Let me rewrite the command properly: + +```python +@app.command() +def bulk_convert( + recursive: str = typer.Option(None, "-r", "--recursive", help="Directory to scan recursively"), + file_list: str = typer.Option(None, "-f", "--file-list", help="File containing one source per line"), + output_dir: str = typer.Option("raw", help="Output directory"), + output_layout: str = typer.Option("flat", "--output-layout", help="Output layout: flat or mirror"), +): + if (recursive is not None) == (file_list is not None): + typer.echo("Error: specify either -r (directory) or -f (file list), not both", err=True) + raise typer.Exit(code=1) + + if output_layout not in ("flat", "mirror"): + typer.echo("Error: --output-layout must be 'flat' or 'mirror'", err=True) + raise typer.Exit(code=1) + + out = Path(output_dir) + + if recursive is not None: + root = Path(recursive) + if not root.is_dir(): + typer.echo(f"Error: not a directory: {root}", err=True) + raise typer.Exit(code=1) + for src_path in _collect_files_from_dir(root): + convert_single(src_path, out, layout=output_layout, relative_to=root) + else: + assert file_list is not None + flist = Path(file_list) + try: + lines = _collect_files_from_list(flist) + except FileNotFoundError as e: + typer.echo(f"Error: {e}", err=True) + raise typer.Exit(code=1) + + for line in lines: + try: + local_path, temp_path = _resolve_source(line) + except (ValueError, URLError) as e: + typer.echo(f"Error processing '{line}': {e}", err=True) + continue + src = Path(local_path) + if not src.exists(): + typer.echo(f"Error: file not found: {local_path}", err=True) + if temp_path: + print_cleanup_hint(temp_path) + continue + try: + convert_single(src, out, layout=output_layout) + finally: + if temp_path: + print_cleanup_hint(temp_path) +``` + +This is cleaner. The `-r` branch doesn't need temp_path handling since all files are local. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `uv run pytest tests/test_cli.py::test_bulk_convert_recursive_dir_flat tests/test_cli.py::test_bulk_convert_recursive_dir_mirror tests/test_cli.py::test_bulk_convert_file_list tests/test_cli.py::test_bulk_convert_file_list_with_url tests/test_cli.py::test_bulk_convert_mutual_exclusive tests/test_cli.py::test_bulk_convert_no_input tests/test_cli.py::test_bulk_convert_help -v` +Expected: All 7 PASS + +- [ ] **Step 5: Run full test suite** + +Run: `uv run pytest -v` +Expected: All tests PASS (coverage >= 85%) + +- [ ] **Step 6: Commit** + +```bash +git add src/kbmate_cli/main.py tests/test_cli.py +git commit -m "feat: add bulk-convert command with -r and -f" +``` + +--- + +### Spec Self-Review + +- All spec requirements covered: `-r` (recursive dir), `-f` (file list), `--output-layout {flat,mirror}` (default flat), extracted `convert_single()`, `convert` still works. +- No placeholders, TBDs, or vague requirements. +- Signatures consistent across tasks. + +--- + +### Execution + +After approval, use subagent-driven-development to execute per-task. diff --git a/pyproject.toml b/pyproject.toml index f2bfad7..85774b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kbmate-cli" -version = "0.2.0" +version = "0.3.0" description = "CLI tools for knowledge base management" readme = "README.md" requires-python = ">=3.12" diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py index dfe1326..ea9279c 100644 --- a/src/kbmate_cli/main.py +++ b/src/kbmate_cli/main.py @@ -175,6 +175,10 @@ def bulk_convert( typer.echo(f"Error: {e}", err=True) raise typer.Exit(code=1) + if output_layout == "mirror": + typer.echo("Error: --output-layout mirror is only supported with -r", err=True) + raise typer.Exit(code=1) + for line in lines: try: local_path, temp_path = _resolve_source(line) diff --git a/tests/test_cli.py b/tests/test_cli.py index 18e03c9..46a3a2b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -247,6 +247,17 @@ def test_bulk_convert_continue_on_error(): assert len(converted) == 1 +def test_bulk_convert_file_list_rejects_mirror(tmp_path): + flist = tmp_path / "sources.txt" + flist.write_text("/path/to/a.pdf\n") + result = runner.invoke(app, [ + "bulk-convert", "-f", str(flist), + "--output-layout", "mirror", + ]) + assert result.exit_code != 0 + assert "mirror is only supported with -r" in result.stderr.lower() + + def test_bulk_convert_help(): result = runner.invoke(app, ["bulk-convert", "--help"]) assert result.exit_code == 0 From 29c35341c8845cd9de01f96f5d6f0908bb446f70 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Wed, 27 May 2026 10:01:12 +0800 Subject: [PATCH 7/7] test: add coverage for error-handling paths in main.py Add 11 tests covering edge cases in CLI error handling: - empty download, unsupported format, non-directory -r, nonexistent file-list, URL resolution errors, temp file cleanup in bulk-convert file_list loop, mirror layout relative_to fallback Coverage: 82% -> 95% --- tests/test_cli.py | 111 +++++++++++++++++++++++++++++++++++++++++++++- uv.lock | 2 +- 2 files changed, 110 insertions(+), 3 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 46a3a2b..d738c1a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,10 +1,11 @@ +import shutil +import tempfile from pathlib import Path from unittest.mock import MagicMock, patch import pytest from typer.testing import CliRunner -from kbmate_cli.main import app, _collect_files_from_dir, _collect_files_from_list -import tempfile +from kbmate_cli.main import app, _collect_files_from_dir, _collect_files_from_list, convert_single runner = CliRunner() @@ -262,3 +263,109 @@ def test_bulk_convert_help(): result = runner.invoke(app, ["bulk-convert", "--help"]) assert result.exit_code == 0 assert "bulk" in result.stdout.lower() or "-r" in result.stdout + + +# ── Error-handling coverage tests ───────────────────────────────────────────── + + +def test_collect_files_from_dir_not_a_directory(tmp_path): + f = tmp_path / "a.txt" + f.write_text("not a dir") + with pytest.raises(NotADirectoryError): + _collect_files_from_dir(f) + + +def test_collect_files_from_list_not_found(tmp_path): + with pytest.raises(FileNotFoundError): + _collect_files_from_list(tmp_path / "nonexistent.txt") + + +def test_convert_single_unsupported_format(tmp_path): + txt = tmp_path / "test.txt" + txt.write_text("hello") + with pytest.raises(ValueError, match="unsupported format"): + convert_single(txt, tmp_path) + + +def test_convert_unsupported_format(tmp_path): + f = tmp_path / "test.txt" + f.write_text("hello") + result = runner.invoke(app, ["convert", str(f)]) + assert result.exit_code != 0 + assert "unsupported format" in result.stderr.lower() + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_convert_single_mirror_relative_to_fails(tmp_path): + sub = tmp_path / "sub" + sub.mkdir() + pdf = tmp_path / "a.pdf" + pdf.write_text("fake") + convert_single(pdf, tmp_path, layout="mirror", relative_to=sub) + assert (tmp_path / "converts" / "a.md").exists() + + +def test_convert_empty_download(tmp_path): + import tempfile as _tf + + tf = _tf.NamedTemporaryFile(delete=False, suffix=".pdf") + tf.write(b"") + tf.close() + p = Path(tf.name) + with patch("kbmate_cli.main.download_to_temp", return_value=p): + result = runner.invoke(app, [ + "convert", "https://example.com/empty.pdf", + ]) + assert result.exit_code != 0 + p.unlink(missing_ok=True) + + +def test_bulk_convert_recursive_not_a_directory(tmp_path): + f = tmp_path / "a.txt" + f.write_text("not a dir") + result = runner.invoke(app, ["bulk-convert", "-r", str(f)]) + assert result.exit_code != 0 + assert "not a directory" in result.stderr.lower() + + +def test_bulk_convert_file_list_not_found(tmp_path): + result = runner.invoke(app, [ + "bulk-convert", "-f", str(tmp_path / "nonexistent.txt"), + ]) + assert result.exit_code != 0 + assert "file not found" in result.stderr.lower() + + +@patch("kbmate_cli.main._resolve_source") +def test_bulk_convert_file_list_url_error(mock_resolve, tmp_path): + mock_resolve.side_effect = ValueError("download failed") + flist = tmp_path / "sources.txt" + flist.write_text("https://example.com/fail.pdf\n") + result = runner.invoke(app, ["bulk-convert", "-f", str(flist)]) + assert result.exit_code == 0 + assert "Error processing" in result.stderr + + +@patch("kbmate_cli.main.print_cleanup_hint") +@patch("kbmate_cli.main._resolve_source") +def test_bulk_convert_file_list_file_not_found_with_temp(mock_resolve, mock_hint, tmp_path): + mock_resolve.return_value = ("/nonexistent/path.pdf", Path("/tmp/cleanup_me.pdf")) + flist = tmp_path / "sources.txt" + flist.write_text("https://example.com/gone.pdf\n") + result = runner.invoke(app, ["bulk-convert", "-f", str(flist)]) + assert result.exit_code == 0 + mock_hint.assert_called_once() + + +@patch("kbmate_cli.main.print_cleanup_hint") +@patch("kbmate_cli.main._resolve_source") +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(side_effect=ValueError("convert failed"))}) +def test_bulk_convert_file_list_convert_error_with_temp(mock_resolve, mock_hint, tmp_path): + local_pdf = tmp_path / "a.pdf" + local_pdf.write_text("fake") + mock_resolve.return_value = (str(local_pdf), Path("/tmp/cleanup_me.pdf")) + flist = tmp_path / "sources.txt" + flist.write_text("https://example.com/a.pdf\n") + result = runner.invoke(app, ["bulk-convert", "-f", str(flist)]) + assert result.exit_code == 0 + mock_hint.assert_called_once() diff --git a/uv.lock b/uv.lock index 8610471..19855c5 100644 --- a/uv.lock +++ b/uv.lock @@ -135,7 +135,7 @@ wheels = [ [[package]] name = "kbmate-cli" -version = "0.2.0" +version = "0.3.0" source = { editable = "." } dependencies = [ { name = "pymupdf4llm" },