From 3bb798e2b4e2700a0f41da9233ab1c7eb635f7a3 Mon Sep 17 00:00:00 2001 From: mikemolinet Date: Mon, 13 Apr 2026 15:37:27 -0700 Subject: [PATCH 1/2] fix: harden post-commit hook against non-UTF8 encoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-commit hook's subprocess output .decode() calls raise UnicodeDecodeError if git returns non-UTF8 bytes — e.g., when i18n.commitEncoding is set to a non-UTF8 charset, when the repo has legacy SVN/CVS-imported commit metadata, or when file paths contain non-UTF8 bytes. The outer try/except only catches CalledProcessError and FileNotFoundError, so UnicodeDecodeError escapes and the hook prints a traceback, silently skipping BrainLayer indexing for that commit. Pass errors="replace" to the four .decode() calls so invalid bytes become U+FFFD and the hook completes normally. Matches the existing errors="replace" pattern in src/brainlayer/pipeline/extract_markdown.py. Adds tests/test_post_commit_hook.py with three tests: - non-UTF8 commit message (discriminating; fails without the fix) - non-UTF8 file path - valid UTF-8 (sanity; ensures no behavior change on the happy path) --- hooks/post-commit.py | 16 +++- tests/test_post_commit_hook.py | 141 +++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 4 deletions(-) create mode 100644 tests/test_post_commit_hook.py diff --git a/hooks/post-commit.py b/hooks/post-commit.py index 49f8a0ef..60e49f2b 100755 --- a/hooks/post-commit.py +++ b/hooks/post-commit.py @@ -14,16 +14,22 @@ def main(): try: - commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL).decode().strip() + commit_hash = ( + subprocess.check_output(["git", "rev-parse", "HEAD"], stderr=subprocess.DEVNULL) + .decode("utf-8", errors="replace") + .strip() + ) commit_msg = ( - subprocess.check_output(["git", "log", "-1", "--pretty=%B"], stderr=subprocess.DEVNULL).decode().strip() + subprocess.check_output(["git", "log", 
"-1", "--pretty=%B"], stderr=subprocess.DEVNULL) + .decode("utf-8", errors="replace") + .strip() ) files_changed = ( subprocess.check_output( ["git", "diff-tree", "--no-commit-id", "-r", "--name-only", "HEAD"], stderr=subprocess.DEVNULL, ) - .decode() + .decode("utf-8", errors="replace") .strip() .split("\n") ) @@ -34,7 +40,9 @@ def main(): # Detect project from repo root directory name try: repo_root = ( - subprocess.check_output(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.DEVNULL).decode().strip() + subprocess.check_output(["git", "rev-parse", "--show-toplevel"], stderr=subprocess.DEVNULL) + .decode("utf-8", errors="replace") + .strip() ) project = os.path.basename(repo_root) except (subprocess.CalledProcessError, FileNotFoundError): diff --git a/tests/test_post_commit_hook.py b/tests/test_post_commit_hook.py new file mode 100644 index 00000000..47f9960d --- /dev/null +++ b/tests/test_post_commit_hook.py @@ -0,0 +1,141 @@ +"""Tests for hooks/post-commit.py — ensures the hook survives non-UTF8 +subprocess output from git (commit messages with non-UTF8 encoding, +legacy-imported commit metadata, or non-UTF8 file paths). +""" + +import importlib.util +import sys +import types +from pathlib import Path + +import pytest + +HOOKS_DIR = Path(__file__).parent.parent / "hooks" + + +def _load_hook(): + spec = importlib.util.spec_from_file_location( + "post_commit_hook", + HOOKS_DIR / "post-commit.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +@pytest.fixture +def captured_calls(monkeypatch): + """Stub out brainlayer.* modules so the hook can import them without + loading the real embedding model, and capture store_memory() calls. 
+ """ + calls = [] + + store_mod = types.ModuleType("brainlayer.store") + + def fake_store_memory(**kwargs): + calls.append(kwargs) + + store_mod.store_memory = fake_store_memory + + vs_mod = types.ModuleType("brainlayer.vector_store") + + class _FakeStore: + def __init__(self, *args, **kwargs): + pass + + def close(self): + pass + + vs_mod.VectorStore = _FakeStore + + embed_mod = types.ModuleType("brainlayer.embeddings") + + class _FakeModel: + def embed_query(self, text): + return [0.0] * 8 + + embed_mod.get_embedding_model = lambda: _FakeModel() + + paths_mod = types.ModuleType("brainlayer.paths") + paths_mod.DEFAULT_DB_PATH = "/tmp/fake-brainlayer.db" + + monkeypatch.setitem(sys.modules, "brainlayer.store", store_mod) + monkeypatch.setitem(sys.modules, "brainlayer.vector_store", vs_mod) + monkeypatch.setitem(sys.modules, "brainlayer.embeddings", embed_mod) + monkeypatch.setitem(sys.modules, "brainlayer.paths", paths_mod) + + return calls + + +def _make_fake_check_output( + commit_msg_bytes, + file_list_bytes=b"src/a.py\nsrc/b.py\n", + toplevel_bytes=b"/tmp/repo\n", +): + def fake(cmd, **kwargs): + if "rev-parse" in cmd and "HEAD" in cmd: + return b"deadbeefcafe1234\n" + if "log" in cmd and "--pretty=%B" in cmd: + return commit_msg_bytes + if "diff-tree" in cmd: + return file_list_bytes + if "--show-toplevel" in cmd: + return toplevel_bytes + raise AssertionError(f"unexpected subprocess call: {cmd}") + + return fake + + +def test_post_commit_handles_non_utf8_commit_message(captured_calls, monkeypatch): + """Discriminating test: non-UTF8 bytes in commit message must not crash + the hook. Fails before the fix (UnicodeDecodeError escapes the outer + try/except), passes after. 
+ """ + hook_mod = _load_hook() + monkeypatch.setattr( + hook_mod.subprocess, + "check_output", + _make_fake_check_output(commit_msg_bytes=b"Latin-1 commit \xff\xfe body\n"), + ) + + hook_mod.main() # must not raise + + assert len(captured_calls) == 1 + assert "\ufffd" in captured_calls[0]["content"] + + +def test_post_commit_handles_non_utf8_file_path(captured_calls, monkeypatch): + hook_mod = _load_hook() + monkeypatch.setattr( + hook_mod.subprocess, + "check_output", + _make_fake_check_output( + commit_msg_bytes=b"fix: something\n", + file_list_bytes=b"src/\xff\xfe.py\nsrc/b.py\n", + ), + ) + + hook_mod.main() # must not raise + + assert len(captured_calls) == 1 + # U+FFFD must appear in the stored content; only the file path is non-UTF8 here + assert "\ufffd" in captured_calls[0]["content"] + + +def test_post_commit_preserves_valid_utf8(captured_calls, monkeypatch): + """Sanity: valid UTF-8 (including non-ASCII) flows through unchanged.""" + hook_mod = _load_hook() + monkeypatch.setattr( + hook_mod.subprocess, + "check_output", + _make_fake_check_output( + commit_msg_bytes="fix: 日本語 commit\n".encode("utf-8"), + ), + ) + + hook_mod.main() + + assert len(captured_calls) == 1 + content = captured_calls[0]["content"] + assert "\ufffd" not in content + assert "日本語" in content From f7c11c07221d459b9a6ba15b0c5976b64bc959be Mon Sep 17 00:00:00 2001 From: mikemolinet Date: Mon, 13 Apr 2026 15:53:50 -0700 Subject: [PATCH 2/2] test: cover non-UTF8 repo toplevel path in post-commit hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds test_post_commit_handles_non_utf8_repo_toplevel to exercise the fourth decode call in hooks/post-commit.py — the `git rev-parse --show-toplevel` path that feeds os.path.basename() for the project kwarg. Completes coverage of all four fixed decode sites. Addresses CodeRabbit review on #241. 
--- tests/test_post_commit_hook.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/test_post_commit_hook.py b/tests/test_post_commit_hook.py index 47f9960d..a2933b62 100644 --- a/tests/test_post_commit_hook.py +++ b/tests/test_post_commit_hook.py @@ -122,6 +122,28 @@ def test_post_commit_handles_non_utf8_file_path(captured_calls, monkeypatch): assert "\ufffd" in captured_calls[0]["content"] +def test_post_commit_handles_non_utf8_repo_toplevel(captured_calls, monkeypatch): + """Non-UTF8 bytes returned by `git rev-parse --show-toplevel` must not + crash the hook. Exercises the fourth decode call in the second try block. + """ + hook_mod = _load_hook() + monkeypatch.setattr( + hook_mod.subprocess, + "check_output", + _make_fake_check_output( + commit_msg_bytes=b"fix: something\n", + toplevel_bytes=b"/tmp/repo-\xff\xfe\n", + ), + ) + + hook_mod.main() # must not raise + + assert len(captured_calls) == 1 + # The toplevel path decode feeds os.path.basename(...) which becomes the + # `project` kwarg passed to store_memory. + assert "\ufffd" in captured_calls[0]["project"] + + def test_post_commit_preserves_valid_utf8(captured_calls, monkeypatch): """Sanity: valid UTF-8 (including non-ASCII) flows through unchanged.""" hook_mod = _load_hook()