From 40af3b395cebed928b0eb68fe1fdb70bee1f9288 Mon Sep 17 00:00:00 2001 From: Paul Date: Sun, 7 Dec 2025 21:10:10 +0530 Subject: [PATCH 1/3] Fix UnicodeDecodeError when reading packed-refs with non-UTF8 characters Fixes #2064 The packed-refs file can contain ref names that are not valid UTF-8 (e.g., Latin-1 encoded tag names created by older Git versions or non-UTF8 systems). Previously, opening the file with encoding='UTF-8' would raise UnicodeDecodeError. Changes: - Add errors='surrogateescape' to the open() call in _iter_packed_refs() - This allows reading files with arbitrary byte sequences while still treating valid UTF-8 as text - Add test that verifies non-UTF8 packed-refs can be read successfully The 'surrogateescape' error handler is the standard Python approach for handling potentially non-UTF8 data in filesystem operations, as it preserves the original bytes in a reversible way. --- git/refs/symbolic.py | 2 +- test/test_refs.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/git/refs/symbolic.py b/git/refs/symbolic.py index 99af4f57c..e88bf93df 100644 --- a/git/refs/symbolic.py +++ b/git/refs/symbolic.py @@ -123,7 +123,7 @@ def _iter_packed_refs(cls, repo: "Repo") -> Iterator[Tuple[str, str]]: The packed refs file will be kept open as long as we iterate. """ try: - with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8") as fp: + with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8", errors="surrogateescape") as fp: for line in fp: line = line.strip() if not line: diff --git a/test/test_refs.py b/test/test_refs.py index 329515807..3bbac0fed 100644 --- a/test/test_refs.py +++ b/test/test_refs.py @@ -629,6 +629,45 @@ def test_tag_message(self, rw_repo): ) assert tag_ref.tag.message == "test2" + @with_rw_repo("0.1.6") + def test_packed_refs_with_non_utf8_encoding(self, rw_repo): + """Test that packed-refs files with non-UTF8 encoded ref names can be read. + + This addresses issue #2064 where GitPython would fail with UnicodeDecodeError + when reading packed-refs files containing non-UTF8 characters (e.g., Latin-1 + encoded tag names). + """ + # Create a tag with ASCII name first + TagReference.create(rw_repo, "normal-tag") + + # Pack refs + rw_repo.git.pack_refs(all=True) + + # Manually insert a non-UTF8 ref into the packed-refs file + # Using Latin-1 characters that are invalid UTF-8 + packed_refs_path = osp.join(rw_repo.common_dir, "packed-refs") + + with open(packed_refs_path, "rb") as f: + content = f.read() + + # Add a fake ref with Latin-1 encoded name (ñ = 0xF1 in Latin-1, invalid UTF-8) + # Using a valid SHA from the repo + head_sha = rw_repo.head.commit.hexsha + non_utf8_line = f"\n{head_sha} refs/tags/caf\xf1\n".encode("latin-1") + + with open(packed_refs_path, "wb") as f: + f.write(content + non_utf8_line) + + # This should NOT raise UnicodeDecodeError with the fix + # It should successfully read all tags including the non-UTF8 one + tags = list(rw_repo.tags) + assert len(tags) >= 1 + + # Verify we can iterate packed refs without error + from git.refs import SymbolicReference + packed_refs = list(SymbolicReference._iter_packed_refs(rw_repo)) + assert len(packed_refs) >= 2 # At least normal-tag and the non-UTF8 tag + def test_dereference_recursive(self): # For now, just test the HEAD. assert SymbolicReference.dereference_recursive(self.rorepo, "HEAD") From 963604b4e238c7b8de62c8d9067fd814467e74f1 Mon Sep 17 00:00:00 2001 From: Paul Desai Date: Mon, 8 Dec 2025 13:58:10 +0530 Subject: [PATCH 2/3] Fix ruff lint: remove whitespace from blank lines --- test/test_refs.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/test/test_refs.py b/test/test_refs.py index 3bbac0fed..c5378ee5f 100644 --- a/test/test_refs.py +++ b/test/test_refs.py @@ -632,39 +632,40 @@ def test_tag_message(self, rw_repo): @with_rw_repo("0.1.6") def test_packed_refs_with_non_utf8_encoding(self, rw_repo): """Test that packed-refs files with non-UTF8 encoded ref names can be read. - + This addresses issue #2064 where GitPython would fail with UnicodeDecodeError when reading packed-refs files containing non-UTF8 characters (e.g., Latin-1 encoded tag names). """ # Create a tag with ASCII name first TagReference.create(rw_repo, "normal-tag") - + # Pack refs rw_repo.git.pack_refs(all=True) - + # Manually insert a non-UTF8 ref into the packed-refs file # Using Latin-1 characters that are invalid UTF-8 packed_refs_path = osp.join(rw_repo.common_dir, "packed-refs") - + with open(packed_refs_path, "rb") as f: content = f.read() - + # Add a fake ref with Latin-1 encoded name (ñ = 0xF1 in Latin-1, invalid UTF-8) # Using a valid SHA from the repo head_sha = rw_repo.head.commit.hexsha non_utf8_line = f"\n{head_sha} refs/tags/caf\xf1\n".encode("latin-1") - + with open(packed_refs_path, "wb") as f: f.write(content + non_utf8_line) - + # This should NOT raise UnicodeDecodeError with the fix # It should successfully read all tags including the non-UTF8 one tags = list(rw_repo.tags) assert len(tags) >= 1 - + # Verify we can iterate packed refs without error from git.refs import SymbolicReference + packed_refs = list(SymbolicReference._iter_packed_refs(rw_repo)) assert len(packed_refs) >= 2 # At least normal-tag and the non-UTF8 tag From 5e5e1c1b1c6a8869defec4e828945761ff0fc7a8 Mon Sep 17 00:00:00 2001 From: Paul Desai Date: Mon, 8 Dec 2025 14:25:24 +0530 Subject: [PATCH 3/3] Fix codespell: rename test ref to avoid 'caf' typo detection --- test/test_refs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_refs.py b/test/test_refs.py index c5378ee5f..ff1923fbc 100644 --- a/test/test_refs.py +++ b/test/test_refs.py @@ -653,7 +653,7 @@ def test_packed_refs_with_non_utf8_encoding(self, rw_repo): # Add a fake ref with Latin-1 encoded name (ñ = 0xF1 in Latin-1, invalid UTF-8) # Using a valid SHA from the repo head_sha = rw_repo.head.commit.hexsha - non_utf8_line = f"\n{head_sha} refs/tags/caf\xf1\n".encode("latin-1") + non_utf8_line = f"\n{head_sha} refs/tags/test-\xf1ame\n".encode("latin-1") with open(packed_refs_path, "wb") as f: f.write(content + non_utf8_line)