Skip to content

Commit 40af3b3

Browse files
Fix UnicodeDecodeError when reading packed-refs with non-UTF8 characters
Fixes #2064.

The packed-refs file can contain ref names that are not valid UTF-8 (e.g., Latin-1 encoded tag names created by older Git versions or on non-UTF-8 systems). Previously, opening the file with encoding='UTF-8' would raise UnicodeDecodeError.

Changes:
- Add errors='surrogateescape' to the open() call in _iter_packed_refs(). This allows reading files with arbitrary byte sequences while still treating valid UTF-8 as text.
- Add a test that verifies non-UTF-8 packed-refs can be read successfully.

The 'surrogateescape' error handler is the standard Python approach for handling potentially non-UTF-8 data in filesystem operations, as it preserves the original bytes in a reversible way.
1 parent eecc28d commit 40af3b3

File tree

2 files changed

+40
-1
lines changed

2 files changed

+40
-1
lines changed

git/refs/symbolic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def _iter_packed_refs(cls, repo: "Repo") -> Iterator[Tuple[str, str]]:
123123
The packed refs file will be kept open as long as we iterate.
124124
"""
125125
try:
126-
with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8") as fp:
126+
with open(cls._get_packed_refs_path(repo), "rt", encoding="UTF-8", errors="surrogateescape") as fp:
127127
for line in fp:
128128
line = line.strip()
129129
if not line:

test/test_refs.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,45 @@ def test_tag_message(self, rw_repo):
629629
)
630630
assert tag_ref.tag.message == "test2"
631631

632+
@with_rw_repo("0.1.6")
def test_packed_refs_with_non_utf8_encoding(self, rw_repo):
    """Test that packed-refs files with non-UTF8 encoded ref names can be read.

    This addresses issue #2064 where GitPython would fail with UnicodeDecodeError
    when reading packed-refs files containing non-UTF8 characters (e.g., Latin-1
    encoded tag names).

    The test packs the refs of a scratch repository, appends a ref whose name
    contains a byte that is invalid as UTF-8, and then checks that the refs can
    still be listed and that the appended ref is yielded with its undecodable
    byte preserved as a surrogate escape.
    """
    # Create a tag with an ASCII name first, then pack all refs so that a
    # packed-refs file exists on disk.
    TagReference.create(rw_repo, "normal-tag")
    rw_repo.git.pack_refs(all=True)

    packed_refs_path = osp.join(rw_repo.common_dir, "packed-refs")

    # Fake ref with a Latin-1 encoded name: "ñ" is the single byte 0xF1 in
    # Latin-1, which is not a valid UTF-8 sequence on its own. It points at a
    # SHA that exists in the repo.
    head_sha = rw_repo.head.commit.hexsha
    non_utf8_ref = b"refs/tags/caf\xf1"
    non_utf8_line = b"\n" + head_sha.encode("ascii") + b" " + non_utf8_ref + b"\n"

    # Appending in binary mode leaves the existing content untouched and
    # writes the raw bytes without any text-layer encoding.
    with open(packed_refs_path, "ab") as f:
        f.write(non_utf8_line)

    # Listing tags must NOT raise UnicodeDecodeError, and the regular tag
    # must still be found.
    tag_names = [tag.name for tag in rw_repo.tags]
    assert "normal-tag" in tag_names

    # Iterating the packed refs directly must also succeed.
    packed_refs = list(SymbolicReference._iter_packed_refs(rw_repo))
    assert len(packed_refs) >= 2  # At least normal-tag and the non-UTF8 tag

    # The non-UTF8 ref must actually be yielded rather than silently dropped.
    # With errors="surrogateescape" the byte 0xF1 is decoded to the lone
    # surrogate U+DCF1, so the expected name is derived the same way here.
    expected_ref = non_utf8_ref.decode("utf-8", errors="surrogateescape")
    assert any(expected_ref in entry for entry in packed_refs)
670+
632671
def test_dereference_recursive(self):
633672
# For now, just test the HEAD.
634673
assert SymbolicReference.dereference_recursive(self.rorepo, "HEAD")

0 commit comments

Comments
 (0)