Skip to content

Commit 7ea5d76

Browse files
committed
Fix #13882: Decrement vocab.length when memory_zone clears transient lexemes
1 parent 297938e commit 7ea5d76

File tree

2 files changed

+73
-1
lines changed

2 files changed

+73
-1
lines changed

spacy/tests/vocab_vectors/test_memory_zone.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from spacy.vocab import Vocab
2+
import pytest
23

34

45
def test_memory_zone_no_insertion():
@@ -57,3 +58,68 @@ def test_memory_zone_exception_cleanup():
5758
# Vocab should be fully usable for new operations
5859
lex = vocab["cat"]
5960
assert lex.text == "cat"
61+
62+
63+
@pytest.mark.issue(13882)
def test_memory_zone_vocab_length_decremented():
    """Check that leaving a memory_zone brings vocab.length back down.

    Regression test: the length counter went up whenever a lexeme was added
    but was never reduced when memory_zone cleared its transient lexemes, so
    len(vocab) kept growing even though the lexemes themselves were removed.
    """
    vocab = Vocab()

    # Lexemes created outside any memory zone are permanent.
    for permanent_word in ("hello", "world"):
        _ = vocab[permanent_word]
    initial_len = len(vocab)
    assert initial_len == 2

    # Lexemes created inside the zone are transient.
    with vocab.memory_zone():
        for transient_word in ("transient1", "transient2", "transient3"):
            _ = vocab[transient_word]
        inside_len = len(vocab)
        assert inside_len == 5  # 2 permanent + 3 transient

    # Once the zone has exited, the counter must match the starting value.
    after_zone_len = len(vocab)
    assert after_zone_len == initial_len, (
        f"vocab.length should be {initial_len} after memory_zone, "
        f"but got {after_zone_len}"
    )

    # Cross-check the counter against what iteration actually yields.
    actual_count = len(list(vocab))
    assert actual_count == initial_len
    assert after_zone_len == actual_count
99+
100+
101+
@pytest.mark.issue(13882)
def test_memory_zone_multiple_cycles():
    """Check that vocab.length stays correct over repeated memory_zone cycles."""
    vocab = Vocab()
    _ = vocab["permanent"]
    base_len = len(vocab)
    assert base_len == 1

    # Enter and leave a zone several times, adding fresh lexemes each time.
    for i in range(3):
        with vocab.memory_zone():
            for j in range(5):
                _ = vocab[f"temp_{i}_{j}"]

        # Every completed cycle must leave the counter at its base value.
        assert len(vocab) == base_len, (
            f"After cycle {i+1}, vocab.length should be {base_len}, "
            f"but got {len(vocab)}"
        )

    # Finally, the counter, the iterated count and the base must all agree.
    final_len = len(vocab)
    actual_count = len(list(vocab))
    assert final_len == actual_count == base_len

spacy/vocab.pyx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,9 +251,15 @@ cdef class Vocab:
251251

252252
def _clear_transient_orths(self):
    """Remove transient lexemes from the index and reduce the length counter
    by the number of entries actually removed (generally called at the end
    of the memory zone).
    """
    cdef hash_t orth
    cdef int removed = 0

    for orth in self._transient_orths:
        # Only orths that still have an entry in the index are cleared and
        # counted; anything else is skipped so the counter is not
        # over-decremented.
        # NOTE(review): presumably this also guards against an orth being
        # listed more than once -- confirm against the PreshMap semantics.
        if self._by_orth.get(orth) is NULL:
            continue
        map_clear(self._by_orth.c_map, orth)
        removed += 1
    self._transient_orths.clear()
    # Bring the length counter back in line with the index contents.
    self.length -= removed
257263

258264
def __contains__(self, key):
259265
"""Check whether the string or int key has an entry in the vocabulary.

0 commit comments

Comments
 (0)