Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions spacy/tests/vocab_vectors/test_memory_zone.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from spacy.vocab import Vocab
import pytest


def test_memory_zone_no_insertion():
Expand Down Expand Up @@ -57,3 +58,68 @@ def test_memory_zone_exception_cleanup():
# Vocab should be fully usable for new operations
lex = vocab["cat"]
assert lex.text == "cat"


@pytest.mark.issue(13882)
def test_memory_zone_vocab_length_decremented():
    """Check that len(vocab) shrinks back once a memory zone is left.

    Regression test: the length counter used to be bumped whenever a lexeme
    was added but was never reduced when the memory zone dropped its
    transient lexemes, so len(vocab) kept growing even though the lexemes
    themselves were gone.
    """
    vocab = Vocab()

    # Two lexemes created outside any memory zone are permanent.
    for word in ("hello", "world"):
        vocab[word]
    initial_len = len(vocab)
    assert initial_len == 2

    # Lexemes created inside the zone are transient.
    with vocab.memory_zone():
        for word in ("transient1", "transient2", "transient3"):
            vocab[word]
        inside_len = len(vocab)
        assert inside_len == 5  # 2 permanent + 3 transient

    # Leaving the zone must bring the reported length back to the baseline.
    after_zone_len = len(vocab)
    assert after_zone_len == initial_len, (
        f"vocab.length should be {initial_len} after memory_zone, "
        f"but got {after_zone_len}"
    )

    # Cross-check the counter against what iteration actually yields.
    actual_count = len(list(vocab))
    assert actual_count == initial_len
    assert after_zone_len == actual_count


@pytest.mark.issue(13882)
def test_memory_zone_multiple_cycles():
    """Check that len(vocab) stays stable over repeated memory_zone
    enter/exit cycles."""
    vocab = Vocab()
    vocab["permanent"]
    base_len = len(vocab)
    assert base_len == 1

    # Run several zones back to back, each adding five transient lexemes.
    for i in range(3):
        with vocab.memory_zone():
            for j in range(5):
                vocab[f"temp_{i}_{j}"]

        # Once the zone is closed, only the permanent lexeme should count.
        assert len(vocab) == base_len, (
            f"After cycle {i+1}, vocab.length should be {base_len}, "
            f"but got {len(vocab)}"
        )

    # The counter, the iterated contents and the baseline must all agree.
    final_len = len(vocab)
    actual_count = len(list(vocab))
    assert final_len == actual_count == base_len
8 changes: 7 additions & 1 deletion spacy/vocab.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,15 @@ cdef class Vocab:

def _clear_transient_orths(self):
    """Remove transient lexemes from the index (generally at the end of the memory zone).

    Every orth recorded in ``self._transient_orths`` that is still present
    in ``self._by_orth`` is cleared from the map, and ``self.length`` is
    reduced by the number of entries actually cleared. The set of
    transient orths is emptied afterwards.
    """
    cdef hash_t orth
    cdef int num_cleared = 0

    for orth in self._transient_orths:
        # The presence check must run BEFORE the entry is cleared: clearing
        # first would make the lookup miss and leave the counter untouched.
        # Orths that are not in the map are skipped and not counted.
        if self._by_orth.get(orth) is not NULL:
            map_clear(self._by_orth.c_map, orth)
            num_cleared += 1
    self._transient_orths.clear()
    # Keep the length counter in step with the entries that were removed.
    self.length -= num_cleared

def __contains__(self, key):
"""Check whether the string or int key has an entry in the vocabulary.
Expand Down