diff --git a/spacy/tests/vocab_vectors/test_memory_zone.py b/spacy/tests/vocab_vectors/test_memory_zone.py
index f718afa2f6..d2f9ee1d9d 100644
--- a/spacy/tests/vocab_vectors/test_memory_zone.py
+++ b/spacy/tests/vocab_vectors/test_memory_zone.py
@@ -1,4 +1,5 @@
 from spacy.vocab import Vocab
+import pytest
 
 
 def test_memory_zone_no_insertion():
@@ -57,3 +58,68 @@ def test_memory_zone_exception_cleanup():
     # Vocab should be fully usable for new operations
     lex = vocab["cat"]
     assert lex.text == "cat"
+
+
+@pytest.mark.issue(13882)
+def test_memory_zone_vocab_length_decremented():
+    """Test that vocab.length is correctly decremented when memory_zone
+    clears transient lexemes.
+
+    Bug: The length counter was incremented when adding lexemes but never
+    decremented when memory_zone cleared transient lexemes, causing
+    len(vocab) to grow continuously even though lexemes were properly removed.
+    """
+    vocab = Vocab()
+
+    # Add some permanent lexemes
+    vocab["hello"]
+    vocab["world"]
+    initial_len = len(vocab)
+    assert initial_len == 2
+
+    # Add transient lexemes inside memory_zone
+    with vocab.memory_zone():
+        vocab["transient1"]
+        vocab["transient2"]
+        vocab["transient3"]
+        inside_len = len(vocab)
+        assert inside_len == 5  # 2 permanent + 3 transient
+
+    # After exiting memory_zone, length should return to initial
+    after_zone_len = len(vocab)
+    assert after_zone_len == initial_len, (
+        f"vocab.length should be {initial_len} after memory_zone, "
+        f"but got {after_zone_len}"
+    )
+
+    # Verify by iteration that only permanent lexemes remain
+    actual_count = sum(1 for _ in vocab)
+    assert actual_count == initial_len
+    assert after_zone_len == actual_count
+
+
+@pytest.mark.issue(13882)
+def test_memory_zone_multiple_cycles():
+    """Test that vocab.length is correctly maintained across multiple
+    memory_zone cycles."""
+    vocab = Vocab()
+    vocab["permanent"]
+    base_len = len(vocab)
+    assert base_len == 1
+
+    # Multiple memory_zone cycles
+    for i in range(3):
+        with vocab.memory_zone():
+            for j in range(5):
+                vocab[f"temp_{i}_{j}"]
+
+        # Length should return to base after each cycle
+        assert len(vocab) == base_len, (
+            f"After cycle {i+1}, vocab.length should be {base_len}, "
+            f"but got {len(vocab)}"
+        )
+
+    # Final verification
+    final_len = len(vocab)
+    actual_count = sum(1 for _ in vocab)
+    assert final_len == actual_count == base_len
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4bf80c85d8..729945006d 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -251,9 +251,16 @@ cdef class Vocab:
 
     def _clear_transient_orths(self):
         """Remove transient lexemes from the index (generally at the end of the memory zone)"""
+        cdef hash_t orth
+        cdef int num_cleared = 0
+
         for orth in self._transient_orths:
-            map_clear(self._by_orth.c_map, orth)
+            # Only clear (and count) orths still present in the index, so length drops once per entry actually removed.
+            if self._by_orth.get(orth) is not NULL:
+                map_clear(self._by_orth.c_map, orth)
+                num_cleared += 1
         self._transient_orths.clear()
+        self.length -= num_cleared
 
     def __contains__(self, key):
         """Check whether the string or int key has an entry in the vocabulary.