|
1 | 1 | from spacy.vocab import Vocab |
| 2 | +import pytest |
2 | 3 |
|
3 | 4 |
|
4 | 5 | def test_memory_zone_no_insertion(): |
@@ -57,3 +58,68 @@ def test_memory_zone_exception_cleanup(): |
57 | 58 | # Vocab should be fully usable for new operations |
58 | 59 | lex = vocab["cat"] |
59 | 60 | assert lex.text == "cat" |
| 61 | + |
| 62 | + |
@pytest.mark.issue(13882)
def test_memory_zone_vocab_length_decremented():
    """Check that len(vocab) drops back once a memory_zone is exited.

    Regression test: the length counter used to be incremented whenever a
    lexeme was added, but it was never decremented when memory_zone cleared
    its transient lexemes, so len(vocab) kept growing even though the
    lexemes themselves had been removed.
    """
    vocab = Vocab()

    # Lexemes looked up outside of a memory_zone are the permanent ones.
    for word in ("hello", "world"):
        _ = vocab[word]
    initial_len = len(vocab)
    assert initial_len == 2

    # Lexemes looked up inside the zone are transient.
    with vocab.memory_zone():
        for word in ("transient1", "transient2", "transient3"):
            _ = vocab[word]
        # 2 permanent + 3 transient
        assert len(vocab) == 5

    # Leaving the zone must bring the length back to its starting value.
    after_zone_len = len(vocab)
    assert after_zone_len == initial_len, (
        f"vocab.length should be {initial_len} after memory_zone, "
        f"but got {after_zone_len}"
    )

    # Cross-check the counter against the number of lexemes iteration yields.
    remaining = len(list(vocab))
    assert remaining == initial_len
    assert after_zone_len == remaining
| 99 | + |
| 100 | + |
@pytest.mark.issue(13882)
def test_memory_zone_multiple_cycles():
    """Check that len(vocab) stays correct over repeated memory_zone cycles."""
    vocab = Vocab()
    _ = vocab["permanent"]
    base_len = len(vocab)
    assert base_len == 1

    # Enter and leave a memory_zone several times, adding fresh
    # transient lexemes on every pass.
    for i in range(3):
        with vocab.memory_zone():
            for j in range(5):
                _ = vocab[f"temp_{i}_{j}"]

        # The length must fall back to the baseline after every cycle.
        assert len(vocab) == base_len, (
            f"After cycle {i+1}, vocab.length should be {base_len}, "
            f"but got {len(vocab)}"
        )

    # Final check: the counter, the iterated count and the baseline all agree.
    final_len = len(vocab)
    iterated = len(list(vocab))
    assert final_len == iterated
    assert iterated == base_len
0 commit comments