Skip to content

Commit 7ade08b

Browse files
refactor: remove redundant modelMaxLength and expose token arrays
1 parent c907ffd commit 7ade08b

File tree

3 files changed

+4
-35
lines changed

3 files changed

+4
-35
lines changed

README.md

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,6 @@ $maxLength = $tokenizer->getConfig('model_max_length'); // 512
 $cleanup = $tokenizer->getConfig('clean_up_tokenization_spaces'); // true
 $custom = $tokenizer->getConfig('unknown_key', 'default'); // 'default'
 
-// Convenience property for model_max_length
-echo $tokenizer->modelMaxLength; // 512
-
 // Get all configuration (pass null or no arguments)
 $allConfig = $tokenizer->getConfig();
 ```
@@ -170,8 +167,6 @@ Common configuration keys:
 - `do_lowercase_and_remove_accent` — Whether to lowercase and strip accents
 - `clean_up_tokenization_spaces` — Whether to clean up spaces during decoding
 
-> **Note:** `model_max_length` is the tokenizer's configured max length, not necessarily the model's actual context window. For most models, these are the same. However, some tokenizers (like Llama 3) set this to an extremely large value. When building applications, you may want to use known context window limits for specific models rather than relying solely on this value.
-
 ## Encoding Text
 
 The `encode()` method tokenizes text and returns an `Encoding` object containing the token IDs, tokens, and type IDs.

src/Tokenizer.php

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,6 @@
 
 readonly class Tokenizer
 {
-    /**
-     * The model's maximum sequence length (convenience accessor for config).
-     */
-    public ?int $modelMaxLength;
     protected DictionarySplitter $addedTokensSplitter;
 
     /**
@@ -45,14 +41,11 @@ public function __construct(
         public PreTokenizerInterface $preTokenizer,
         public PostProcessorInterface $postProcessor,
         public DecoderInterface $decoder,
-        protected array $specialTokens = [],
-        protected array $addedTokens = [],
+        public array $specialTokens = [],
+        public array $addedTokens = [],
         protected array $config = []
     ) {
         $this->addedTokensSplitter = new DictionarySplitter(array_keys($this->addedTokens));
-
-        $maxLength = $this->config['model_max_length'] ?? null;
-        $this->modelMaxLength = null !== $maxLength ? (int) $maxLength : null;
     }
 
     /**

tests/Unit/TokenizerBuilderTest.php

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
         ->build()
     ;
 
-    expect($tokenizer->modelMaxLength)->toBe(512)
+    expect($tokenizer->getConfig('model_max_length'))->toBe(512)
         ->and($tokenizer->getConfig('remove_space'))->toBeTrue()
         ->and($tokenizer->getConfig('clean_up_tokenization_spaces'))->toBeFalse()
         ->and($tokenizer->getConfig('custom_option'))->toBe('custom_value')
@@ -194,23 +194,4 @@ public function getConfig(?string $key = null, mixed $default = null): mixed
     // Should not throw, defaults are used
     $encoding = $tokenizer->encode('HELLO WORLD');
     expect($encoding->ids)->toBeArray();
-});
-
-it('sets modelMaxLength from config', function () {
-    $tokenizer = (new TokenizerBuilder())
-        ->withModel(createMockModel())
-        ->withConfig('model_max_length', 2048)
-        ->build()
-    ;
-
-    expect($tokenizer->modelMaxLength)->toBe(2048);
-});
-
-it('has null modelMaxLength when not configured', function () {
-    $tokenizer = (new TokenizerBuilder())
-        ->withModel(createMockModel())
-        ->build()
-    ;
-
-    expect($tokenizer->modelMaxLength)->toBeNull();
-});
+});

0 commit comments

Comments
 (0)