Skip to content

Commit 637bca7

Browse files
feat: add FixedLengthPreTokenizer for fixed-length tokenization
1 parent e5db652 commit 637bca7

File tree

2 files changed

+42
-0
lines changed

2 files changed

+42
-0
lines changed

src/Factories/PreTokenizerFactory.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use Codewithkyrian\Tokenizers\PreTokenizers\BertPreTokenizer;
99
use Codewithkyrian\Tokenizers\PreTokenizers\ByteLevelPreTokenizer;
1010
use Codewithkyrian\Tokenizers\PreTokenizers\DigitsPreTokenizer;
11+
use Codewithkyrian\Tokenizers\PreTokenizers\FixedLengthPreTokenizer;
1112
use Codewithkyrian\Tokenizers\PreTokenizers\MetaspacePreTokenizer;
1213
use Codewithkyrian\Tokenizers\PreTokenizers\PreTokenizerSequence;
1314
use Codewithkyrian\Tokenizers\PreTokenizers\PunctuationPreTokenizer;
@@ -37,6 +38,9 @@ public static function create(array $config): PreTokenizerInterface
3738
'Digits' => new DigitsPreTokenizer(
3839
individualDigits: $config['individual_digits'] ?? false
3940
),
41+
'FixedLength' => new FixedLengthPreTokenizer(
42+
length: $config['length']
43+
),
4044
'Metaspace' => new MetaspacePreTokenizer(
4145
replacement: $config['replacement'] ?? ' ',
4246
addPrefixSpace: $config['add_prefix_space'] ?? true,
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace Codewithkyrian\Tokenizers\PreTokenizers;
6+
7+
use Codewithkyrian\Tokenizers\Contracts\PreTokenizerInterface;
8+
9+
/**
 * Splits text into consecutive chunks of a fixed number of characters.
 *
 * Characters are counted with the mbstring functions (not bytes), so a
 * multi-byte character is never cut in half. The final chunk of a string
 * may be shorter than the configured length.
 */
class FixedLengthPreTokenizer implements PreTokenizerInterface
{
    /**
     * @param int $length Number of characters per chunk; must be at least 1.
     *
     * @throws \InvalidArgumentException If $length is smaller than 1.
     */
    public function __construct(
        protected int $length
    ) {
        // A zero or negative step can never advance through the text, so
        // reject it up front instead of looping forever at tokenization time.
        if ($length < 1) {
            throw new \InvalidArgumentException(
                "FixedLengthPreTokenizer length must be at least 1, got {$length}."
            );
        }
    }

    /**
     * Splits the input into chunks of at most `length` characters.
     *
     * @param array|string $text    A string, or an array of strings (arrays may
     *                              be nested; every element is split on its own
     *                              and the results are concatenated in order).
     * @param array        $options Not used by this pre-tokenizer itself; only
     *                              forwarded to the recursive calls.
     *
     * @return string[] Flat list of chunks. An empty string yields no chunks.
     */
    public function preTokenize(array|string $text, array $options = []): array
    {
        if (\is_array($text)) {
            $parts = [];
            foreach ($text as $item) {
                $parts[] = $this->preTokenize($item, $options);
            }

            // Merge once at the end rather than re-copying the accumulated
            // result on every iteration.
            return array_merge(...$parts);
        }

        // mb_str_split('') returns [''] before PHP 8.2; guard explicitly so an
        // empty string always produces an empty token list.
        if ($text === '') {
            return [];
        }

        return mb_str_split($text, $this->length);
    }
}

0 commit comments

Comments
 (0)