File tree Expand file tree Collapse file tree 2 files changed +42
-0
lines changed
Expand file tree Collapse file tree 2 files changed +42
-0
lines changed Original file line number Diff line number Diff line change 88use Codewithkyrian \Tokenizers \PreTokenizers \BertPreTokenizer ;
99use Codewithkyrian \Tokenizers \PreTokenizers \ByteLevelPreTokenizer ;
1010use Codewithkyrian \Tokenizers \PreTokenizers \DigitsPreTokenizer ;
11+ use Codewithkyrian \Tokenizers \PreTokenizers \FixedLengthPreTokenizer ;
1112use Codewithkyrian \Tokenizers \PreTokenizers \MetaspacePreTokenizer ;
1213use Codewithkyrian \Tokenizers \PreTokenizers \PreTokenizerSequence ;
1314use Codewithkyrian \Tokenizers \PreTokenizers \PunctuationPreTokenizer ;
@@ -37,6 +38,9 @@ public static function create(array $config): PreTokenizerInterface
3738 'Digits ' => new DigitsPreTokenizer (
3839 individualDigits: $ config ['individual_digits ' ] ?? false
3940 ),
41+ 'FixedLength ' => new FixedLengthPreTokenizer (
42+ length: $ config ['length ' ]
43+ ),
4044 'Metaspace ' => new MetaspacePreTokenizer (
4145 replacement: $ config ['replacement ' ] ?? ' ' ,
4246 addPrefixSpace: $ config ['add_prefix_space ' ] ?? true ,
Original file line number Diff line number Diff line change 1+ <?php
2+
3+ declare (strict_types=1 );
4+
5+ namespace Codewithkyrian \Tokenizers \PreTokenizers ;
6+
7+ use Codewithkyrian \Tokenizers \Contracts \PreTokenizerInterface ;
8+
/**
 * Splits text into consecutive chunks of a fixed number of characters.
 *
 * Chunk size is measured in multibyte characters (using the mbstring
 * internal encoding), not bytes. The last chunk of a string may be shorter
 * than the configured length when the string does not divide evenly.
 */
class FixedLengthPreTokenizer implements PreTokenizerInterface
{
    /**
     * @param int $length Number of characters per token; must be at least 1.
     *
     * @throws \InvalidArgumentException If $length is less than 1.
     */
    public function __construct(
        protected int $length
    ) {
        // A chunk size of zero or less can never advance through the text,
        // so reject it up front instead of looping forever at tokenize time.
        if ($length < 1) {
            throw new \InvalidArgumentException(
                "FixedLengthPreTokenizer length must be at least 1, got {$length}."
            );
        }
    }

    /**
     * Splits the given text (or each text in the given array) into chunks.
     *
     * @param array|string $text    A single string, or an array of strings
     *                              (arrays are processed recursively and
     *                              flattened into one list).
     * @param array        $options Not used by this pre-tokenizer; forwarded
     *                              unchanged on recursive calls.
     *
     * @return string[] The chunks in input order; empty for an empty string.
     */
    public function preTokenize(array|string $text, array $options = []): array
    {
        if (\is_array($text)) {
            $tokens = [];
            foreach ($text as $item) {
                // Append in place rather than array_merge() in a loop, which
                // would re-copy the accumulated list on every iteration.
                foreach ($this->preTokenize($item, $options) as $token) {
                    $tokens[] = $token;
                }
            }

            return $tokens;
        }

        // An empty string yields no tokens at all.
        if ($text === '') {
            return [];
        }

        // One pass over the string instead of repeated offset-based
        // mb_substr() calls, each of which rescans from the beginning.
        return mb_str_split($text, $this->length);
    }
}
You can’t perform that action at this time.
0 commit comments