-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDecoderFactory.php
More file actions
88 lines (78 loc) · 3.24 KB
/
DecoderFactory.php
File metadata and controls
88 lines (78 loc) · 3.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
<?php
declare(strict_types=1);
namespace Codewithkyrian\Tokenizers\Factories;
use Codewithkyrian\Tokenizers\Contracts\DecoderInterface;
use Codewithkyrian\Tokenizers\DataStructures\AddedToken;
use Codewithkyrian\Tokenizers\Decoders\BaseDecoder;
use Codewithkyrian\Tokenizers\Decoders\BPEDecoder;
use Codewithkyrian\Tokenizers\Decoders\ByteFallbackDecoder;
use Codewithkyrian\Tokenizers\Decoders\ByteLevelDecoder;
use Codewithkyrian\Tokenizers\Decoders\CTCDecoder;
use Codewithkyrian\Tokenizers\Decoders\DecoderSequence;
use Codewithkyrian\Tokenizers\Decoders\FuseDecoder;
use Codewithkyrian\Tokenizers\Decoders\MetaspaceDecoder;
use Codewithkyrian\Tokenizers\Decoders\ReplaceDecoder;
use Codewithkyrian\Tokenizers\Decoders\StripDecoder;
use Codewithkyrian\Tokenizers\Decoders\WordPieceDecoder;
class DecoderFactory
{
    /**
     * Builds a decoder from a tokenizer decoder configuration.
     *
     * An empty configuration yields a FuseDecoder constructed with a single
     * space. Otherwise the `type` key selects the decoder class and the
     * remaining keys are mapped onto its constructor arguments, using the
     * fallbacks shown in each arm when a key is absent.
     *
     * @param array<string, mixed>      $config          the decoder configuration
     * @param array<string, AddedToken> $addedTokens     Optional. Only needed for ByteLevelDecoder
     *                                                   (also forwarded to decoders nested in a `Sequence`).
     * @param null|string               $endOfWordSuffix Optional. Only needed for ByteLevelDecoder
     *                                                   (also forwarded to decoders nested in a `Sequence`).
     *
     * @throws \Exception when a non-empty configuration has a missing or unsupported `type`
     */
    public static function create(array $config, array $addedTokens = [], ?string $endOfWordSuffix = null): DecoderInterface
    {
        if (empty($config)) {
            return new FuseDecoder(' ');
        }

        $type = $config['type'] ?? null;

        return match ($type) {
            'WordPiece' => new WordPieceDecoder(
                prefix: $config['prefix'] ?? '##',
                cleanup: $config['cleanup'] ?? true,
            ),
            'Metaspace' => new MetaspaceDecoder(
                replacement: $config['replacement'] ?? ' ',
                addPrefixSpace: $config['add_prefix_space'] ?? true,
            ),
            'Replace' => new ReplaceDecoder(
                regex: $config['pattern']['Regex'] ?? null,
                subString: $config['pattern']['String'] ?? null,
                replacement: $config['content'] ?? '',
            ),
            'BPEDecoder' => new BPEDecoder(
                suffix: $config['suffix'] ?? '',
            ),
            'ByteLevel' => new ByteLevelDecoder($addedTokens, $endOfWordSuffix),
            'ByteFallback' => new ByteFallbackDecoder(),
            'CTC' => new CTCDecoder(
                padToken: $config['pad_token'] ?? '<pad>',
                wordDelimiterToken: $config['word_delimiter_token'] ?? '|',
                cleanup: $config['cleanup'] ?? true,
            ),
            'Fuse' => new FuseDecoder(),
            // Unlike the other arms, no fallbacks are applied here: the three
            // keys are read directly from the configuration.
            'Strip' => new StripDecoder(
                content: $config['content'],
                start: $config['start'],
                stop: $config['stop'],
            ),
            // Forward the ByteLevel-specific arguments so a ByteLevel decoder
            // nested in a sequence is built the same way as a top-level one.
            'Sequence' => self::createSequence($config['decoders'] ?? [], $addedTokens, $endOfWordSuffix),
            default => throw new \Exception("Unknown decoder type: {$type}"),
        };
    }

    /**
     * Builds a DecoderSequence from a list of decoder configurations.
     *
     * Each entry is built through {@see self::create()} with the same added
     * tokens and end-of-word suffix; only results that are BaseDecoder
     * instances are kept, in their original order.
     *
     * @param array<array<string, mixed>> $configs         the nested decoder configurations
     * @param array<string, AddedToken>   $addedTokens     forwarded to every nested decoder
     * @param null|string                 $endOfWordSuffix forwarded to every nested decoder
     *
     * @throws \Exception when a nested configuration has a missing or unsupported `type`
     */
    private static function createSequence(
        array $configs,
        array $addedTokens = [],
        ?string $endOfWordSuffix = null,
    ): DecoderSequence {
        $decoders = [];

        foreach ($configs as $config) {
            $decoder = self::create($config, $addedTokens, $endOfWordSuffix);

            if ($decoder instanceof BaseDecoder) {
                $decoders[] = $decoder;
            }
        }

        return new DecoderSequence($decoders);
    }
}