Skip to content

Commit c907ffd

Browse files
feat: implement getConfig for tokenizer and all components
1 parent 637bca7 commit c907ffd

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+916
-72
lines changed

src/Contracts/DecoderInterface.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,14 @@ interface DecoderInterface
1212
* @param string[] $tokens
1313
*/
1414
public function decode(array $tokens): string;
15+
16+
/**
17+
* Get configuration value(s).
18+
*
19+
* @param null|string $key The configuration key. If null, returns all config.
20+
* @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
21+
*
22+
* @return mixed the configuration value, or full config array if $key is null
23+
*/
24+
public function getConfig(?string $key = null, mixed $default = null): mixed;
1525
}

src/Contracts/ModelInterface.php

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@ public function getVocabSize(): int;
5454
public function addToken(string $token, int $id): void;
5555

5656
/**
57-
* Get the end of word suffix, if any.
58-
* Only some models (like BPE) have this property.
57+
* Get configuration value(s).
5958
*
60-
* @return null|string the end of word suffix
59+
* @param null|string $key The configuration key (e.g., 'dropout'). If null, returns all config.
60+
* @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
61+
*
62+
* @return mixed the configuration value, or full config array if $key is null
6163
*/
62-
public function getEndOfWordSuffix(): ?string;
64+
public function getConfig(?string $key = null, mixed $default = null): mixed;
6365
}

src/Contracts/NormalizerInterface.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,14 @@
77
interface NormalizerInterface
88
{
99
public function normalize(string $text): string;
10+
11+
/**
12+
* Get configuration value(s).
13+
*
14+
* @param null|string $key The configuration key. If null, returns all config.
15+
* @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
16+
*
17+
* @return mixed the configuration value, or full config array if $key is null
18+
*/
19+
public function getConfig(?string $key = null, mixed $default = null): mixed;
1020
}

src/Contracts/PostProcessorInterface.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,14 @@ interface PostProcessorInterface
1515
* @return array{0: string[], 1: int[]} the processed tokens and type IDs
1616
*/
1717
public function process(array $tokens, ?array $pair = null, bool $addSpecialTokens = true): array;
18+
19+
/**
20+
* Get configuration value(s).
21+
*
22+
* @param null|string $key The configuration key. If null, returns all config.
23+
* @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
24+
*
25+
* @return mixed the configuration value, or full config array if $key is null
26+
*/
27+
public function getConfig(?string $key = null, mixed $default = null): mixed;
1828
}

src/Contracts/PreTokenizerInterface.php

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,14 @@ interface PreTokenizerInterface
1515
* @return string[]
1616
*/
1717
public function preTokenize(array|string $text, array $options = []): array;
18+
19+
/**
20+
* Get configuration value(s).
21+
*
22+
* @param null|string $key The configuration key. If null, returns all config.
23+
* @param mixed $default The default value if the key doesn't exist (ignored when $key is null)
24+
*
25+
* @return mixed the configuration value, or full config array if $key is null
26+
*/
27+
public function getConfig(?string $key = null, mixed $default = null): mixed;
1828
}

src/DataStructures/AddedToken.php

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* - Whether they should only match single words
1111
* - Whether to include any whitespace on its left or right.
1212
*/
13-
readonly class AddedToken
13+
class AddedToken implements \JsonSerializable
1414
{
1515
public function __construct(
1616
/**
@@ -58,4 +58,20 @@ public static function fromArray(array $data): self
5858
$data['special'] ?? false,
5959
);
6060
}
61+
62+
/**
63+
* @return array<string, mixed>
64+
*/
65+
public function jsonSerialize(): array
66+
{
67+
return [
68+
'id' => $this->id,
69+
'content' => $this->content,
70+
'single_word' => $this->singleWord,
71+
'lstrip' => $this->lStrip,
72+
'rstrip' => $this->rStrip,
73+
'normalized' => $this->normalized,
74+
'special' => $this->special,
75+
];
76+
}
6177
}

src/Decoders/BPEDecoder.php

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@ class BPEDecoder extends BaseDecoder
88
{
99
public function __construct(protected string $suffix = '') {}
1010

11+
public function getConfig(?string $key = null, mixed $default = null): mixed
12+
{
13+
if (null !== $key) {
14+
return match ($key) {
15+
'type' => 'BPEDecoder',
16+
'suffix' => $this->suffix,
17+
default => $default,
18+
};
19+
}
20+
21+
return [
22+
'type' => 'BPEDecoder',
23+
'suffix' => $this->suffix,
24+
];
25+
}
26+
1127
protected function processTokens(array $tokens): array
1228
{
1329
$decoded = [];

src/Decoders/ByteFallbackDecoder.php

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,15 @@
66

77
class ByteFallbackDecoder extends BaseDecoder
88
{
9+
public function getConfig(?string $key = null, mixed $default = null): mixed
10+
{
11+
if (null !== $key) {
12+
return 'type' === $key ? 'ByteFallback' : $default;
13+
}
14+
15+
return ['type' => 'ByteFallback'];
16+
}
17+
918
protected function processTokens(array $tokens): array
1019
{
1120
$newTokens = [];

src/Decoders/ByteLevelDecoder.php

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -275,11 +275,16 @@ public function __construct(protected array $addedTokens = [], protected ?string
275275

276276
/**
277277
* Get configuration value(s): 'ByteLevel' for the 'type' key, $default for any other key, or the full config array when $key is null.
278-
*
279-
* @param string[] $tokens array of tokens to be decoded
280-
*
281-
* @return string the decoded string
282278
*/
279+
public function getConfig(?string $key = null, mixed $default = null): mixed
280+
{
281+
if (null !== $key) {
282+
return 'type' === $key ? 'ByteLevel' : $default;
283+
}
284+
285+
return ['type' => 'ByteLevel'];
286+
}
287+
283288
public function decode(array $tokens): string
284289
{
285290
$decoded = parent::decode($tokens);

src/Decoders/CTCDecoder.php

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,26 @@ public function __construct(
1414
protected bool $cleanup = true
1515
) {}
1616

17+
public function getConfig(?string $key = null, mixed $default = null): mixed
18+
{
19+
if (null !== $key) {
20+
return match ($key) {
21+
'type' => 'CTC',
22+
'pad_token' => $this->padToken,
23+
'word_delimiter_token' => $this->wordDelimiterToken,
24+
'cleanup' => $this->cleanup,
25+
default => $default,
26+
};
27+
}
28+
29+
return [
30+
'type' => 'CTC',
31+
'pad_token' => $this->padToken,
32+
'word_delimiter_token' => $this->wordDelimiterToken,
33+
'cleanup' => $this->cleanup,
34+
];
35+
}
36+
1737
protected function processTokens(array $tokens): array
1838
{
1939
if (empty($tokens)) {

0 commit comments

Comments
 (0)