train_tokenizer.py
#!/usr/bin/python3
# Based on https://huggingface.co/learn/nlp-course/en/chapter6/8?fw=pt#building-a-bpe-tokenizer-from-scratch
import glob

from tokenizers import decoders, models, pre_tokenizers, processors, trainers, Tokenizer
from transformers import GPT2TokenizerFast
CORPUS_DIRPATH = 'text_corpus'    # directory of plain-text training files
TOKENIZER_DIRNAME = '_tokenizer_'
VOCAB_SIZE = 0xC000  # = 49,152; more? less? see e.g. https://www.rohan-paul.com/p/tutorial-balancing-vocabulary-size

# Byte-level BPE, mirroring the GPT-2 setup from the linked tutorial.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=['<|endoftext|>'])
# Train on every file in the corpus directory.
corpus_filepaths = glob.glob(f"{CORPUS_DIRPATH}/*")
tokenizer.train(corpus_filepaths, trainer=trainer)

# Byte-level post-processing and decoding, so encode/decode round-trips are lossless.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

# Wrap in a GPT2TokenizerFast so the tokenizer can be saved and reloaded
# through the standard transformers API.
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)
wrapped_tokenizer.save_pretrained(TOKENIZER_DIRNAME)
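
# Quick sanity check (a minimal sketch; the sample string below is arbitrary
# and not part of the original script): reload the saved tokenizer and confirm
# that an encode/decode round trip reproduces the input exactly.
reloaded = GPT2TokenizerFast.from_pretrained(TOKENIZER_DIRNAME)
sample = 'Hello, tokenizer!'
ids = reloaded(sample)['input_ids']
assert reloaded.decode(ids) == sample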