-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path tokenizer.py
More file actions
37 lines (31 loc) · 1.25 KB
/
tokenizer.py
File metadata and controls
37 lines (31 loc) · 1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from transformers import BertTokenizer
import regex as re
# Regex that splits a SMILES string into chemically meaningful tokens:
# bracketed atoms ("[NH4+]"), two-letter elements (Br, Cl), one-letter and
# aromatic atoms, bond symbols (= # - + \ / : ~), stereo/wildcard marks
# (@ ? > * $), two-digit ring closures ("%NN"), and single digits.
# Raw string: the previous non-raw literal relied on invalid escape
# sequences (SyntaxWarning on Python 3.12+); this value is byte-identical
# (the old "\\\\" collapses to the raw "\\").
PATTERN = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
class MolTranBertTokenizer(BertTokenizer):
    """Atom-level tokenizer for SMILES strings built on ``BertTokenizer``.

    BERT's basic (lowercasing) and WordPiece tokenizers are disabled; tokens
    are produced by a single regex pass over the SMILES text (``PATTERN``),
    and the inherited vocab machinery handles token <-> id conversion.
    """

    def __init__(self, vocab_file: str = '',
                 do_lower_case=False,
                 unk_token='<pad>',
                 sep_token='<eos>',
                 pad_token='<bos>' if False else '<pad>',  # placeholder removed below
                 cls_token='<bos>',
                 mask_token='<mask>',
                 **kwargs):
        """Load the vocabulary and configure SMILES-specific special tokens.

        Args:
            vocab_file: Path to the vocabulary file (one token per line).
            do_lower_case: Kept ``False`` — SMILES is case-sensitive
                ('C' aliphatic vs 'c' aromatic carbon).
            unk_token / sep_token / pad_token / cls_token / mask_token:
                Special-token strings; note unk and pad both default to
                '<pad>' here.
            **kwargs: Forwarded unchanged to ``BertTokenizer``.
        """
        # Fix: do_lower_case was previously accepted but never forwarded,
        # so the parent silently fell back to its own default (True).
        super().__init__(vocab_file,
                         do_lower_case=do_lower_case,
                         unk_token=unk_token,
                         sep_token=sep_token,
                         pad_token=pad_token,
                         cls_token=cls_token,
                         mask_token=mask_token,
                         **kwargs)
        self.regex_tokenizer = re.compile(PATTERN)
        # Disable BERT's sub-word machinery; _tokenize below replaces it.
        self.wordpiece_tokenizer = None
        self.basic_tokenizer = None
        # Hardcoded ids assume '<bos>' is row 0 and '<eos>' is row 1 of the
        # vocab file — NOTE(review): verify against the actual vocab.
        self.eos_token_id = 1
        self.bos_token_id = 0

    def _tokenize(self, text):
        """Split a SMILES string into atom/bond tokens via the regex."""
        return self.regex_tokenizer.findall(text)

    def convert_tokens_to_string(self, tokens):
        """Concatenate tokens back into a SMILES string (no separators)."""
        return "".join(tokens).strip()