-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizers.py
More file actions
154 lines (125 loc) · 4.28 KB
/
tokenizers.py
File metadata and controls
154 lines (125 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
from utils import save_json, load_txt, load_json
import torch
def create_word_token_map(file_path: str, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Build a word-level tokenizer vocabulary from a text file.

    Tokenizes the text into words plus single-character punctuation tokens,
    assigns each unique token an integer id, and appends an "<unk>" token
    for out-of-vocabulary words.
    - Saves 2 JSON files under "artifacts/" (string to integers, integers to string).

    Args:
        file_path: Path to the plain-text training file.
        punctuations: Characters treated as standalone tokens. A frozenset
            is used as the default to avoid the mutable-default pitfall.
    """
    # Reading the selected data
    text = load_txt(filepath=file_path)
    tokens = word_tokenize(text=text, punctuations=punctuations)
    unique_tokens = sorted(set(tokens))
    # Map each unique token to a stable integer id (and the reverse mapping)
    string_to_int = {token: i for i, token in enumerate(unique_tokens)}
    int_to_string = {i: token for i, token in enumerate(unique_tokens)}
    # Reserve the last id as the out-of-vocabulary fallback
    unk_idx = len(string_to_int)
    string_to_int["<unk>"] = unk_idx
    int_to_string[unk_idx] = "<unk>"
    # Creates a folder if it does not already exist
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Saves the dicts as JSON
    save_json(string_to_int, os.path.join(folder, "string_to_int_word.json"))
    save_json(int_to_string, os.path.join(folder, "int_to_string_word.json"))
def word_tokenize(text, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Tokenize a text into words, with punctuation characters as separate tokens.

    Alphanumeric runs form word tokens; any character in `punctuations`
    becomes its own single-character token; all other non-alphanumeric
    characters (e.g. whitespace) only terminate the current word.

    Args:
        text: The input string to tokenize.
        punctuations: Characters emitted as standalone tokens. A frozenset
            is used as the default to avoid the mutable-default pitfall.

    Returns:
        List of word and punctuation tokens, in input order.
    """
    tokens = []
    current_word = ""
    for ch in text:
        if ch.isalnum():
            current_word += ch
        else:
            # Non-alphanumeric char ends the word in progress, if any
            if current_word:
                tokens.append(current_word)
                current_word = ""
            if ch in punctuations:
                tokens.append(ch)
    # Flush a trailing word that ran to the end of the text
    if current_word:
        tokens.append(current_word)
    return tokens
def word_encoder(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer token ids.

    Args:
        text: The string to encode.
        tokenizer_path: Path to the string-to-int JSON mapping produced by
            create_word_token_map.

    Returns:
        List of integer token ids; words missing from the vocabulary map
        to the "<unk>" id.
    """
    tokens = word_tokenize(text)
    string_to_int = load_json(tokenizer_path)
    # dict.get with the "<unk>" fallback replaces the former bare except,
    # which silently masked any error (not just missing keys)
    unk = string_to_int["<unk>"]
    return [string_to_int.get(word, unk) for word in tokens]
def word_decoder(text, tokenizer_path: str, sep="", punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Decode a sequence of integer token ids back into a text string.

    Inserts `sep` between tokens, except directly before a punctuation token.
    NOTE(review): the default sep="" joins words with nothing between them;
    callers likely want sep=" " — default kept for backward compatibility.

    Args:
        text: Sequence of integer token ids.
        tokenizer_path: Path to the int-to-string JSON mapping.
        sep: Separator inserted between non-punctuation tokens.
        punctuations: Tokens never preceded by `sep`. A frozenset is used
            as the default to avoid the mutable-default pitfall.

    Returns:
        The decoded string.
    """
    int_to_string = load_json(tokenizer_path)
    pieces = []
    n = len(text)
    for idx, token in enumerate(text):
        # JSON object keys are always strings, hence str(token) lookups
        pieces.append(int_to_string[str(token)])
        if idx < n - 1 and int_to_string[str(text[idx + 1])] not in punctuations:
            pieces.append(sep)
    return "".join(pieces)
""" Character-level tokenization """
def create_char_token_map(file_path: str):
    """
    Build a character-level tokenizer vocabulary from a text file.

    Assigns every distinct character an integer id, appends an "<unk>"
    fallback token, and saves both mappings as JSON under "artifacts/"
    (string to integers, integers to string).

    Args:
        file_path: Path to the plain-text training file.
    """
    # Read the selected data and collect its character vocabulary
    text = load_txt(filepath=file_path)
    vocab = sorted(set(text))
    # Forward and reverse id mappings
    string_to_int = {ch: idx for idx, ch in enumerate(vocab)}
    int_to_string = {idx: ch for idx, ch in enumerate(vocab)}
    # Fallback id for characters not present in the training text
    unk_idx = len(vocab)
    string_to_int["<unk>"] = unk_idx
    int_to_string[unk_idx] = "<unk>"
    # Ensure the output folder exists, then persist both mappings as JSON
    out_dir = "artifacts"
    os.makedirs(out_dir, exist_ok=True)
    save_json(string_to_int, os.path.join(out_dir, "string_to_int_char.json"))
    save_json(int_to_string, os.path.join(out_dir, "int_to_string_char.json"))
def char_encoder(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer character ids.

    Args:
        text: The string to encode.
        tokenizer_path: Path to the string-to-int JSON mapping produced by
            create_char_token_map.

    Returns:
        List of integer ids; characters missing from the vocabulary map
        to the "<unk>" id.
    """
    string_to_int = load_json(tokenizer_path)
    # dict.get with the "<unk>" fallback replaces the former bare except,
    # which silently masked any error (not just missing keys)
    unk = string_to_int["<unk>"]
    return [string_to_int.get(ch, unk) for ch in text]
def char_decoder(text, tokenizer_path: str):
    """
    Decode a sequence of integer character ids back into a string.

    Args:
        text: Sequence of integer character ids.
        tokenizer_path: Path to the int-to-string JSON mapping.

    Returns:
        The decoded string.
    """
    int_to_string = load_json(tokenizer_path)
    # JSON object keys are strings, hence str(ch); str.join avoids the
    # quadratic cost of repeated += concatenation
    return "".join(int_to_string[str(ch)] for ch in text)
if __name__ == "__main__":
    # Build the character-level vocabulary from the training corpus
    train_path = "data/wikitext2_train.txt"
    # create_word_token_map(file_path=train_path, punctuations=set('.,:;!?()[]"-§¶_\''))
    create_char_token_map(file_path=train_path)