-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenization.py
More file actions
129 lines (107 loc) · 3.67 KB
/
tokenization.py
File metadata and controls
129 lines (107 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
from utils import save_json, load_txt, load_json
""" Word-level tokenization """
def create_word_tok_map(file_path: str, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Build a word-level tokenizer vocabulary from a text file.

    Tokenizes the file with ``word_tokenize``, assigns each unique token a
    deterministic integer id, and saves two JSON files under ``artifacts/``:

    - ``string_to_int_word.json`` (token -> id)
    - ``int_to_string_word.json`` (id -> token)

    Parameters
    ----------
    file_path : str
        Path to the text file to build the vocabulary from.
    punctuations : collection of str
        Single characters emitted as standalone tokens. A ``frozenset``
        default avoids the mutable-default-argument pitfall.
    """
    # Read the raw training text.
    text = load_txt(filepath=file_path)
    tokens = word_tokenize(text=text, punctuations=punctuations)
    # Sorting makes the id assignment deterministic across runs.
    unique_tokens = sorted(set(tokens))
    # Forward (token -> id) and inverse (id -> token) vocabularies.
    string_to_int = {token: i for i, token in enumerate(unique_tokens)}
    int_to_string = {i: token for i, token in enumerate(unique_tokens)}
    # Create the output folder if it does not already exist.
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Persist both mappings as JSON.
    save_json(string_to_int, os.path.join(folder, "string_to_int_word.json"))
    save_json(int_to_string, os.path.join(folder, "int_to_string_word.json"))
def word_tokenize(text, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Split *text* into word tokens, treating punctuation as standalone tokens.

    Alphanumeric runs become word tokens; any character in *punctuations*
    becomes its own single-character token; every other character (e.g.
    whitespace) only terminates the current word and is discarded.

    Parameters
    ----------
    text : str
        The text to tokenize.
    punctuations : collection of str
        Single characters emitted as standalone tokens. A ``frozenset``
        default avoids the mutable-default-argument pitfall.

    Returns
    -------
    list[str]
        The tokens in order of appearance.
    """
    tokens = []
    current_word = ""
    for ch in text:
        if ch.isalnum():
            # Extend the current alphanumeric run.
            current_word += ch
        else:
            # Non-alphanumeric character ends the current word (if any).
            if current_word:
                tokens.append(current_word)
                current_word = ""
            # Punctuation is kept as its own token; other separators are dropped.
            if ch in punctuations:
                tokens.append(ch)
    # Flush a trailing word that ran to the end of the text.
    if current_word:
        tokens.append(current_word)
    return tokens
def word_encode(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer token ids.

    Parameters
    ----------
    text : str
        The text to encode (tokenized with ``word_tokenize`` defaults).
    tokenizer_path : str
        Path to the ``string_to_int_word.json`` mapping produced by
        ``create_word_tok_map``.

    Returns
    -------
    list[int]
        One id per token.

    Raises
    ------
    KeyError
        If a token is not present in the saved vocabulary.
    """
    string_to_int = load_json(tokenizer_path)
    # Comprehension replaces the manual append loop (same behavior, clearer).
    return [string_to_int[word] for word in word_tokenize(text)]
def word_decode(text, tokenizer_path:str, sep="", punctuations=set('.,:;!?()[]"-§¶_\'')):
    """
    Decode a sequence of integer token ids back into a string.

    Inserts *sep* between consecutive tokens, except directly before a
    punctuation token and after the final token.
    NOTE(review): the default ``sep=""`` joins words with no separator —
    presumably callers pass ``sep=" "``; confirm against call sites.
    """
    mapping = load_json(tokenizer_path)
    # Decode every id up front (JSON keys are strings, hence str(...)).
    words = [mapping[str(token)] for token in text]
    pieces = []
    last = len(words) - 1
    for i, word in enumerate(words):
        pieces.append(word)
        # Add the separator unless this is the final token or the next
        # token is punctuation (punctuation attaches to the previous word).
        if i < last and words[i + 1] not in punctuations:
            pieces.append(sep)
    return "".join(pieces)
""" Character-level tokenization """
def create_char_tok_map(file_path: str):
    """
    Build a character-level tokenizer vocabulary from a text file.

    Assigns each unique character of the file a deterministic integer id
    and saves two JSON files under ``artifacts/``:

    - ``string_to_int_char.json`` (char -> id)
    - ``int_to_string_char.json`` (id -> char)

    Parameters
    ----------
    file_path : str
        Path to the text file to build the vocabulary from.
    """
    # Read the raw training text.
    text = load_txt(filepath=file_path)
    # Sorting makes the id assignment deterministic across runs.
    chars = sorted(set(text))
    # Forward (char -> id) and inverse (id -> char) vocabularies.
    string_to_int = {char: i for i, char in enumerate(chars)}
    int_to_string = {i: char for i, char in enumerate(chars)}
    # Create the output folder if it does not already exist.
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Persist both mappings as JSON.
    save_json(string_to_int, os.path.join(folder, "string_to_int_char.json"))
    # BUG FIX: was "int_to_string_char1.json" — the stray "1" was
    # inconsistent with the word-level naming scheme (int_to_string_word.json).
    save_json(int_to_string, os.path.join(folder, "int_to_string_char.json"))
def char_encode(text, tokenizer_path:str):
    """
    Encode a text snippet character by character into integer token ids.

    Looks every character of *text* up in the char -> id mapping stored at
    *tokenizer_path* and returns the ids in order. Raises ``KeyError`` for
    characters missing from the saved vocabulary.
    """
    mapping = load_json(tokenizer_path)
    # One id per character, in original order.
    return [mapping[symbol] for symbol in text]
def char_decode(text, tokenizer_path: str):
    """
    Decode a sequence of integer token ids back into a string.

    Parameters
    ----------
    text : iterable of int
        The token ids to decode.
    tokenizer_path : str
        Path to the ``int_to_string_char.json`` mapping produced by
        ``create_char_tok_map``.

    Returns
    -------
    str
        The decoded characters concatenated in order.
    """
    int_to_string = load_json(tokenizer_path)
    # str(token): JSON object keys are always strings.
    # "".join(...) replaces repeated string += (which can be quadratic).
    return "".join(int_to_string[str(token)] for token in text)