-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathembedding.py
More file actions
35 lines (23 loc) · 881 Bytes
/
embedding.py
File metadata and controls
35 lines (23 loc) · 881 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import urllib.request
import os
import re
class Simple_Tokenizer:
    """Hold the lookup tables that map vocabulary tokens to integer ids and back.

    Attributes:
        str_to_int: dict mapping each token string to its integer id.
        int_to_str: dict mapping each integer id back to its token string.
    """

    def __init__(self, vocab):
        """Build both lookup tables from *vocab*.

        Args:
            vocab: either an iterable of token strings, in which case each
                token's id is its position in the iteration order, or a dict
                of ``token -> id`` whose ids are kept as given.
        """
        if isinstance(vocab, dict):
            # Copy so later changes to the caller's dict do not leak in.
            self.str_to_int = dict(vocab)
        else:
            # If a token is repeated, the id of its last occurrence wins.
            self.str_to_int = {token: index for index, token in enumerate(vocab)}
        # Derive the reverse table from the forward one so the two are always
        # inverses of each other (previously this was a list of one-item
        # dicts, which could not be indexed to get a token back).
        self.int_to_str = {index: token for token, index in self.str_to_int.items()}

    def encode(self, file_name):
        """Read the text file *file_name* and print how many characters it holds.

        No token ids are produced yet; the method only reads the file and
        reports its length. Returns ``None``.

        Args:
            file_name: path of the text file to read.
        """
        with open(file_name, "r") as if_file:
            raw_text = if_file.read()
        print(len(raw_text))
if __name__ == "__main__":
    # TODO: add a command-line parser that accepts
    #   - the file used for building the vocabulary
    #   - a file to take as input (presumably the text to encode -- confirm)
    with open("vocab.txt", "r") as vocab_file:
        corpus = vocab_file.read()
    print(len(corpus))

    # Split on punctuation, double dashes and whitespace. The capture group
    # makes re.split keep the delimiters themselves as separate pieces.
    pieces = re.split(r'([.,:;?()"_!\']|--|\s)', corpus)

    # Drop pieces that are empty or whitespace-only; trim the rest.
    tokens = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            tokens.append(trimmed)

    # Unique tokens in sorted order form the vocabulary.
    vocabulary = sorted(set(tokens))

    tokenizer = Simple_Tokenizer(vocabulary)
    tokenizer.encode("sample_input.txt")