-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenization.py
More file actions
129 lines (107 loc) · 3.67 KB
/
tokenization.py
File metadata and controls
129 lines (107 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
from utils import save_json, load_txt, load_json
""" Word-level tokenization """
def create_word_tok_map(file_path: str, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Build a word-level tokenizer vocabulary from a text file.

    Tokenizes the file with ``word_tokenize``, assigns each unique token a
    deterministic integer id, and saves two JSON files under ``artifacts/``:

    - ``string_to_int_word.json`` (token -> id)
    - ``int_to_string_word.json`` (id -> token)

    Parameters
    ----------
    file_path : str
        Path to the text file to build the vocabulary from.
    punctuations : collection of str
        Single characters emitted as standalone tokens. A ``frozenset``
        default avoids the mutable-default-argument pitfall.
    """
    # Read the raw training text.
    text = load_txt(filepath=file_path)
    tokens = word_tokenize(text=text, punctuations=punctuations)
    # Sorting makes the id assignment deterministic across runs.
    unique_tokens = sorted(set(tokens))
    # Forward (token -> id) and inverse (id -> token) vocabularies.
    string_to_int = {token: i for i, token in enumerate(unique_tokens)}
    int_to_string = {i: token for i, token in enumerate(unique_tokens)}
    # Create the output folder if it does not already exist.
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Persist both mappings as JSON.
    save_json(string_to_int, os.path.join(folder, "string_to_int_word.json"))
    save_json(int_to_string, os.path.join(folder, "int_to_string_word.json"))
def word_tokenize(text, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Split *text* into word tokens, treating punctuation as standalone tokens.

    Alphanumeric runs become word tokens; any character in *punctuations*
    becomes its own single-character token; every other character (e.g.
    whitespace) only terminates the current word and is discarded.

    Parameters
    ----------
    text : str
        The text to tokenize.
    punctuations : collection of str
        Single characters emitted as standalone tokens. A ``frozenset``
        default avoids the mutable-default-argument pitfall.

    Returns
    -------
    list[str]
        The tokens in order of appearance.
    """
    tokens = []
    current_word = ""
    for ch in text:
        if ch.isalnum():
            # Extend the current alphanumeric run.
            current_word += ch
        else:
            # Non-alphanumeric character ends the current word (if any).
            if current_word:
                tokens.append(current_word)
                current_word = ""
            # Punctuation is kept as its own token; other separators are dropped.
            if ch in punctuations:
                tokens.append(ch)
    # Flush a trailing word that ran to the end of the text.
    if current_word:
        tokens.append(current_word)
    return tokens
def word_encode(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer token ids.

    Parameters
    ----------
    text : str
        The text to encode (tokenized with ``word_tokenize`` defaults).
    tokenizer_path : str
        Path to the ``string_to_int_word.json`` mapping produced by
        ``create_word_tok_map``.

    Returns
    -------
    list[int]
        One id per token.

    Raises
    ------
    KeyError
        If a token is not present in the saved vocabulary.
    """
    string_to_int = load_json(tokenizer_path)
    # Comprehension replaces the manual append loop (same behavior, clearer).
    return [string_to_int[word] for word in word_tokenize(text)]
def word_decode(text, tokenizer_path:str, sep="", punctuations=set('.,:;!?()[]"-§¶_\'')):
    """
    Decode a sequence of integer token ids back into a string.

    Inserts *sep* between consecutive tokens, except directly before a
    punctuation token and after the final token.
    NOTE(review): the default ``sep=""`` joins words with no separator —
    presumably callers pass ``sep=" "``; confirm against call sites.
    """
    mapping = load_json(tokenizer_path)
    # Decode every id up front (JSON keys are strings, hence str(...)).
    words = [mapping[str(token)] for token in text]
    pieces = []
    last = len(words) - 1
    for i, word in enumerate(words):
        pieces.append(word)
        # Add the separator unless this is the final token or the next
        # token is punctuation (punctuation attaches to the previous word).
        if i < last and words[i + 1] not in punctuations:
            pieces.append(sep)
    return "".join(pieces)
""" Character-level tokenization """
def create_char_tok_map(file_path: str):
    """
    Build a character-level tokenizer vocabulary from a text file.

    Assigns each unique character of the file a deterministic integer id
    and saves two JSON files under ``artifacts/``:

    - ``string_to_int_char.json`` (char -> id)
    - ``int_to_string_char.json`` (id -> char)

    Parameters
    ----------
    file_path : str
        Path to the text file to build the vocabulary from.
    """
    # Read the raw training text.
    text = load_txt(filepath=file_path)
    # Sorting makes the id assignment deterministic across runs.
    chars = sorted(set(text))
    # Forward (char -> id) and inverse (id -> char) vocabularies.
    string_to_int = {char: i for i, char in enumerate(chars)}
    int_to_string = {i: char for i, char in enumerate(chars)}
    # Create the output folder if it does not already exist.
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Persist both mappings as JSON.
    save_json(string_to_int, os.path.join(folder, "string_to_int_char.json"))
    # BUG FIX: was "int_to_string_char1.json" — the stray "1" was
    # inconsistent with the word-level naming scheme (int_to_string_word.json).
    save_json(int_to_string, os.path.join(folder, "int_to_string_char.json"))
def char_encode(text, tokenizer_path:str):
    """
    Encode a text snippet character by character into integer token ids.

    Looks every character of *text* up in the char -> id mapping stored at
    *tokenizer_path* and returns the ids in order. Raises ``KeyError`` for
    characters missing from the saved vocabulary.
    """
    mapping = load_json(tokenizer_path)
    # One id per character, in original order.
    return [mapping[symbol] for symbol in text]
def char_decode(text, tokenizer_path: str):
    """
    Decode a sequence of integer token ids back into a string.

    Parameters
    ----------
    text : iterable of int
        The token ids to decode.
    tokenizer_path : str
        Path to the ``int_to_string_char.json`` mapping produced by
        ``create_char_tok_map``.

    Returns
    -------
    str
        The decoded characters concatenated in order.
    """
    int_to_string = load_json(tokenizer_path)
    # str(token): JSON object keys are always strings.
    # "".join(...) replaces repeated string += (which can be quadratic).
    return "".join(int_to_string[str(token)] for token in text)