-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizers.py
More file actions
154 lines (125 loc) · 4.28 KB
/
tokenizers.py
File metadata and controls
154 lines (125 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
from utils import save_json, load_txt, load_json
import torch
def create_word_token_map(file_path: str, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Build a word-level tokenizer vocabulary from a text file.

    Tokenizes the text into words plus single-character punctuation tokens,
    assigns each unique token an integer id, and appends an "<unk>" token
    for out-of-vocabulary words.
    - Saves 2 JSON files under "artifacts/" (string to integers, integers to string).

    Args:
        file_path: Path to the plain-text training file.
        punctuations: Characters treated as standalone tokens. A frozenset
            is used as the default to avoid the mutable-default pitfall.
    """
    # Reading the selected data
    text = load_txt(filepath=file_path)
    tokens = word_tokenize(text=text, punctuations=punctuations)
    unique_tokens = sorted(set(tokens))
    # Map each unique token to a stable integer id (and the reverse mapping)
    string_to_int = {token: i for i, token in enumerate(unique_tokens)}
    int_to_string = {i: token for i, token in enumerate(unique_tokens)}
    # Reserve the last id as the out-of-vocabulary fallback
    unk_idx = len(string_to_int)
    string_to_int["<unk>"] = unk_idx
    int_to_string[unk_idx] = "<unk>"
    # Creates a folder if it does not already exist
    folder = "artifacts"
    os.makedirs(folder, exist_ok=True)
    # Saves the dicts as JSON
    save_json(string_to_int, os.path.join(folder, "string_to_int_word.json"))
    save_json(int_to_string, os.path.join(folder, "int_to_string_word.json"))
def word_tokenize(text, punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Tokenize a text into words, with punctuation characters as separate tokens.

    Alphanumeric runs form word tokens; any character in `punctuations`
    becomes its own single-character token; all other non-alphanumeric
    characters (e.g. whitespace) only terminate the current word.

    Args:
        text: The input string to tokenize.
        punctuations: Characters emitted as standalone tokens. A frozenset
            is used as the default to avoid the mutable-default pitfall.

    Returns:
        List of word and punctuation tokens, in input order.
    """
    tokens = []
    current_word = ""
    for ch in text:
        if ch.isalnum():
            current_word += ch
        else:
            # Non-alphanumeric char ends the word in progress, if any
            if current_word:
                tokens.append(current_word)
                current_word = ""
            if ch in punctuations:
                tokens.append(ch)
    # Flush a trailing word that ran to the end of the text
    if current_word:
        tokens.append(current_word)
    return tokens
def word_encoder(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer token ids.

    Args:
        text: The string to encode.
        tokenizer_path: Path to the string-to-int JSON mapping produced by
            create_word_token_map.

    Returns:
        List of integer token ids; words missing from the vocabulary map
        to the "<unk>" id.
    """
    tokens = word_tokenize(text)
    string_to_int = load_json(tokenizer_path)
    # dict.get with the "<unk>" fallback replaces the former bare except,
    # which silently masked any error (not just missing keys)
    unk = string_to_int["<unk>"]
    return [string_to_int.get(word, unk) for word in tokens]
def word_decoder(text, tokenizer_path: str, sep="", punctuations=frozenset('.,:;!?()[]"-§¶_\'')):
    """
    Decode a sequence of integer token ids back into a text string.

    Inserts `sep` between tokens, except directly before a punctuation token.
    NOTE(review): the default sep="" joins words with nothing between them;
    callers likely want sep=" " — default kept for backward compatibility.

    Args:
        text: Sequence of integer token ids.
        tokenizer_path: Path to the int-to-string JSON mapping.
        sep: Separator inserted between non-punctuation tokens.
        punctuations: Tokens never preceded by `sep`. A frozenset is used
            as the default to avoid the mutable-default pitfall.

    Returns:
        The decoded string.
    """
    int_to_string = load_json(tokenizer_path)
    pieces = []
    n = len(text)
    for idx, token in enumerate(text):
        # JSON object keys are always strings, hence str(token) lookups
        pieces.append(int_to_string[str(token)])
        if idx < n - 1 and int_to_string[str(text[idx + 1])] not in punctuations:
            pieces.append(sep)
    return "".join(pieces)
""" Character-level tokenization """
def create_char_token_map(file_path: str):
    """
    Build a character-level tokenizer vocabulary from a text file.

    Assigns every distinct character an integer id, appends an "<unk>"
    fallback token, and saves both mappings as JSON under "artifacts/"
    (string to integers, integers to string).

    Args:
        file_path: Path to the plain-text training file.
    """
    # Read the selected data and collect its character vocabulary
    text = load_txt(filepath=file_path)
    vocab = sorted(set(text))
    # Forward and reverse id mappings
    string_to_int = {ch: idx for idx, ch in enumerate(vocab)}
    int_to_string = {idx: ch for idx, ch in enumerate(vocab)}
    # Fallback id for characters not present in the training text
    unk_idx = len(vocab)
    string_to_int["<unk>"] = unk_idx
    int_to_string[unk_idx] = "<unk>"
    # Ensure the output folder exists, then persist both mappings as JSON
    out_dir = "artifacts"
    os.makedirs(out_dir, exist_ok=True)
    save_json(string_to_int, os.path.join(out_dir, "string_to_int_char.json"))
    save_json(int_to_string, os.path.join(out_dir, "int_to_string_char.json"))
def char_encoder(text, tokenizer_path: str):
    """
    Encode a text snippet into a list of integer character ids.

    Args:
        text: The string to encode.
        tokenizer_path: Path to the string-to-int JSON mapping produced by
            create_char_token_map.

    Returns:
        List of integer ids; characters missing from the vocabulary map
        to the "<unk>" id.
    """
    string_to_int = load_json(tokenizer_path)
    # dict.get with the "<unk>" fallback replaces the former bare except,
    # which silently masked any error (not just missing keys)
    unk = string_to_int["<unk>"]
    return [string_to_int.get(ch, unk) for ch in text]
def char_decoder(text, tokenizer_path: str):
    """
    Decode a sequence of integer character ids back into a string.

    Args:
        text: Sequence of integer character ids.
        tokenizer_path: Path to the int-to-string JSON mapping.

    Returns:
        The decoded string.
    """
    int_to_string = load_json(tokenizer_path)
    # JSON object keys are strings, hence str(ch); str.join avoids the
    # quadratic cost of repeated += concatenation
    return "".join(int_to_string[str(ch)] for ch in text)
if __name__ == "__main__":
    # Build the character-level vocabulary from the training corpus
    train_path = "data/wikitext2_train.txt"
    # create_word_token_map(file_path=train_path, punctuations=set('.,:;!?()[]"-§¶_\''))
    create_char_token_map(file_path=train_path)