InfiniText/main.py at master · BryanValc/InfiniText · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
import json
import random as r
import re
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

def save_model(file, output_path='model.json'):
    """Build a word-bigram frequency model from a text stream and save it as JSON.

    The text is split on whitespace, every token is lower-cased and stripped
    of all characters outside ``a-z``. Tokens that end up empty (numbers,
    dashes, stand-alone punctuation, ...) are dropped so that they neither
    become an empty-string key in the model nor produce bogus transitions.

    Args:
        file: An open, readable text file-like object (anything with ``read()``).
        output_path: Where the JSON model is written. Defaults to
            ``'model.json'`` in the current working directory.

    The saved structure is ``{word: {following_word: count}}``.
    """
    # Create a list of normalised words out of the text: split on whitespace
    # (spaces and line breaks), then keep only the letters of each token.
    words = []
    for token in file.read().split():
        cleaned = re.sub('[^a-z]', '', token.lower())
        if cleaned:  # skip tokens that contained no letters at all
            words.append(cleaned)

    # The dictionary that holds the model weights.
    model = defaultdict(lambda: defaultdict(int))

    # Count, for every word, how often each other word directly follows it.
    for current_word, next_word in zip(words, words[1:]):
        model[current_word][next_word] += 1

    # Store the weights in a JSON file.
    with open(output_path, 'w') as outfile:
        json.dump(model, outfile)

def load_model(path):
    """Read a JSON model from ``path`` and return it as nested defaultdicts.

    The outer mapping creates an empty ``defaultdict(int)`` for unknown
    words; each inner mapping yields ``0`` for unknown following words.
    """
    with open(path, 'r') as infile:
        raw_model = json.load(infile)

    # json.load returns plain dicts, so rebuild the defaultdict structure
    # entry by entry, keeping the order of the file.
    restored = defaultdict(lambda: defaultdict(int))
    for word, followers in raw_model.items():
        restored[word] = defaultdict(int, followers)

    return restored

def print_model(model):
    """Print every word of the model followed by its indented follower weights.

    Each word is printed as ``word:`` on its own line; each follower is
    printed below it as a tab-indented ``following_word: weight`` line.
    """
    for word in model:
        print(f"{word}:")
        followers = model[word]
        for following_word in followers:
            weight = followers[following_word]
            print(f"\t{following_word}: {weight}")

def encode_target_text(text):
    """Summarise the target text as three bit strings used to filter the model.

    Returns a tuple ``(letters, lengths, word_count)``:

    * ``letters`` - 26 bits, one per letter ``a``..``z``; a bit is ``1`` when
      that letter occurs anywhere in the lower-cased text. For instance, a
      text containing only the letters a, b and c yields ``111`` followed by
      23 zeros.
    * ``lengths`` - 16 bits; bit ``i`` (0-based) is ``1`` when some word of
      length ``i + 1`` occurs. For instance, a text whose words are all of
      length 3 or 5 yields ``0010100000000000``. Lengths above 16 have no
      bit and are not recorded.
    * ``word_count`` - the number of words as an 8-bit binary number. Counts
      above 255 do not fit and leave this field as eight zeros.

    Words are the matches of the regular expression ``\\b\\w+\\b``.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    tokens = re.findall(r'\b\w+\b', text)
    present_chars = set(text.lower())  # every distinct character of the text
    present_lengths = {len(token) for token in tokens}
    total_words = len(tokens)

    # One flag per letter of the alphabet, in alphabetical order.
    letter_bits = ''.join('1' if ch in present_chars else '0' for ch in alphabet)

    # One flag per word length 1..16 (position 0 stands for length 1).
    length_bits = ''.join('1' if size in present_lengths else '0' for size in range(1, 17))

    # The word count only fits when it is representable in 8 bits.
    if total_words <= 255:
        count_bits = format(total_words, '08b')
    else:
        count_bits = '0' * 8

    return letter_bits, length_bits, count_bits

def create_submodel(model, letters_bit_string, length_bit_string):
    """Return the part of ``model`` compatible with the given letter/length filters.

    A word is kept only if all of its letters are flagged in
    ``letters_bit_string`` (26 bits, ``a``..``z``) and its length is flagged
    in ``length_bit_string`` (16 bits, position 0 stands for length 1).
    A transition is copied only when both the word and its following word
    pass the filter, so words left without any valid follower do not appear
    as keys of the result.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'

    allowed_letters = set()
    for position, letter in enumerate(alphabet):
        if letters_bit_string[position] == '1':
            allowed_letters.add(letter)

    allowed_lengths = set()
    for position in range(16):
        if length_bit_string[position] == '1':
            allowed_lengths.add(position + 1)  # positions are 0-based, lengths 1-based

    def passes_filter(candidate):
        # Same test for source words and following words.
        return allowed_letters.issuperset(candidate) and len(candidate) in allowed_lengths

    submodel = defaultdict(lambda: defaultdict(int))
    for word, weights in model.items():
        if not passes_filter(word):
            continue
        kept = {
            following_word: weight
            for following_word, weight in weights.items()
            if passes_filter(following_word)
        }
        if kept:  # only create the entry when at least one transition survives
            submodel[word].update(kept)

    return submodel

def get_word_index(target_text, submodel):
    """Return the index of the target text's first word as a 16-bit binary string.

    The index is the position of the first whitespace-separated word of
    ``target_text`` among the alphabetically sorted keys of ``submodel``.
    When the text is blank or its first word is not a key of the submodel,
    index 0 is returned.
    """
    ordered_words = sorted(submodel.keys())

    first_word = None
    if target_text.strip() != "":
        first_word = target_text.split()[0]

    try:
        position = ordered_words.index(first_word)
    except ValueError:
        # Blank text or unknown first word: fall back to the first entry.
        position = 0

    return format(position, '016b')


def find_matching_seed(target_text, model, word_index, word_count, max_seeds=2 ** 32):
    """Brute-force the smallest RNG seed whose generated text equals the target.

    For every candidate seed the module-level random generator ``r`` is
    re-seeded and a word sequence is generated: it starts at the word found
    at ``word_index`` among the sorted keys of ``model`` and repeatedly draws
    a following word with ``r.choices`` weighted by the model counts, for at
    most ``word_count - 1`` draws or until the current word has no entry in
    the model.

    Args:
        target_text: The text to reproduce; it is compared word by word after
            splitting on whitespace.
        model: Mapping ``{word: {following_word: weight}}``.
        word_index: Index of the starting word in ``sorted(model.keys())``.
        word_count: Maximum number of words to generate.
        max_seeds: Number of seeds to try, starting at 0. Defaults to all
            32-bit seeds.

    Returns:
        The first matching seed, or ``None`` when no tried seed matches.

    Note:
        This call leaves the global random generator re-seeded.
    """
    target_words = target_text.split()
    words = sorted(model.keys())

    try:
        first_word = words[word_index]
    except IndexError:
        return None  # no such starting word, so nothing can match

    # Cheap impossibility checks that do not depend on the seed; without them
    # the whole seed space would be scanned before giving up.
    target_length = len(target_words)
    if target_length == 0 or target_words[0] != first_word:
        return None
    if target_length > max(1, word_count):
        return None  # the generator never emits that many words

    # A candidate is abandoned at its first mismatch, so the current word is
    # always a target word: only those transitions are ever needed. Building
    # the population/weight lists once avoids redoing it for every draw.
    transitions = {
        word: (list(model[word].keys()), list(model[word].values()))
        for word in set(target_words)
        if word in model
    }
    draws = word_count - 1

    for seed in range(max_seeds):
        r.seed(seed)
        current = first_word
        matched = 1  # number of target words reproduced so far

        for _ in range(draws):
            options = transitions.get(current)
            if options is None:
                break  # no word can follow: the generated sequence ends here
            current = r.choices(options[0], options[1])[0]
            if matched >= target_length or current != target_words[matched]:
                matched = -1  # diverged from the target, try the next seed
                break
            matched += 1

        if matched == target_length:
            return seed  # found a matching seed

    return None  # no matching seed found

def recreate_text(model, letters_bit_string, length_bit_string, word_count_bit_string, word_index_bit_string, seed):
    """Regenerate a text from the full model and the encoded parameters.

    The model is first reduced with ``create_submodel`` using the letter and
    length bit strings. The word count and starting-word index are decoded
    from their binary strings, the module-level random generator ``r`` is
    seeded with ``seed``, and words are drawn with ``r.choices`` weighted by
    the submodel counts, starting from the word at the decoded index among
    the sorted submodel keys.

    Returns the generated words joined by single spaces. Generation stops
    early when the current word has no entry in the submodel.
    """
    submodel = create_submodel(model, letters_bit_string, length_bit_string)

    # Decode the binary parameters.
    total_words = int(word_count_bit_string, 2)
    start_index = int(word_index_bit_string, 2)

    # Seeding makes the draws below reproducible.
    r.seed(seed)

    vocabulary = sorted(submodel)
    current = vocabulary[start_index]
    generated = [current]

    remaining = total_words - 1
    while remaining > 0 and current in submodel:
        followers = submodel[current]
        current = r.choices(list(followers), list(followers.values()))[0]
        generated.append(current)
        remaining -= 1

    return ' '.join(generated)

#static tests

# text_file = open("input/alice.txt", "r")
# target_text = open("input/target.txt", "r")
#
# save_model(text_file)
# model = load_model('model.json')
# # print_model(model)
#
# # encode the target text for discrimination of words
# bit_string_letters, bit_string_length, bit_word_count = encode_target_text(target_text.read())
# print(f"Letters bit string: {bit_string_letters}")
# print(f"Length bit string: {bit_string_length}")
# print(f"Word count bit string: {bit_word_count}")
#
# # create a submodel based on the target text discrimination parameters
# submodel = create_submodel(model, bit_string_letters, bit_string_length)
# print_model(submodel)
#
# # get the new model index of the first word in the target text
# target_text = open("input/target.txt", "r")
# bit_word_index = get_word_index(target_text.read(), submodel)
#
# print(f"Word index: {bit_word_index}")
#
# # find a matching seed
# target_text = open("input/target.txt", "r")
# seed = find_matching_seed(target_text.read(), submodel, int(bit_word_index, 2), int(bit_word_count, 2))
# print(f"Seed: {seed}")
#
# final_text = recreate_text(model, bit_string_letters, bit_string_length, bit_word_count, bit_word_index, seed)
#
# print(final_text + "... This was a triumph!")