-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtesting_models.py
More file actions
99 lines (78 loc) · 3.08 KB
/
testing_models.py
File metadata and controls
99 lines (78 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import gzip
import random
from n_gram import Ngram
from backoff import Backoff
from weighted import LearnedWeighted
from utils import run_model
import numpy as np
def get_from(start, size):
    """Read a slice of raw bytes from the enwik9 corpus file.

    Args:
        start: Byte offset in ``files/enwik9`` at which reading begins.
        size: Maximum number of bytes to read.

    Returns:
        The bytes read. The result is shorter than ``size`` (possibly
        empty) when the requested range runs past the end of the file.
    """
    # The context manager releases the file handle even if seek() or
    # read() raises.
    with open('files/enwik9', 'rb') as enwik:
        enwik.seek(start)
        return enwik.read(size)
# Read `size` bytes starting at a random offset in enwik9
def get_random_enwik(size):
    """Return ``size`` bytes read from a random offset in enwik9.

    Args:
        size: Number of bytes to read.

    Returns:
        The bytes returned by ``get_from`` for the chosen offset.
    """
    # random.randint includes both endpoints, so the upper bound is pulled
    # back by ``size``. Otherwise an offset at or near the 1e9 mark (used
    # here as the corpus length) would produce a short or empty sample.
    last_start = max(0, int(1e9) - size)
    start = random.randint(0, last_start)
    return get_from(start, size)
def get_zip_results(starts, size):
    """Average the gzip-compressed size of several enwik9 samples.

    Args:
        starts: Byte offsets, one per sample, passed to ``get_from``.
        size: Number of bytes requested for each sample.

    Returns:
        The mean length, in bytes, of the gzip output across all samples.
    """
    compressed_lengths = [
        len(gzip.compress(get_from(offset, size))) for offset in starts
    ]
    return sum(compressed_lengths) / len(compressed_lengths)
def get_model_results(model, starts, size):
    """Average a model's compression results over several enwik9 samples.

    Args:
        model: Model object handed unchanged to ``run_model``.
        starts: Byte offsets, one per sample, passed to ``get_from``.
        size: Number of bytes requested for each sample.

    Returns:
        A ``(avg_compressed_size, avg_theoretical_compression)`` tuple: the
        means of the two values ``run_model`` returns for each sample.
    """
    # Each entry is the pair returned by run_model for one sample.
    runs = [run_model(model, get_from(offset, size)) for offset in starts]
    n_runs = len(runs)
    mean_compressed = sum(compressed for compressed, _ in runs) / n_runs
    mean_theoretical = sum(theoretical for _, theoretical in runs) / n_runs
    return mean_compressed, mean_theoretical
def format_into_table(model_results):
    """Render model results as a markdown table.

    Args:
        model_results: Mapping from model name to a sequence whose first two
            items are the compressed size and the theoretical compression.

    Returns:
        The table as a string: a header row, a separator row, and one row
        per entry in iteration order, each terminated by a newline.
    """
    rows = [
        "| Model | Compressed Size | Theoretical Compression |\n",
        "| --- | --- | --- |\n",
    ]
    rows.extend(
        f"| {name} | {scores[0]} | {scores[1]} |\n"
        for name, scores in model_results.items()
    )
    return "".join(rows)
def update_readme(table, description):
    """Overwrite the "Results:" section of README.md with fresh results.

    Everything in README.md from the first occurrence of ``"Results:"`` to
    the end of the file is replaced by ``description``, a newline, and
    ``table``. If the marker is absent, the file is not rewritten.

    Args:
        table: Results table text placed after the description.
        description: Text that opens the new section.
    """
    replacement = description + '\n' + table

    with open('README.md', 'r') as readme:
        current_text = readme.read()

    # partition() splits on the first occurrence of the marker; an empty
    # separator means the marker was not found.
    preamble, marker, _ = current_text.partition("Results:")
    if not marker:
        return

    with open('README.md', 'w') as readme:
        readme.write(preamble + replacement)
if __name__ == "__main__":
    # Benchmark settings: bytes per sample, number of samples, and the
    # value ``n`` passed to each model constructor.
    size = int(1e3)
    n_tests = 3
    n = 16

    # np.random.randint excludes its upper bound, so this draws offsets in
    # [0, 1e9 - size]. Every sample then fits inside the 1e9-byte range
    # instead of occasionally being cut short near its end.
    starts = np.random.randint(0, int(1e9) - size + 1, n_tests)

    model_results = dict()

    # gzip is listed alongside the models; it has no theoretical figure.
    zip_size = get_zip_results(starts, size)
    model_results['zip'] = [zip_size, 'N/A']

    model_names = ['Ngram', 'Backoff', 'LearnedWeighted']
    models = [Ngram(n), Backoff(n), LearnedWeighted(n)]
    for model_name, model in zip(model_names, models):
        c, t = get_model_results(model, starts, size)
        model_results[model_name] = [c, t]

    table = format_into_table(model_results)
    description = f'Results: \n\n averaged on {n_tests} random samples of {size} bytes from enwik9\n'
    update_readme(table, description)