|
11 | 11 |
|
12 | 12 | import math |
13 | 13 | import pickle |
14 | | -import os |
| 14 | +from pathlib import Path |
15 | 15 |
|
16 | | -data_dir = os.path.dirname(os.path.abspath(__file__)) + '/data/gibberish/' |
17 | | -model_path = data_dir + 'gib_model.pki' |
| 16 | +data_dir = Path(__file__).parent / 'data' / 'gibberish' |
| 17 | +model_path = data_dir / 'gib_model.pki' |
| 18 | +big_file_path = data_dir / 'big.txt' |
| 19 | +good_file_path = data_dir / 'good.txt' |
| 20 | +bad_file_path = data_dir / 'bad.txt' |
18 | 21 |
|
19 | 22 | accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- ' |
20 | 23 | pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)]) |
21 | 24 |
|
22 | 25 |
|
23 | 26 | class Gibberish(object): |
24 | 27 | def __init__(self): |
25 | | - self.train_if_necessary() |
26 | | - |
27 | | - def train_if_necessary(self): |
28 | | - if not os.path.isfile(model_path): |
29 | | - self.train() |
30 | | - else: |
| 28 | + if model_path.exists(): |
31 | 29 | self.load_persisted_model() |
| 30 | + else: |
| 31 | + self.train() |
32 | 32 |
|
33 | 33 | def persist_model(self): |
34 | 34 | with open(model_path, 'wb') as f: |
@@ -62,8 +62,8 @@ def avg_transition_prob(self, l, log_prob_mat): |
62 | 62 | # The exponentiation translates from log probs to probs. |
63 | 63 | return math.exp(log_prob / (transition_ct or 1)) |
64 | 64 |
|
65 | | - def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt', |
66 | | - badfile=data_dir + 'bad.txt'): |
| 65 | + def train(self, bigfile=big_file_path, goodfile=good_file_path, |
| 66 | + badfile=bad_file_path): |
67 | 67 | """ Write a simple model as a pickle file """ |
68 | 68 | k = len(accepted_chars) |
69 | 69 | # Assume we have seen 10 of each character pair. This acts as a kind of |
@@ -103,9 +103,7 @@ def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt', |
103 | 103 | self.persist_model() |
104 | 104 |
|
105 | 105 | def detect_gibberish(self, text): |
106 | | - |
107 | 106 | text = ''.join(self.normalize(text)) |
108 | | - |
109 | 107 | return self.avg_transition_prob(text, self.mat) < self.thresh |
110 | 108 |
|
111 | 109 | def percent_gibberish(self, text): |
|
0 commit comments