Handle paths with pathlib.Path #2402

JonoYang · JonoYang · commit 21b7450704ea · 2025-12-19T14:49:37.000-08:00
* Remove unnecessary tests

Signed-off-by: Jono Yang <jyang@nexb.com>
diff --git a/src/textcode/gibberish.py b/src/textcode/gibberish.py
@@ -11,24 +11,24 @@
 
 import math
 import pickle
-import os
+from pathlib import Path
 
-data_dir =  os.path.dirname(os.path.abspath(__file__)) + '/data/gibberish/'
-model_path = data_dir + 'gib_model.pki'
+data_dir = Path(__file__).parent / 'data' / 'gibberish'
+model_path = data_dir / 'gib_model.pki'
+big_file_path = data_dir / 'big.txt'
+good_file_path = data_dir / 'good.txt'
+bad_file_path = data_dir / 'bad.txt'
 
 accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
 pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])
 
 
 class Gibberish(object):
     def __init__(self):
-        self.train_if_necessary()
-
-    def train_if_necessary(self):
-        if not os.path.isfile(model_path):
-            self.train()
-        else:
+        if model_path.exists():
             self.load_persisted_model()
+        else:
+            self.train()
 
     def persist_model(self):
         with open(model_path, 'wb') as f:
@@ -62,8 +62,8 @@ def avg_transition_prob(self, l, log_prob_mat):
         # The exponentiation translates from log probs to probs.
         return math.exp(log_prob / (transition_ct or 1))
 
-    def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt',
-              badfile=data_dir + 'bad.txt'):
+    def train(self, bigfile=big_file_path, goodfile=good_file_path,
+              badfile=bad_file_path):
         """ Write a simple model as a pickle file """
         k = len(accepted_chars)
         # Assume we have seen 10 of each character pair.  This acts as a kind of
@@ -103,9 +103,7 @@ def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt',
         self.persist_model()
 
     def detect_gibberish(self, text):
-
         text = ''.join(self.normalize(text))
-
         return self.avg_transition_prob(text, self.mat) < self.thresh
 
     def percent_gibberish(self, text):
diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py
@@ -249,26 +249,6 @@ def test_is_candidate_should_not_select_line_with_junk_hex(self):
         line = prepare_text_line(line)
         assert not copyrights.is_candidate(line)
 
-    def test_is_candidate_should_select_line_with_a_trailing_years(self):
-        line = '01061C3F5280CD4AC504152B81E452BD820154 2014\n'
-        line = prepare_text_line(line)
-        assert copyrights.is_candidate(line)
-
-    def test_is_candidate_should_select_line_with_proper_years(self):
-        line = '01061C3F5280CD4AC504152B81E452BD820154 2014-'
-        line = prepare_text_line(line)
-        assert copyrights.is_candidate(line)
-
-    def test_is_candidate_should_select_line_with_proper_years2(self):
-        line = '01061C3F5280CD4,2016 152B81E452BD820154'
-        line = prepare_text_line(line)
-        assert copyrights.is_candidate(line)
-
-    def test_is_candidate_should_select_line_with_dashed_year(self):
-        line = 'pub   1024D/CCD6F801 2006-11-15'
-        line = prepare_text_line(line)
-        assert copyrights.is_candidate(line)
-
     def test_is_candidate_should_select_line_with_iso_date_year(self):
         line = 'sig 3 ccd6f801 2006-11-15 nathan mittler <nathan.mittler@gmail.com>'
         line = prepare_text_line(line)