Use 2-char markov chain gibberish detector #2402

JonoYang · JonoYang · commit 7e6317ff0ec7 · 2025-12-19T14:49:37.000-08:00
Signed-off-by: Jono Yang <jyang@nexb.com>
diff --git a/setup.cfg b/setup.cfg
@@ -117,7 +117,6 @@ install_requires =
     typecode[full] >= 30.0.1
     extractcode[full] >= 31.0.0
     cyseq >= 0.0.2
-    nostril-detector
 
 
 [options.packages.find]
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
@@ -17,10 +17,10 @@
 from time import time
 
 import attr
-from nostril_detector.nonsense_detector import generate_nonsense_detector
 
 from commoncode.text import toascii
 from commoncode.text import unixlinesep
+from textcode.gibberish import Gibberish
 from pygmars import lex
 from pygmars import parse
 from pygmars import Token
@@ -62,9 +62,6 @@ def logger_debug(*args):
     logger_debug = print
 
 
-nonsense = generate_nonsense_detector(min_length=1)
-
-
 """
 Detect and collect copyright statements.
 
@@ -4271,6 +4268,8 @@ def strip_balanced_edge_parens(s):
 
 is_only_digit_and_punct = re.compile('^[^A-Za-z]+$').match
 
+gibberish_detector = Gibberish()
+
 
 def is_candidate(prepared_line):
     """
@@ -4288,10 +4287,9 @@ def is_candidate(prepared_line):
 
         return False
 
-    if nonsense(prepared_line):
+    if gibberish_detector.detect_gibberish(prepared_line):
         if TRACE:
-            logger_debug(f'is_candidate: nonsense:\n{prepared_line!r}')
-
+            logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}')
         return False
 
     if copyrights_hint.years(prepared_line):
diff --git a/src/textcode/data/gibberish/bad.txt b/src/textcode/data/gibberish/bad.txt
@@ -0,0 +1,5 @@
+zxcvwerjasc
+nmnjcviburili,<>
+zxcvnadtruqe
+ertrjiloifdfyyoiu
+grty iuewdiivjh
diff --git a/src/textcode/data/gibberish/big.txt b/src/textcode/data/gibberish/big.txt
diff --git a/src/textcode/data/gibberish/gib_model.pki b/src/textcode/data/gibberish/gib_model.pki
diff --git a/src/textcode/data/gibberish/good.txt b/src/textcode/data/gibberish/good.txt
@@ -0,0 +1,100 @@
+Copyright (c) All Rights Reserved. Hair Plus Trading Co., Inc.
+South Baylo University Copyright (c) All Right Reserved.
+Created by shazron on 11-06-15. Copyright 2011 . All rights reserved.
+Copyright (c) All Rights Reserved 2014-2019 New Avenue Foundation.
+'Copyright 2017 AllThingsTalk'
+Copyright (C) All Rights Are Reserved. Chungjungwon​. Iotacoffee.Com 2011
+copyright(c) All rights reserved localism,Inc.
+Crown Copyright C All rights reserved.
+copyright(c) All rights reserved istyle Inc.
+[assembly: AssemblyCopyright(""Copyright ©  2013"")]
+<span>Copyright (C) All Rights Reserved </span>   <span>2007-2020版权所有: 镇江日报社  </span>
+Copyright (c) - All Rights Reserved - PROAIM Medical.
+Copyright (c), ALL Consulting, 2008
+Created by Samvel Khalatyan, May 28, 2013 Copyright 2013, All rights reserved
+Iotacoffee.Com 2011 Copyright (C) All Rights Are Reserved.
+Copyright (C) All Rights Reserved, Lei Connection Inc.
+Copyright(c) All Saints Episcopal Church, Fort Worth, 2011, church based at 3290 Lackland Road,, Fort Worth, TX 76116
+*  Created by claudio beatrice on 2/21/10. Copyright 2010. All rights reserved.
+Copyright(c) All rights reserved by Minds, Japan Council for Quality Health Care.
+Copyright (C) All Rights Reserved by Leh. www.leh.jp
+Copyright (C) All rights Reserved by 株式会社　朝日住宅社
+/*  For iOS video I/O
+*  by Eduard Feicho on 29/07/12
+*  Copyright 2012. All rights reserved.
+// Copyright (c) 2002-2010, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC
+//
+// All rights reserved.
+Copyright (c) 2006, Industrial Light & Magic, a division of Lucasfilm
+Entertainment Company Ltd.  Portions contributed and copyright held by
+others as indicated.  All rights reserved.
+copyright__ = 'Copyright 2017 AllThingsTalk'
+Copyright EAVISE
+UCL are copyrighted software distributed
+Foursquare © 2019
+Copyright (C) 2019, by Djilani CARDINEAU.
+# Copyright michimani All rights reserved.
+Copyright(c) All Rights Reserved by Chinese Service Center for Scholarly Exchange
+Copyright(c) All right reserved SSC. Ltd.
+Third party copyrights are property of their respective owners.
+Copyright (c) All Rights Reserved by the District Export Council of Georgia.
+//COPYRIGHT
+//
+//All contributions by the University of California:
+//Copyright (c) 2014, The Regents of the University of California (Regents)
+//All rights reserved.
+//
+//All other contributions:
+//Copyright (c) 2014, the respective contributors
+//All rights reserved.
+//
+//Caffe uses a shared copyright model: each contributor holds copyright over
+//their contributions to Caffe. The project versioning records all such
+//contribution and copyright details. If a contributor wants to further mark
+//their specific copyright on a particular contribution, they should indicate
+//their copyright solely in the commit message of the change when it is
+//committed.
+//
+//LICENSE
+Copyright (C) 2013 Opensim Ltd.
+#COPYRIGHT
+#
+#All contributions by the University of California:
+#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
+#All rights reserved.
+#
+#All other contributions:
+#Copyright (c) 2014, 2015, the respective contributors
+#All rights reserved.
+LICENSE: Copyright 2016, All Rights Reserved
+(a)Download original face detection dataset -> (b)Convert annotation to the PASCAL VOC format -> (c)Create LMDB database with images + annotations for training
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+Copyright (C), 2001-2011, Acme Tech. Co. Ltd.
+* libtiff/{tif_dirinfo.c, tif_dir.h, tif_dir.c, tif_print.c}: Make
+DocumentName, Artist, HostComputer, ImageDescription, Make, Model,
+Copyright, DateTime, PageName, TextureFormat, TextureWrapModes and
+TargetPrinter tags custom.
+COPYRIGHT (C) All About, Inc. All Rights Reserved.
+Copyright 2019, All Rights Reserved. # Author: Pine <cdtsgsz@gmail.com>
+*  For iOS video I/O
+*  by Eduard Feicho on 29/07/12
+*  by Alexander Shishkov on 17/07/13
+*  Copyright 2012. All rights reserved.
+COPYRIGHT(C) ALL JAPAN PRO-WRESTLING Co., Ltd.
+:copyright: Copyright (c) Joe Joyce and contributors, 2016-2019.
+Copyright 2014 uh-sem-blee, Co.
+Copyright (c) 2016 the Authors
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+*  For iOS video I/O
+*  by Xiaochao Yang on 06/15/11 modified from
+*  cap_qtkit.mm for Nicholas Butko for Mac OS version.
+*  Copyright 2011. All rights reserved.
+Copyright (c) All the Raige Dog Salon. All Rights Reserved.
+[assembly: AssemblyCopyright(""Copyright ©  2014"")]
+<a href="http://www.enox.biz/">Copyright (C) All rights Reserved by 株式会社エノックス</a>
+2008 Nuance Communications
+Copyright 2008 TJ
+Scilab (c)INRIA-ENPC
+Copyright (c) 2006, FUJITA Yuji
diff --git a/src/textcode/gibberish.py b/src/textcode/gibberish.py
@@ -0,0 +1,127 @@
+#!/usr/bin/python
+#
+# From: https://raw.githubusercontent.com/yapus/gibberish/01637fe1fda827529ca76b8d6fee2de9100719f1/gibberish/gibberish.py
+#
+# 12Jun2017 Petr Janata - added srcfile and outfile
+# 17Jun2017 Petr Janata - expanded set of accepted characters to include digits and hyphen
+#
+# which is based on:
+# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
+#
+
+import math
+import pickle
+import os
+
+# Directory holding the training corpora and the persisted model, located
+# under this module's own directory.
+data_dir =  os.path.dirname(os.path.abspath(__file__)) + '/data/gibberish/'
+# Pickled model file read and written by the Gibberish class.
+model_path = data_dir + 'gib_model.pki'
+
+# Only these characters are modeled; input is lowercased and every other
+# character is dropped before scoring.
+accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
+# Map each accepted character to its row/column index in the transition matrix.
+pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])
+
+
+class Gibberish(object):
+    def __init__(self):
+        self.train_if_necessary()
+
+    def train_if_necessary(self):
+        if not os.path.isfile(model_path):
+            self.train()
+        else:
+            self.load_persisted_model()
+
+    def persist_model(self):
+        with open(model_path, 'wb') as f:
+            pickle.dump(vars(self), f)
+
+    def load_persisted_model(self):
+        with open(model_path, 'rb') as f:
+            persisted_model = pickle.load(f)
+            for key, value in persisted_model.items():
+                setattr(self, key, value)
+
+    def normalize(self, line):
+        """ Return only the subset of chars from accepted_chars.
+        This helps keep the  model relatively small by ignoring punctuation,
+        infrequenty symbols, etc. """
+        return [c.lower() for c in line if c.lower() in accepted_chars]
+
+    def ngram(self, n, l):
+        """ Return all n grams from l after normalizing """
+        filtered = self.normalize(l)
+        for start in range(0, len(filtered) - n + 1):
+            yield ''.join(filtered[start:start + n])
+
+    def avg_transition_prob(self, l, log_prob_mat):
+        """ Return the average transition prob from l through log_prob_mat. """
+        log_prob = 0.0
+        transition_ct = 0
+        for a, b in self.ngram(2, l):
+            log_prob += log_prob_mat[pos[a]][pos[b]]
+            transition_ct += 1
+        # The exponentiation translates from log probs to probs.
+        return math.exp(log_prob / (transition_ct or 1))
+
+    def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt',
+              badfile=data_dir + 'bad.txt'):
+        """ Write a simple model as a pickle file """
+        k = len(accepted_chars)
+        # Assume we have seen 10 of each character pair.  This acts as a kind of
+        # prior or smoothing factor.  This way, if we see a character transition
+        # live that we've never observed in the past, we won't assume the entire
+        # string has 0 probability.
+        counts = [[10 for i in range(k)] for i in range(k)]
+
+        # Count transitions from big text file, taken
+        # from http://norvig.com/spell-correct.html
+        for line in open(bigfile):
+            for a, b in self.ngram(2, line):
+                counts[pos[a]][pos[b]] += 1
+
+        # Normalize the counts so that they become log probabilities.
+        # We use log probabilities rather than straight probabilities to avoid
+        # numeric underflow issues with long texts.
+        # This contains a justification:
+        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
+        for i, row in enumerate(counts):
+            s = float(sum(row))
+            for j in range(len(row)):
+                row[j] = math.log(row[j] / s)
+
+        # Find the probability of generating a few arbitrarily choosen good and
+        # bad phrases.
+        good_probs = [self.avg_transition_prob(l, counts) for l in open(goodfile)]
+        bad_probs = [self.avg_transition_prob(l, counts) for l in open(badfile)]
+
+        # Assert that we actually are capable of detecting the junk.
+        assert min(good_probs) > max(bad_probs)
+
+        # And pick a threshold halfway between the worst good and best bad inputs.
+        thresh = (min(good_probs) + max(bad_probs)) / 2
+        self.mat = counts
+        self.thresh = thresh
+        self.persist_model()
+
+    def detect_gibberish(self, text):
+
+        text = ''.join(self.normalize(text))
+
+        return self.avg_transition_prob(text, self.mat) < self.thresh
+
+    def percent_gibberish(self, text):
+        text = ''.join(self.normalize(text))
+        text = text.strip()
+        words = text.split(' ')
+        if len(words) == 0:
+            return 0
+
+        gibberish_count = 0
+        for word in words:
+            if self.detect_gibberish(word):
+                gibberish_count += 1
+
+        return float(gibberish_count) / float(len(words))
+
+    def gibberish_pct(self, text):
+        text = ''.join(self.normalize(text))
+        return self.avg_transition_prob(text, self.mat)
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-100.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-100.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) (c) 2AICAA3SSY
-holders:
-  - 2AICAA3SSY
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-118.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-118.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - U1e (c) IjAx
-holders:
-  - U1e IjAx
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-130.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-130.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - Xz eaaeuyATNRU (c) Ijr
-holders:
-  - Xz eaaeuyATNRU Ijr
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-178.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-178.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) cc.fr
-holders:
-  - cc.fr
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-195.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-195.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) Oo2 UOY
-holders:
-  - Oo2 UOY
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-280.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-280.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - I. (c) Uao
-holders:
-  - I. Uao
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-291.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-291.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) UOSSOO-O (c)
-holders:
-  - UOSSOO-O
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-352.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-352.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) Cj d Dj
-holders:
-  - Cj d Dj
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-359.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-359.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) ,33 ,BD(b.Xb(c+1),33) d d BVc
-holders:
-  - BD(b.Xb(c+1),33) d d BVc
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-360.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-360.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) q ltd
-holders:
-  - q ltd
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-54.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-54.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - "(c) .'2\x14'OTh'q\x04\x19deg^ A1 Co"
-holders:
-  - "2\x14'OTh'q\x04\x19deg^ A1 Co"
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-65.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-65.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) (c) SS
-holders:
-  - SS
diff --git a/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-87.txt.yml b/tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-87.txt.yml
@@ -2,7 +2,3 @@ what:
   - copyrights
   - holders
   - authors
-copyrights:
-  - (c) (c) Y
-holders:
-  - Y