Skip to content

Commit 7e6317f

Browse files
committed
Use 2-char markov chain gibberish detector #2402
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent 1e9b432 commit 7e6317f

20 files changed

+128694
-60
lines changed

setup.cfg

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ install_requires =
117117
typecode[full] >= 30.0.1
118118
extractcode[full] >= 31.0.0
119119
cyseq >= 0.0.2
120-
nostril-detector
121120

122121

123122
[options.packages.find]

src/cluecode/copyrights.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@
1717
from time import time
1818

1919
import attr
20-
from nostril_detector.nonsense_detector import generate_nonsense_detector
2120

2221
from commoncode.text import toascii
2322
from commoncode.text import unixlinesep
23+
from textcode.gibberish import Gibberish
2424
from pygmars import lex
2525
from pygmars import parse
2626
from pygmars import Token
@@ -62,9 +62,6 @@ def logger_debug(*args):
6262
logger_debug = print
6363

6464

65-
nonsense = generate_nonsense_detector(min_length=1)
66-
67-
6865
"""
6966
Detect and collect copyright statements.
7067
@@ -4271,6 +4268,8 @@ def strip_balanced_edge_parens(s):
42714268

42724269
is_only_digit_and_punct = re.compile('^[^A-Za-z]+$').match
42734270

4271+
gibberish_detector = Gibberish()
4272+
42744273

42754274
def is_candidate(prepared_line):
42764275
"""
@@ -4288,10 +4287,9 @@ def is_candidate(prepared_line):
42884287

42894288
return False
42904289

4291-
if nonsense(prepared_line):
4290+
if gibberish_detector.detect_gibberish(prepared_line):
42924291
if TRACE:
4293-
logger_debug(f'is_candidate: nonsense:\n{prepared_line!r}')
4294-
4292+
logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}')
42954293
return False
42964294

42974295
if copyrights_hint.years(prepared_line):
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
zxcvwerjasc
2+
nmnjcviburili,<>
3+
zxcvnadtruqe
4+
ertrjiloifdfyyoiu
5+
grty iuewdiivjh

src/textcode/data/gibberish/big.txt

Lines changed: 128457 additions & 0 deletions
Large diffs are not rendered by default.
12.9 KB
Binary file not shown.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
Copyright (c) All Rights Reserved. Hair Plus Trading Co., Inc.
2+
South Baylo University Copyright (c) All Right Reserved.
3+
Created by shazron on 11-06-15. Copyright 2011 . All rights reserved.
4+
Copyright (c) All Rights Reserved 2014-2019 New Avenue Foundation.
5+
'Copyright 2017 AllThingsTalk'
6+
Copyright (C) All Rights Are Reserved. Chungjungwon​. Iotacoffee.Com 2011
7+
copyright(c) All rights reserved localism,Inc.
8+
Crown Copyright C All rights reserved.
9+
copyright(c) All rights reserved istyle Inc.
10+
[assembly: AssemblyCopyright(""Copyright © 2013"")]
11+
<span>Copyright (C) All Rights Reserved </span> <span>2007-2020版权所有: 镇江日报社 </span>
12+
Copyright (c) - All Rights Reserved - PROAIM Medical.
13+
Copyright (c), ALL Consulting, 2008
14+
Created by Samvel Khalatyan, May 28, 2013 Copyright 2013, All rights reserved
15+
Iotacoffee.Com 2011 Copyright (C) All Rights Are Reserved.
16+
Copyright (C) All Rights Reserved, Lei Connection Inc.
17+
Copyright(c) All Saints Episcopal Church, Fort Worth, 2011, church based at 3290 Lackland Road,, Fort Worth, TX 76116
18+
* Created by claudio beatrice on 2/21/10. Copyright 2010. All rights reserved.
19+
Copyright(c) All rights reserved by Minds, Japan Council for Quality Health Care.
20+
Copyright (C) All Rights Reserved by Leh. www.leh.jp
21+
Copyright (C) All rights Reserved by 株式会社 朝日住宅社
22+
/* For iOS video I/O
23+
* by Eduard Feicho on 29/07/12
24+
* Copyright 2012. All rights reserved.
25+
// Copyright (c) 2002-2010, Industrial Light & Magic, a division of Lucas
26+
// Digital Ltd. LLC
27+
//
28+
// All rights reserved.
29+
Copyright (c) 2006, Industrial Light & Magic, a division of Lucasfilm
30+
Entertainment Company Ltd. Portions contributed and copyright held by
31+
others as indicated. All rights reserved.
32+
copyright__ = 'Copyright 2017 AllThingsTalk'
33+
Copyright EAVISE
34+
UCL are copyrighted software distributed
35+
Foursquare © 2019
36+
Copyright (C) 2019, by Djilani CARDINEAU.
37+
# Copyright michimani All rights reserved.
38+
Copyright(c) All Rights Reserved by Chinese Service Center for Scholarly Exchange
39+
Copyright(c) All right reserved SSC. Ltd.
40+
Third party copyrights are property of their respective owners.
41+
Copyright (c) All Rights Reserved by the District Export Council of Georgia.
42+
//COPYRIGHT
43+
//
44+
//All contributions by the University of California:
45+
//Copyright (c) 2014, The Regents of the University of California (Regents)
46+
//All rights reserved.
47+
//
48+
//All other contributions:
49+
//Copyright (c) 2014, the respective contributors
50+
//All rights reserved.
51+
//
52+
//Caffe uses a shared copyright model: each contributor holds copyright over
53+
//their contributions to Caffe. The project versioning records all such
54+
//contribution and copyright details. If a contributor wants to further mark
55+
//their specific copyright on a particular contribution, they should indicate
56+
//their copyright solely in the commit message of the change when it is
57+
//committed.
58+
//
59+
//LICENSE
60+
Copyright (C) 2013 Opensim Ltd.
61+
#COPYRIGHT
62+
#
63+
#All contributions by the University of California:
64+
#Copyright (c) 2014, 2015, The Regents of the University of California (Regents)
65+
#All rights reserved.
66+
#
67+
#All other contributions:
68+
#Copyright (c) 2014, 2015, the respective contributors
69+
#All rights reserved.
70+
LICENSE: Copyright 2016, All Rights Reserved
71+
(a)Download original face detection dataset -> (b)Convert annotation to the PASCAL VOC format -> (c)Create LMDB database with images + annotations for training
72+
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
73+
Copyright (C), 2001-2011, Acme Tech. Co. Ltd.
74+
* libtiff/{tif_dirinfo.c, tif_dir.h, tif_dir.c, tif_print.c}: Make
75+
DocumentName, Artist, HostComputer, ImageDescription, Make, Model,
76+
Copyright, DateTime, PageName, TextureFormat, TextureWrapModes and
77+
TargetPrinter tags custom.
78+
COPYRIGHT (C) All About, Inc. All Rights Reserved.
79+
Copyright 2019, All Rights Reserved. # Author: Pine <cdtsgsz@gmail.com>
80+
* For iOS video I/O
81+
* by Eduard Feicho on 29/07/12
82+
* by Alexander Shishkov on 17/07/13
83+
* Copyright 2012. All rights reserved.
84+
COPYRIGHT(C) ALL JAPAN PRO-WRESTLING Co., Ltd.
85+
:copyright: Copyright (c) Joe Joyce and contributors, 2016-2019.
86+
Copyright 2014 uh-sem-blee, Co.
87+
Copyright (c) 2016 the Authors
88+
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
89+
// Third party copyrights are property of their respective owners.
90+
* For iOS video I/O
91+
* by Xiaochao Yang on 06/15/11 modified from
92+
* cap_qtkit.mm for Nicholas Butko for Mac OS version.
93+
* Copyright 2011. All rights reserved.
94+
Copyright (c) All the Raige Dog Salon. All Rights Reserved.
95+
[assembly: AssemblyCopyright(""Copyright © 2014"")]
96+
<a href="http://www.enox.biz/">Copyright (C) All rights Reserved by 株式会社エノックス</a>
97+
2008 Nuance Communications
98+
Copyright 2008 TJ
99+
Scilab (c)INRIA-ENPC
100+
Copyright (c) 2006, FUJITA Yuji

src/textcode/gibberish.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#!/usr/bin/python
2+
#
3+
# From: https://raw.githubusercontent.com/yapus/gibberish/01637fe1fda827529ca76b8d6fee2de9100719f1/gibberish/gibberish.py
4+
#
5+
# 12Jun2017 Petr Janata - added srcfile and outfile
6+
# 17Jun2017 Petr Janata - expanded set of accepted characters to include digits and hyphen
7+
#
8+
# which is based off of:
9+
# https://raw.githubusercontent.com/rrenaud/Gibberish-Detector/aa1d4e4555362b3dada97ebe6ecc23a84fc470fe/gib_detect_train.py
10+
#
11+
12+
import math
13+
import pickle
14+
import os
15+
16+
# Directory holding the training corpus, the good/bad sample files and the
# pickled model. The trailing slash is kept because paths below are built by
# plain string concatenation.
data_dir = os.path.dirname(os.path.abspath(__file__)) + '/data/gibberish/'
model_path = data_dir + 'gib_model.pki'

# Characters the model knows about; every other character is dropped by
# Gibberish.normalize().
accepted_chars = 'abcdefghijklmnopqrstuvwxyz0123456789- '
# Map each accepted character to its row/column index in the transition matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}


class Gibberish(object):
    """
    Gibberish detector based on a 2-character (bigram) Markov chain.

    The model is a matrix of log probabilities of going from one accepted
    character to the next (``self.mat``) and a threshold (``self.thresh``).
    A text whose average transition probability falls below the threshold is
    reported as gibberish.
    """

    def __init__(self):
        self.train_if_necessary()

    def train_if_necessary(self):
        """
        Load the persisted model at ``model_path`` if that file exists,
        otherwise train a new model (which also persists it).
        """
        if not os.path.isfile(model_path):
            self.train()
        else:
            self.load_persisted_model()

    def persist_model(self):
        """
        Pickle all the instance attributes to ``model_path``.
        """
        with open(model_path, 'wb') as f:
            pickle.dump(vars(self), f)

    def load_persisted_model(self):
        """
        Restore the instance attributes from the pickle at ``model_path``.
        """
        # NOTE: pickle can execute arbitrary code when loading. The model file
        # must only ever come from a trusted location.
        with open(model_path, 'rb') as f:
            persisted_model = pickle.load(f)
        for key, value in persisted_model.items():
            setattr(self, key, value)

    def normalize(self, line):
        """
        Return a list of the lowercased characters of ``line`` that are in
        ``accepted_chars``.

        This helps keep the model relatively small by ignoring punctuation,
        infrequent symbols, etc.
        """
        # Lowercase one character at a time (not the whole string at once) so
        # that each input character maps to exactly one candidate.
        return [c for c in map(str.lower, line) if c in accepted_chars]

    def ngram(self, n, l):
        """
        Yield all the ``n``-grams of ``l``, as strings, after normalizing.
        Yield nothing if the normalized text is shorter than ``n``.
        """
        filtered = self.normalize(l)
        for start in range(0, len(filtered) - n + 1):
            yield ''.join(filtered[start:start + n])

    def avg_transition_prob(self, l, log_prob_mat):
        """
        Return the average transition probability of the text ``l`` through
        the ``log_prob_mat`` matrix of log probabilities.

        Return 1.0 when ``l`` has no transition at all, that is when fewer
        than two accepted characters remain after normalization.
        """
        log_prob = 0.0
        transition_ct = 0
        for a, b in self.ngram(2, l):
            log_prob += log_prob_mat[pos[a]][pos[b]]
            transition_ct += 1
        # The exponentiation translates from log probs to probs.
        # "or 1" avoids a division by zero when there is no transition.
        return math.exp(log_prob / (transition_ct or 1))

    def train(self, bigfile=data_dir + 'big.txt', goodfile=data_dir + 'good.txt',
              badfile=data_dir + 'bad.txt'):
        """
        Train a model and write it as a pickle file.

        ``bigfile`` is the training corpus. ``goodfile`` and ``badfile``
        contain one sample per line of acceptable and gibberish text and are
        used to compute the detection threshold.

        Raise an AssertionError if the trained model cannot tell every good
        sample from every bad sample.
        """
        k = len(accepted_chars)
        # Assume we have seen 10 of each character pair. This acts as a kind of
        # prior or smoothing factor. This way, if we see a character transition
        # live that we've never observed in the past, we won't assume the entire
        # string has 0 probability.
        counts = [[10] * k for _ in range(k)]

        # Count transitions from big text file, taken
        # from http://norvig.com/spell-correct.html
        # NOTE(review): files are read with the platform default encoding, as
        # before; confirm whether an explicit encoding should be used here.
        with open(bigfile) as big:
            for line in big:
                for a, b in self.ngram(2, line):
                    counts[pos[a]][pos[b]] += 1

        # Normalize the counts so that they become log probabilities.
        # We use log probabilities rather than straight probabilities to avoid
        # numeric underflow issues with long texts.
        # This contains a justification:
        # http://squarecog.wordpress.com/2009/01/10/dealing-with-underflow-in-joint-probability-calculations/
        for row in counts:
            total = float(sum(row))
            for j, count in enumerate(row):
                row[j] = math.log(count / total)

        # Find the probability of generating a few arbitrarily chosen good and
        # bad phrases.
        with open(goodfile) as good:
            good_probs = [self.avg_transition_prob(l, counts) for l in good]
        with open(badfile) as bad:
            bad_probs = [self.avg_transition_prob(l, counts) for l in bad]

        worst_good = min(good_probs)
        best_bad = max(bad_probs)

        # Check that we actually are capable of detecting the junk. This is an
        # explicit raise rather than an assert statement so that the check is
        # not stripped when Python runs with -O.
        if not (worst_good > best_bad):
            raise AssertionError(
                'Gibberish model cannot separate good from bad samples: '
                'min(good)=%r <= max(bad)=%r' % (worst_good, best_bad)
            )

        # And pick a threshold halfway between the worst good and best bad inputs.
        self.mat = counts
        self.thresh = (worst_good + best_bad) / 2
        self.persist_model()

    def detect_gibberish(self, text):
        """
        Return True if ``text`` is detected as gibberish, that is when its
        average transition probability is below the trained threshold.
        """
        # avg_transition_prob() normalizes the text itself.
        return self.avg_transition_prob(text, self.mat) < self.thresh

    def percent_gibberish(self, text):
        """
        Return the fraction (between 0.0 and 1.0) of the space-separated words
        of the normalized ``text`` that are detected as gibberish.
        Return 0.0 if there is no word.
        """
        # split() without argument drops the empty strings that leading,
        # trailing or repeated spaces would otherwise yield, so that they are
        # not counted as words.
        words = ''.join(self.normalize(text)).split()
        if not words:
            return 0.0

        gibberish_count = sum(1 for word in words if self.detect_gibberish(word))
        return float(gibberish_count) / float(len(words))

    def gibberish_pct(self, text):
        """
        Return the average transition probability of ``text`` using the
        trained model. Despite the name, this is a probability between 0 and 1
        where lower values are more gibberish-like, not a percentage.
        """
        return self.avg_transition_prob(text, self.mat)

tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-100.txt.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,3 @@ what:
22
- copyrights
33
- holders
44
- authors
5-
copyrights:
6-
- (c) (c) 2AICAA3SSY
7-
holders:
8-
- 2AICAA3SSY

tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-118.txt.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,3 @@ what:
22
- copyrights
33
- holders
44
- authors
5-
copyrights:
6-
- U1e (c) IjAx
7-
holders:
8-
- U1e IjAx

tests/cluecode/data/copyrights/misco4/to_improve/junk-copyright-130.txt.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,3 @@ what:
22
- copyrights
33
- holders
44
- authors
5-
copyrights:
6-
- Xz eaaeuyATNRU (c) Ijr
7-
holders:
8-
- Xz eaaeuyATNRU Ijr

0 commit comments

Comments
 (0)