-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathAbstractSourcedStatisticalTokenDistance.java
More file actions
73 lines (63 loc) · 2.75 KB
/
AbstractSourcedStatisticalTokenDistance.java
File metadata and controls
73 lines (63 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package com.wcohen.ss;
import java.util.*;
import com.wcohen.ss.tokens.*;
import com.wcohen.ss.api.*;
/**
 * Abstract token distance metric that uses frequency statistics.
 *
 * <p>Calling {@link #train} walks a collection of strings and records, for every
 * token, the number of strings ("documents") in which that token was seen, together
 * with the number of documents processed and the number of tokens iterated over.
 * These statistics are exposed to subclasses through protected fields and to other
 * callers through {@link #getDocumentFrequency}.
 */
public abstract class AbstractSourcedStatisticalTokenDistance extends AbstractSourcedTokenizedStringDistance
{
    // Upper bound on the number of "not yet trained" warnings printed per instance.
    private static final int MAX_WARNINGS = 10;

    // maps tokens (Token) to document frequency (Integer); kept as a raw Map so that
    // existing subclasses that read or write it directly keep compiling unchanged
    protected Map documentFrequency = new HashMap();
    // count number of documents
    protected int collectionSize = 0;
    // count number of tokens (incremented once per element yielded by a bag's token iterator)
    protected int totalTokenCount = 0;
    // count warnings already printed by checkTrainingHasHappened; never exceeds MAX_WARNINGS
    private int warningCounter = 0;

    /**
     * Creates a distance that uses the given tokenizer.
     *
     * @param tokenizer tokenizer handed to the superclass constructor
     */
    public AbstractSourcedStatisticalTokenDistance(SourcedTokenizer tokenizer) { super(tokenizer); }

    /** Creates a distance using the superclass's no-argument constructor. */
    public AbstractSourcedStatisticalTokenDistance() { super(); }

    /**
     * Accumulate statistics on how often each token value occurs.
     *
     * <p>Statistics are added to whatever has been accumulated so far; calling this
     * method again does not reset the counters.
     *
     * @param i0 iterator over the training strings; it is cast to
     *           {@code SourcedStringWrapperIterator}, so any other implementation
     *           causes a {@code ClassCastException}
     */
    public void train(StringWrapperIterator i0)
    {
        SourcedStringWrapperIterator i = (SourcedStringWrapperIterator)i0;
        // tokens already counted for the current document; reused across documents
        Set seenTokens = new HashSet();
        while (i.hasNext()) {
            BagOfSourcedTokens bag = asBagOfSourcedTokens(i.nextSourcedStringWrapper());
            seenTokens.clear();
            for (Iterator j=bag.tokenIterator(); j.hasNext(); ) {
                totalTokenCount++;
                Token tokj = (Token)j.next();
                // Set.add returns true only the first time a token is added, so each
                // token contributes at most one to its document frequency per document
                if (seenTokens.add(tokj)) {
                    Integer df = (Integer)documentFrequency.get(tokj);
                    int updated = (df==null) ? 1 : df.intValue()+1;
                    // Integer.valueOf shares instances for small values, which covers
                    // the space saving the hand-made ONE/TWO/THREE constants used to give
                    documentFrequency.put(tokj, Integer.valueOf(updated));
                }
            }
            collectionSize++;
        }
    }

    /**
     * Prints a warning to standard output if no document has been trained on yet.
     *
     * <p>At most {@code MAX_WARNINGS} warnings are printed per instance; the last one
     * is followed by a note saying that no further warnings will appear.
     *
     * @param s first string of the comparison, used only in the warning text
     * @param t second string of the comparison, used only in the warning text
     */
    protected void checkTrainingHasHappened(StringWrapper s, StringWrapper t)
    {
        // Test the cap before incrementing so the counter stops at MAX_WARNINGS
        // instead of growing on every call and eventually wrapping around.
        if (collectionSize==0 && warningCounter<MAX_WARNINGS) {
            warningCounter++;
            System.out.println("Warning: "+this.getClass()+" not yet trained for sim('"+s+"','"+t+"')");
            if (warningCounter == MAX_WARNINGS) {
                System.out.println("(By the way, that's the last warning you'll get about this.)");
            }
        }
    }

    /**
     * Looks up the document frequency recorded for a token.
     *
     * @param tok token to look up
     * @return the value stored for {@code tok} in the document-frequency map,
     *         or 0 if the map has no entry for it
     */
    public int getDocumentFrequency(Token tok) {
        Integer freqInteger = (Integer)documentFrequency.get(tok);
        return (freqInteger==null) ? 0 : freqInteger.intValue();
    }
}