-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathRanker.java
More file actions
127 lines (101 loc) · 5.03 KB
/
Ranker.java
File metadata and controls
127 lines (101 loc) · 5.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import java.io.*;
import java.util.*;
/**
 * Ranks documents against a query using an Okapi-BM25 style score.
 *
 * <p>Postings are read from text files under {@link #postingPath}; the file is chosen
 * by the upper-cased first character of each query term. Each posting line is parsed as
 * {@code term~<ignored>&docID tf ...&docID tf ...}: the part before {@code ~} is the term,
 * the part after it is split on {@code &}, the first segment is skipped, and every other
 * segment is split on single spaces into the document id (index 0) and the term count
 * (index 1).
 *
 * <p>Per-document statistics come from {@link #docsDictionary}: index 0 of the value
 * array is read as the document's maximum term frequency and index 2 as its length.
 */
public class Ranker {
    /** Maximum number of document ids returned by {@link #rankDocsByQuery(String[])}. */
    private static final int MAX_RESULTS = 50;
    /** BM25 {@code k1} used when {@link #useStemmer} is set. */
    private static final double STEMMED_K1 = 1.5;
    /** BM25 {@code b} used when {@link #useStemmer} is set. */
    private static final double STEMMED_B = 0.2;

    Dictionary Dictionaries;
    HashMap<String, String[]> mainDictionary;
    HashMap<String, String[]> docsDictionary;
    String postingPath;
    boolean useStemmer;
    double N = 472525;      // number of documents; replaced by the constructor
    double avgdl = 137.768; // average document length; replaced by the constructor
    double k1 = 1;          // BM25 term-frequency saturation (non-stemmed runs)
    double b = 0.2;         // BM25 length-normalisation weight (non-stemmed runs)

    /**
     * Creates a ranker over the given dictionaries and posting directory.
     *
     * @param Dictionaries source of the term/document dictionaries, the document count
     *                     and the total corpus length
     * @param postingPath  directory that contains the posting files
     */
    public Ranker(Dictionary Dictionaries, String postingPath) {
        this.Dictionaries = Dictionaries;
        this.mainDictionary = Dictionaries.mainDictionary;
        this.docsDictionary = Dictionaries.docsDictionary;
        this.postingPath = postingPath;
        N = Dictionaries.numberOfDocs;
        avgdl = Dictionaries.totalLength / N;
        useStemmer = false;
    }

    /**
     * Scores every document that contains at least one query term and returns the ids
     * of the best-scoring documents, highest score first.
     *
     * <p>Scores of a document that matches several terms are summed. {@code null} and
     * empty terms are skipped because no posting file can be derived from them.
     *
     * @param query the query terms
     * @return at most {@value #MAX_RESULTS} document ids ordered by descending score
     * @throws IOException if a posting file cannot be opened or read
     */
    public ArrayList<String> rankDocsByQuery(String[] query) throws IOException {
        HashMap<String, Double> scoreDocument = new HashMap<>();
        for (String term : query) {
            if (term == null || term.isEmpty()) {
                continue;
            }
            accumulateScores(term, scoreDocument);
        }

        List<Map.Entry<String, Double>> ranked = new ArrayList<>(scoreDocument.entrySet());
        ranked.sort((left, right) -> right.getValue().compareTo(left.getValue()));

        ArrayList<String> rankDocuments = new ArrayList<>();
        for (Map.Entry<String, Double> entry : ranked) {
            if (rankDocuments.size() == MAX_RESULTS) {
                break;
            }
            rankDocuments.add(entry.getKey());
        }
        return rankDocuments;
    }

    /**
     * Computes the BM25 contribution of one term for one document.
     *
     * <p>The term frequency is normalised by the document's maximum term frequency.
     * The IDF factor is {@code ln((N - n + 0.5) / (n + 0.5))}, which is negative for a
     * term that occurs in more than half of the documents.
     *
     * @param documentDetails posting fields of the document: index 0 is the document id,
     *                        index 1 the number of times the term occurs in it
     * @param numberOfDocs    number of documents that contain the term
     * @return the BM25 score of the term for this document
     * @throws NullPointerException if the document id is not present in
     *                              {@link #docsDictionary}
     */
    public double rankBM25(String[] documentDetails, int numberOfDocs) {
        // Use local effective parameters so that scoring never rewrites the k1/b fields.
        double effectiveK1 = useStemmer ? STEMMED_K1 : k1;
        double effectiveB = useStemmer ? STEMMED_B : b;

        String docID = documentDetails[0];
        String[] docStats = docsDictionary.get(docID);
        double termTimesInThisDoc = Double.parseDouble(documentDetails[1]);
        double lengthOfTheDoc = Double.parseDouble(docStats[2]);
        double maxTF = Double.parseDouble(docStats[0]);

        double F = termTimesInThisDoc / maxTF;
        double n = numberOfDocs;
        // The 0.5 smoothing belongs inside the denominator: (N - n + 0.5) / (n + 0.5).
        double IDF = Math.log((N - n + 0.5) / (n + 0.5));
        double lengthNorm = 1 - effectiveB + effectiveB * (lengthOfTheDoc / avgdl);
        // The term frequency must appear in the numerator; otherwise the score would
        // decrease as the term becomes more frequent in the document.
        return IDF * (F * (effectiveK1 + 1)) / (F + effectiveK1 * lengthNorm);
    }

    /** Adds the BM25 score of {@code term} for each document in its posting lines to {@code scoreDocument}. */
    private void accumulateScores(String term, Map<String, Double> scoreDocument) throws IOException {
        // The posting file may store the term as given, fully upper-cased or fully lower-cased.
        String upperTerm = term.toUpperCase(Locale.ROOT);
        String lowerTerm = term.toLowerCase(Locale.ROOT);

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(postingFileFor(term))))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] termAndPostings = line.split("~");
                if (termAndPostings.length < 2) {
                    continue; // no postings after the term: nothing to score
                }
                String postedTerm = termAndPostings[0];
                boolean matches = postedTerm.equals(term)
                        || postedTerm.equals(upperTerm)
                        || postedTerm.equals(lowerTerm);
                if (!matches) {
                    continue;
                }
                String[] postings = termAndPostings[1].split("&");
                // Segment 0 (the text before the first '&') is not a document entry.
                int numberOfDocs = postings.length - 1;
                for (int j = 1; j < postings.length; j++) {
                    String[] documentDetails = postings[j].split(" ");
                    scoreDocument.merge(documentDetails[0],
                            rankBM25(documentDetails, numberOfDocs), Double::sum);
                }
            }
        }
    }

    /** Builds the path of the posting file that holds {@code term}, keyed by its first character. */
    private String postingFileFor(String term) {
        String firstLetter = term.substring(0, 1).toUpperCase(Locale.ROOT);
        String prefix = useStemmer ? "Stemmer" : "";
        return postingPath + File.separator + prefix + firstLetter + ".txt";
    }
}