-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIndexer.java
More file actions
444 lines (386 loc) · 20 KB
/
Indexer.java
File metadata and controls
444 lines (386 loc) · 20 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
import javax.print.attribute.standard.QueuedJobCount;
import javax.swing.*;
import java.io.*;
import java.util.*;
import static javafx.scene.input.KeyCode.Q;
public class Indexer {
// When true, termToIndex stems every word and all output files/directories get a "Stemmer" prefix.
// Not assigned anywhere in this class (defaults to false) - presumably set by the caller.
boolean useStemmer;
// Stemmer instance created in the constructor; only consulted when useStemmer is true.
Stemmer stemmer;
// Corpus-wide dictionary: term -> String[3]. It is never cleared by this class.
HashMap<String, String []> mainDictionary; //0 - pointer, 1 - counter for number of docs, 2 - counter for number of times in the corpus
// Temporary (per batch) dictionary: term -> Term holding the positions per document.
// Emptied by sendToPostingFiles after it has been written to disk.
HashMap<String, Term> dictionary;
// Per batch document table: document id -> Document. Emptied by sendDocsDictionaryToPostingFiles.
HashMap<String, Document> docsDictionary;
// Sequence number used to name the next temporary posting file; incremented by sendToPostingFiles.
int postingFileNameCounter;
// Base output directory for every file written by this class.
// Not assigned anywhere in this class - presumably set by the caller before indexing starts.
String path;
// Running count of documents written so far by sendDocsDictionaryToPostingFiles.
int numberOfDocs;
// Running sum of Document.length over all documents written so far.
int totalLength ;
/**
 * Creates an indexer with empty dictionaries and zeroed counters.
 * <p>
 * {@code useStemmer} is deliberately left untouched here, so it keeps its default
 * value ({@code false}) until some other code changes it. {@code path} and
 * {@code postingFileNameCounter} are not initialised here either.
 *
 * @throws FileNotFoundException declared by the original signature; kept so existing
 *                               callers that catch it continue to compile
 */
public Indexer() throws FileNotFoundException {
    // counters start from scratch
    this.totalLength = 0;
    this.numberOfDocs = 0;

    // all three lookup tables start empty
    this.docsDictionary = new HashMap<>();
    this.dictionary = new HashMap<>();
    this.mainDictionary = new HashMap<>();

    this.stemmer = new Stemmer();
}
/**
 * Adds one term occurrence to the index structures.
 * <ol>
 *   <li>Stems the word when {@code useStemmer} is set.</li>
 *   <li>For non-entity terms, normalises the letter case via {@link #lowerUpperCase(Term)}.</li>
 *   <li>Updates the per-document statistics in {@code docsDictionary}.</li>
 *   <li>Updates the corpus-wide {@code mainDictionary} and the per-batch {@code dictionary}.</li>
 * </ol>
 *
 * @param term the occurrence to index; its {@code word} (and {@code pointerToPosting},
 *             {@code docsIdIndexesMap}) may be modified by this call
 * @throws IOException declared by the original signature
 */
public void termToIndex (Term term) throws IOException {
    if (useStemmer) {
        term.word = stemmer.stem(term.word);
    }
    if (!term.isEntity) {
        lowerUpperCase(term);
    }
    recordTermInDocument(term);
    recordTermInDictionaries(term);
}

/**
 * Updates the {@code Document} entry of the term's document: its length and its
 * per-term (and, for entities, per-entity) occurrence counters.
 */
private void recordTermInDocument(Term term) {
    Document doc = docsDictionary.get(term.docID);
    if (doc == null) {
        // First term seen for this document.
        // NOTE(review): the length is not incremented for this first term (same as before);
        // whether that is correct depends on the initial value set by Document's constructor.
        doc = new Document(term.docID);
        docsDictionary.put(term.docID, doc);
    } else {
        doc.length = doc.length + 1;
    }

    if (term.isEntity) {
        // merge() also covers a word that is already counted in 'terms' but was never
        // recorded as an entity; the previous get(...)+1 threw a NullPointerException there.
        doc.terms.merge(term.word, 1, Integer::sum);
        doc.entities.merge(term.word, 1, Integer::sum);
        return;
    }

    // Non-entity: count the occurrence under whichever case variant the document already has.
    String upper = term.word.toUpperCase();
    String lower = term.word.toLowerCase();
    if (doc.terms.containsKey(upper)) {
        doc.terms.merge(upper, 1, Integer::sum);
    } else if (doc.terms.containsKey(lower)) {
        doc.terms.merge(lower, 1, Integer::sum);
    } else {
        doc.terms.put(term.word, 1);
    }
}

/**
 * Updates the corpus-wide counters in {@code mainDictionary} and the positional
 * information held in the per-batch {@code dictionary}.
 */
private void recordTermInDictionaries(Term term) {
    String[] counters = mainDictionary.get(term.word);
    if (counters != null) {
        counters[2] = incrementCounter(counters[2]); // one more occurrence in the corpus
    }

    Term indexed = dictionary.get(term.word);
    if (indexed == null) {
        // word not seen in the current batch yet: this Term object becomes its entry
        term.pointerToPosting = "resource//" + term.word.charAt(0); // pointer to final posting file path
        dictionary.put(term.word, term);
        indexed = term;
    } else if (indexed.docsIdIndexesMap.containsKey(term.docID)) {
        // known word in a known document: only a new position to remember
        indexed.docsIdIndexesMap.get(term.docID).add(term.wordIndex);
        return;
    }

    // From here on the (word, document) pair is new for this batch.
    ArrayList<Integer> indexes = new ArrayList<>();
    indexes.add(term.wordIndex);
    indexed.docsIdIndexesMap.put(term.docID, indexes);

    if (counters != null) {
        counters[1] = incrementCounter(counters[1]); // one more document contains the word
    } else {
        // Word is new to the corpus. This also covers the case where the word is in the
        // batch dictionary but missing from the main one, which used to dereference null.
        String[] termPointerAndCounter = new String[3];
        termPointerAndCounter[0] = "resource//" + term.word.charAt(0); // pointer
        termPointerAndCounter[1] = "1"; // counter for number of docs
        termPointerAndCounter[2] = "1"; // counter for number of times in the corpus
        mainDictionary.put(term.word, termPointerAndCounter);
    }
}

/** Returns the decimal string {@code value} incremented by one. */
private static String incrementCounter(String value) {
    return String.valueOf(Integer.parseInt(value) + 1);
}
/**
 * Writes the per-batch {@code dictionary} to a new temporary posting file and then
 * empties the dictionary.
 * <p>
 * Terms are written one per line (using {@code Term.toString()}), ordered
 * case-insensitively by key. The file is created under {@code path\tempPostings}
 * (or {@code path\tempPostingsStemmer} with a {@code Stemmer_} file prefix when
 * {@code useStemmer} is set) and is named after {@code postingFileNameCounter},
 * which is incremented by this call.
 *
 * @throws IOException if the posting file cannot be created or written; the dictionary
 *                     is left untouched in that case
 */
public void sendToPostingFiles() throws IOException {
    // Stable, case-insensitive ordering of the batch (same order as compareToIgnoreCase).
    List<Map.Entry<String, Term>> sortedEntries = new ArrayList<>(dictionary.entrySet());
    sortedEntries.sort(Map.Entry.comparingByKey(String.CASE_INSENSITIVE_ORDER));

    String postingFilePath;
    if (useStemmer) {
        new File(path + "\\tempPostingsStemmer").mkdir();
        postingFilePath = path + "\\tempPostingsStemmer\\" + "Stemmer_" + postingFileNameCounter + ".txt";
    } else {
        new File(path + "\\tempPostings").mkdir();
        postingFilePath = path + "\\tempPostings\\" + postingFileNameCounter + ".txt";
    }
    postingFileNameCounter++;

    // try-with-resources: the writer is flushed and closed even when a write fails
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(postingFilePath))) {
        for (Map.Entry<String, Term> entry : sortedEntries) {
            writer.write(entry.getValue().toString());
            writer.newLine();
        }
    }
    dictionary.clear();
}
/**
 * Writes the corpus-wide {@code mainDictionary} to disk.
 * <p>
 * The output file is {@code path\dictionary.txt} ({@code path\Stemmer_dictionary.txt}
 * when {@code useStemmer} is set); an existing file is overwritten. Each line has the
 * form {@code <term> <number of docs> <number of occurrences in the corpus>} and the
 * lines are ordered case-insensitively by term.
 *
 * @throws IOException if the file cannot be created or written
 */
public void sendMainDictionaryToPostingFiles() throws IOException {
    // Stable, case-insensitive ordering (same order as compareToIgnoreCase).
    List<Map.Entry<String, String[]>> sortedEntries = new ArrayList<>(mainDictionary.entrySet());
    sortedEntries.sort(Map.Entry.comparingByKey(String.CASE_INSENSITIVE_ORDER));

    String postingFilePath;
    if (useStemmer) {
        postingFilePath = path + "\\Stemmer_dictionary.txt";
    } else {
        postingFilePath = path + "\\dictionary.txt";
    }

    // try-with-resources: the writer is flushed and closed even when a write fails
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(postingFilePath))) {
        for (Map.Entry<String, String[]> entry : sortedEntries) {
            String[] counters = entry.getValue();
            // index 1 = number of docs, index 2 = number of occurrences in the corpus
            writer.write(entry.getKey() + " " + counters[1] + " " + counters[2]);
            writer.newLine();
        }
    }
}
/**
 * Appends the current batch of documents to the documents dictionary file and then
 * empties {@code docsDictionary}.
 * <p>
 * The file is {@code path\DocsDictionary.txt} ({@code path\Stemmer_DocsDictionary.txt}
 * when {@code useStemmer} is set) and is opened in append mode. One line is written per
 * document ({@code Document.toString()}), followed by two summary lines holding the
 * running totals {@code totalLength} and {@code numberOfDocs}, both of which are
 * updated by this call.
 *
 * @throws IOException if the file cannot be opened or written; {@code docsDictionary}
 *                     is not cleared in that case
 */
public void sendDocsDictionaryToPostingFiles() throws IOException {
    String postingFilePath;
    if (useStemmer) {
        postingFilePath = path + "\\Stemmer_DocsDictionary.txt";
    } else {
        postingFilePath = path + "\\DocsDictionary.txt";
    }

    // try-with-resources: the writer is flushed and closed even when a write fails
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(postingFilePath, true))) {
        for (Document doc : docsDictionary.values()) {
            totalLength = totalLength + doc.length;
            writer.write(doc.toString());
            writer.newLine();
        }
        writer.write("total length: " + totalLength);
        writer.newLine();
        numberOfDocs = numberOfDocs + docsDictionary.size();
        writer.write("number of docs: " + numberOfDocs);
        writer.newLine();
    }
    docsDictionary.clear();
}
/**
 * Normalises the letter case of a term against what {@code mainDictionary} already holds.
 * <ul>
 *   <li>Lower-case form already known: a term starting with a capital is lower-cased.</li>
 *   <li>Only the upper-case form known: a term starting with a capital is upper-cased;
 *       a term starting with a small letter causes the stored upper-case key to be
 *       re-keyed (in {@code mainDictionary} and, if present, in {@code dictionary})
 *       under the term's own spelling.</li>
 *   <li>Neither form known: a term starting with a capital is upper-cased.</li>
 * </ul>
 * Only ASCII letters are considered for the "starts with" checks.
 *
 * @param term the term whose {@code word} may be rewritten
 */
public void lowerUpperCase (Term term){
    final char initial = term.word.charAt(0);
    final boolean startsWithCapital = initial >= 'A' && initial <= 'Z';
    final boolean startsWithSmall = initial >= 'a' && initial <= 'z';
    final String lowered = term.word.toLowerCase();
    final String raised = term.word.toUpperCase();

    // Case 1: the lower-case spelling is already in the main dictionary.
    if (mainDictionary.containsKey(lowered)) {
        if (startsWithCapital) {
            term.word = lowered;
        }
        return;
    }

    // Case 3: the word is unknown in either spelling.
    if (!mainDictionary.containsKey(raised)) {
        if (startsWithCapital) {
            term.word = raised;
        }
        return;
    }

    // Case 2: only the upper-case spelling is known.
    if (startsWithCapital) {
        term.word = raised;
        return;
    }
    if (!startsWithSmall) {
        return; // neither capital nor small ASCII letter: leave everything as it is
    }

    // A small-letter occurrence demotes the stored upper-case key: move the counters over.
    String[] counters = mainDictionary.remove(raised);
    mainDictionary.put(term.word, counters);

    if (dictionary.containsKey(raised)) {
        // Same re-keying for the per-batch dictionary; the stored Term is lower-cased as well.
        Term stored = dictionary.get(raised);
        stored.word = stored.word.toLowerCase();
        dictionary.remove(raised);
        dictionary.put(term.word, stored);
    }
}
/**
 * Merges all temporary posting files into the final posting files and removes the
 * temporary directory.
 * <p>
 * The files found in {@code path\tempPostings} ({@code path\tempPostingsStemmer} when
 * {@code useStemmer} is set) are merged pairwise until a single sorted file remains.
 * That file is then split by the first character of each line: lines starting with an
 * ASCII letter go to {@code path\<prefix><LETTER>.txt} (overwritten), every other line is
 * appended to {@code path\<prefix>numbers.txt}, where {@code <prefix>} is
 * {@code "Stemmer"} when {@code useStemmer} is set and empty otherwise.
 * <p>
 * Fixes compared with the previous implementation:
 * <ul>
 *   <li>when one input was exhausted, the remaining lines of the other input were
 *       replaced by copies of a single line; every line is now copied individually;</li>
 *   <li>the lower-case check on equal terms only recognised 'a' and 'b';</li>
 *   <li>the split phase no longer loops forever when the first alphabetic term starts
 *       with a small letter, and non-letter terms sorted after the letters now go to the
 *       numbers file instead of a file named after that character;</li>
 *   <li>a missing or empty temporary directory is a no-op instead of a runtime exception;</li>
 *   <li>all readers and writers are closed even when an I/O error occurs.</li>
 * </ul>
 *
 * @throws IOException if a posting file cannot be read or written
 */
public void margePostingFiles () throws IOException {
    String tempPostingPath = path + "\\tempPostings";
    if (useStemmer) {
        tempPostingPath = path + "\\tempPostingsStemmer";
    }
    File folder = new File(tempPostingPath);
    File[] files = folder.listFiles();
    if (files == null) {
        // directory does not exist (nothing was ever written) or cannot be listed
        return;
    }

    Deque<String> postingsPathQueue = new ArrayDeque<>();
    for (File file : files) {
        postingsPathQueue.addLast(tempPostingPath + "\\" + file.getName()); // full path
    }

    // Repeatedly merge the two oldest files; the result goes to the back of the queue.
    int nameForMergesPostings = 0;
    while (postingsPathQueue.size() > 1) {
        String first = postingsPathQueue.removeFirst();
        String second = postingsPathQueue.removeFirst();
        String merged = tempPostingPath + "\\marge" + nameForMergesPostings + ".txt";
        mergePostingPair(first, second, merged);
        postingsPathQueue.addLast(merged);
        nameForMergesPostings++;
    }

    if (!postingsPathQueue.isEmpty()) {
        splitMergedPostingsByInitial(postingsPathQueue.removeFirst());
    }

    // remove all of the temp posting files, then the directory itself
    String[] entries = folder.list();
    if (entries != null) {
        for (String entry : entries) {
            new File(folder.getPath(), entry).delete();
        }
    }
    folder.delete();
}

/**
 * Merges two posting files that are sorted case-insensitively by term into
 * {@code mergedPath}. Lines have the form {@code term~postings}; for terms that are
 * equal ignoring case the postings are joined with a space and the lower-case spelling
 * wins if either side has one.
 */
private static void mergePostingPair(String leftPath, String rightPath, String mergedPath) throws IOException {
    try (BufferedReader left = new BufferedReader(new FileReader(leftPath));
         BufferedReader right = new BufferedReader(new FileReader(rightPath));
         BufferedWriter merged = new BufferedWriter(new FileWriter(mergedPath))) {
        String leftLine = left.readLine();
        String rightLine = right.readLine();
        while (leftLine != null || rightLine != null) {
            String[] leftParts = leftLine == null ? null : leftLine.split("~");
            String[] rightParts = rightLine == null ? null : rightLine.split("~");

            // < 0: emit left, > 0: emit right, 0: same term on both sides
            int order;
            if (rightParts == null) {
                order = -1;
            } else if (leftParts == null) {
                order = 1;
            } else {
                order = leftParts[0].compareToIgnoreCase(rightParts[0]);
            }

            if (order == 0) {
                boolean takeRightSpelling =
                        !startsWithSmallLetter(leftParts[0]) && startsWithSmallLetter(rightParts[0]);
                String term = takeRightSpelling ? rightParts[0] : leftParts[0];
                merged.write(term + "~" + leftParts[1] + " " + rightParts[1]);
                leftLine = left.readLine();
                rightLine = right.readLine();
            } else if (order < 0) {
                merged.write(leftParts[0] + "~" + leftParts[1]);
                leftLine = left.readLine();
            } else {
                merged.write(rightParts[0] + "~" + rightParts[1]);
                rightLine = right.readLine();
            }
            merged.newLine();
        }
    }
}

/**
 * Splits the fully merged posting file into the final per-letter files and the
 * "numbers" file for everything that does not start with an ASCII letter.
 */
private void splitMergedPostingsByInitial(String mergedPath) throws IOException {
    String stemmerPrefix = useStemmer ? "Stemmer" : "";
    try (BufferedReader reader = new BufferedReader(new FileReader(mergedPath))) {
        String line = reader.readLine();
        while (line != null) {
            String bucket = bucketNameFor(line);
            // Letter files are overwritten; the numbers file is opened in append mode (as
            // before), which also lets non-letter runs before and after the letters coexist.
            // NOTE(review): append mode means a re-run adds to an existing numbers file.
            boolean append = "numbers".equals(bucket);
            String target = path + "\\" + stemmerPrefix + bucket + ".txt";
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(target, append))) {
                // the input is sorted, so all lines of one bucket are consecutive
                do {
                    writer.write(line);
                    writer.newLine();
                    line = reader.readLine();
                } while (line != null && bucket.equals(bucketNameFor(line)));
            }
        }
    }
}

/**
 * Returns the final-file bucket of a posting line: the upper-cased first character for
 * ASCII letters, {@code "numbers"} for anything else (including an empty line).
 */
private static String bucketNameFor(String line) {
    if (line.isEmpty()) {
        return "numbers";
    }
    char initial = line.charAt(0);
    boolean asciiLetter = (initial >= 'A' && initial <= 'Z') || (initial >= 'a' && initial <= 'z');
    // Character.toUpperCase is locale-independent, unlike String.toUpperCase()
    return asciiLetter ? String.valueOf(Character.toUpperCase(initial)) : "numbers";
}

/** Returns true when {@code term} starts with an ASCII small letter. */
private static boolean startsWithSmallLetter(String term) {
    return !term.isEmpty() && term.charAt(0) >= 'a' && term.charAt(0) <= 'z';
}
}