SimpleIR/preprocessDB.py at master · vmalpani/SimpleIR · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python -O

import xml.etree.ElementTree as et
import nltk
import sys
import operator
import re
import cPickle
import time
import os

# Added: glob and shutil are used to merge the individual *.xml files into one corpus file
import glob
import shutil

from helper import *
from collections import namedtuple
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from string import punctuation

def init(xml_path):
    '''
    Build a positional inverted index from an XML document collection.

    The file at xml_path is parsed with ElementTree. Every <DOC> child of
    the root element contributes the text of its DOCNO, TITLE, AUTHOR
    (searched at any depth below the DOC), BIBLIO and TEXT elements.

    The TEXT of each document is preprocessed with clean_data, which is
    star-imported from helper.py. According to the original notes in this
    file, that preprocessing:
        1. Changes text to lowercase
        2. Removes punctuation (not the hyphen, as removing it might have
           undesirable results)
        3. Eliminates stop words
    NOTE(review): clean_data is not visible here; it is assumed to return
    an indexable sequence of string tokens - confirm against helper.py.

    Three pickle files are written to the current working directory:
        doc_index.p       {word: {doc_position: [token offsets]}}
        doc_metadata.p    {DOCNO text: [title, author, bib, txt]}
        doc_clean_text.p  list of the cleaned token sequences

    doc_position is the 1-based position of the DOC element in the file,
    not its DOCNO. Only tokens longer than two characters that are not
    purely digits are indexed; offsets refer to positions in the cleaned
    token sequence, so skipped tokens still occupy an offset.

    Args:
        xml_path: path of the XML file to index.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        AttributeError: if a DOC lacks one of the expected child elements
            (find() returns None in that case).
        Any error raised by ElementTree while parsing, or by the file
        system while writing the pickle files, is propagated unchanged.
    '''
    xml_root = et.parse(xml_path).getroot()

    doc_dict = {}
    raw_texts = []

    # Extract the relevant fields of every document, in file order.
    for doc in xml_root.findall('DOC'):
        doc_num = doc.find('DOCNO').text
        title = doc.find('TITLE').text
        author = doc.find('.//AUTHOR').text
        bib = doc.find('BIBLIO').text
        txt = doc.find('TEXT').text

        doc_dict[doc_num] = [title, author, bib, txt]
        # Kept separately from doc_dict so that documents sharing a DOCNO
        # are still indexed individually.
        raw_texts.append(txt)

    # Preprocess the raw text only after the whole file parsed cleanly.
    text_list = [clean_data(txt) for txt in raw_texts]

    # Dictionary of dictionaries: word --> doc position --> [occurrences]
    index = {}
    count_docs = 0
    count_words = 0
    for count_docs, tokens in enumerate(text_list, 1):
        for offset, token in enumerate(tokens):
            if len(token) > 2 and not token.isdigit():
                count_words += 1
                # setdefault is a constant-time dict lookup; testing
                # membership against .keys() would scan a list per token.
                index.setdefault(token, {}).setdefault(count_docs, []).append(offset)
        if count_docs % 100 == 0:
            print("Indexing Files...")

    # Context managers guarantee each file is flushed and closed.
    with open('doc_index.p', 'wb') as index_file:
        cPickle.dump(index, index_file)
    with open('doc_metadata.p', 'wb') as metadata_file:
        cPickle.dump(doc_dict, metadata_file)
    with open('doc_clean_text.p', 'wb') as text_file:
        cPickle.dump(text_list, text_file)

    print("\n" + str(count_words) + " Words in " + str(count_docs) + " Files Indexed Successfully!!")
    print("\nIndex Stored: doc_index.p")


################################################################################

##############				Code Flow Begins Here				################

################################################################################

# Path of the combined corpus file. This file is (re)written below from the
# individual *.xml files and is then the file that gets indexed.
cran_path = raw_input("Enter Path Of XML(No Quotes): ")

# Merge every *.xml file of the current working directory into one document
# wrapped in a single <cran> root element, so ElementTree can parse it.
# The output file itself may match the '*.xml' pattern; it must be skipped,
# otherwise it would be copied into itself while it is being written.
output_identity = os.path.normcase(os.path.realpath(cran_path))
with open(cran_path, 'wb') as outfile:
    outfile.write('<cran>\n')
    # Sorted so that document numbering is reproducible between runs.
    for filename in sorted(glob.glob('*.xml')):
        if os.path.normcase(os.path.realpath(filename)) == output_identity:
            continue
        # Binary mode matches the output handle and copies bytes verbatim.
        with open(filename, 'rb') as readfile:
            shutil.copyfileobj(readfile, outfile)
    outfile.write('\n</cran>')

start_time = time.time()
init(cran_path)
elapsed = time.time() - start_time
# Joined with single spaces to reproduce the spacing of a comma-separated
# print statement.
print(" ".join(["\nIndexing Time: ", str(elapsed), "seconds"]))
print("\nIndex File Size: " + str((os.stat('doc_index.p')).st_size))
print("\nMetadata File Size: " + str((os.stat('doc_metadata.p')).st_size))