-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessDB.py
More file actions
106 lines (87 loc) · 2.94 KB
/
preprocessDB.py
File metadata and controls
106 lines (87 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/python -O
import xml.etree.ElementTree as et
import nltk
import sys
import operator
import re
import cPickle
import time
import os
# add
import glob
import shutil
from helper import *
from collections import namedtuple
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from string import punctuation
def init(xml_path):
    '''
    Build a positional inverted index for the XML collection at xml_path.

    The file is parsed with ElementTree. For every <DOC> element under the
    root, the text of DOCNO, TITLE, AUTHOR (searched at any depth), BIBLIO
    and TEXT is extracted. Each TEXT is then preprocessed with the
    clean_data helper from helper.py; per the original notes this covers:
    1. Changing text to lowercase
    2. Removing punctuation (Not hyphen as it might have undesirable results)
    3. Eliminating stop words
    (clean_data is defined outside this function -- confirm in helper.py.)

    Only tokens longer than two characters that are not purely digits are
    indexed. Positions are offsets into the cleaned token sequence, so
    skipped tokens still advance the position. Documents are keyed in the
    index by a 1-based counter in parse order, not by their DOCNO.

    Files written to the current working directory:
    doc_index.p      -- {word: {doc_counter: [positions]}}
    doc_metadata.p   -- {DOCNO: [title, author, bib, txt]}
    doc_clean_text.p -- list of clean_data results, in parse order

    xml_path: path of the XML file to index.
    Returns None. Raises AttributeError if a <DOC> lacks one of the
    expected child elements (find() returns None for a missing element).
    '''
    xmlRoot = et.parse(xml_path).getroot()

    doc_dict = {}
    raw_texts = []
    # Parse the xml file and extract relevant info using find, findall functions
    for docs in xmlRoot.findall('DOC'):
        docNum = docs.find('DOCNO').text
        title = docs.find('TITLE').text
        author = docs.find('.//AUTHOR').text
        bib = docs.find('BIBLIO').text
        txt = docs.find('TEXT').text
        doc_dict[docNum] = [title, author, bib, txt]
        raw_texts.append(txt)

    # preprocessing the raw data
    text_list = [clean_data(txt) for txt in raw_texts]

    # Dictionary of Dictionaries Word --> Doc Num --> [Occurrences]
    index = {}
    count_docs = 0
    count_words = 0
    for tokens in text_list:
        count_docs += 1
        for position, word in enumerate(tokens):
            if len(word) > 2 and not word.isdigit():
                count_words += 1
                # setdefault does a hashed lookup; the previous
                # "in index.keys()" built a full key list on every token.
                index.setdefault(word, {}).setdefault(count_docs, []).append(position)
        # Progress message once per 100 documents.
        # NOTE(review): nesting level inferred -- the original indentation
        # was not available; confirm this was not meant per token.
        if count_docs % 100 == 0:
            print("Indexing Files...")

    # Context managers guarantee each pickle is flushed and closed before
    # callers stat or reload the files.
    with open('doc_index.p', 'wb') as index_file:
        cPickle.dump(index, index_file)
    with open('doc_metadata.p', 'wb') as metadata_file:
        cPickle.dump(doc_dict, metadata_file)
    with open('doc_clean_text.p', 'wb') as text_file:
        cPickle.dump(text_list, text_file)

    # Single-argument parenthesised print: same output under Python 2.
    print("\n" + str(count_words) + " Words in " + str(count_docs) + " Files Indexed Successfully!!")
    print("\nIndex Stored: doc_index.p")
################################################################################
############## Code Flow Begins Here ################
################################################################################
# Ask for the path of the combined collection file.
# NOTE: this path is opened for WRITING below, so an existing file at that
# location is overwritten with the concatenation of the *.xml files found in
# the current working directory.
cran_path = raw_input("Enter Path Of XML(No Quotes): ")
# add
# Merge every *.xml file of the current directory into cran_path, wrapped in
# a single <cran> root element so the result parses as one XML document.
merged_abs_path = os.path.abspath(cran_path)
with open(cran_path, 'wb') as outfile:
    outfile.write('<cran>\n')
    for filename in glob.glob('*.xml'):
        # The output file already exists at this point; if it lives in the
        # current directory with an .xml suffix the glob would match it and
        # the file would be copied into itself.
        if os.path.abspath(filename) == merged_abs_path:
            continue
        # Read as bytes so the copy into the binary-mode output is verbatim.
        with open(filename, 'rb') as readfile:
            shutil.copyfileobj(readfile, outfile)
    outfile.write('\n</cran>')

# Time only the indexing step, not the merge above.
start_time = time.time()
init(cran_path)
elapsed = time.time() - start_time
# Single-argument prints; the doubled space reproduces the separator the
# comma-separated Python 2 print statement emitted after "Indexing Time: ".
print("\nIndexing Time:  " + str(elapsed) + " seconds")
print("\nIndex File Size: " + str((os.stat('doc_index.p')).st_size))
print("\nMetadata File Size: " + str((os.stat('doc_metadata.p')).st_size))