-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patha0-format_data.py
More file actions
90 lines (73 loc) · 2.69 KB
/
a0-format_data.py
File metadata and controls
90 lines (73 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#Author: Finn Dobkin
#Description: The goal of this script is to use a multiprocessing pool to parse a series (n>1.4 million) of XML files. We will create a function for parsing, create a pool object, and then call the function using that pool object to run via multiprocessing.
#Finally, we will filter the dataset to create a list of unique newspaper titles.
#Import libraries
#Libraries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os
#Libraries for multiprocessing
import multiprocessing as mp
from multiprocessing import Pool
#Load data
#Note: I had to execute this command multiple times for all documents to load.
#Set corpus to the folder holding the XML documents. The trailing slash is kept
#so that code building paths as corpus + file keeps working.
corpus = '/home/ec2-user/SageMaker/data/full-sf-dataframe/'
#Read in the file names (names only, not full paths) found in the corpus folder
input_files = os.listdir(corpus)
print("Loaded", len(input_files), "documents.")
#Define the output file path. No trailing slash: this names a CSV file, and a
#trailing separator would make the path refer to a directory instead.
output_file = '/home/ec2-user/SageMaker/CA-Housing-Regs/Data/analytical_sample.csv'
#Retrieve metadata from XML document
def _first_text(root, *tags):
    """Return the ``.text`` of the first element found for any of *tags*.

    Tags are tried in the order given, each searched anywhere below *root*.
    The first tag that exists wins, even when its text is empty (``None``).
    Returns ``None`` when none of the tags are present.
    """
    for tag in tags:
        node = root.find(f".//{tag}")
        if node is not None:
            return node.text
    return None


def getxmlcontent(corpus, file):
    """Retrieve metadata and body text from one XML document.

    Args:
        corpus: Directory that contains the XML documents.
        file: File name of the document inside *corpus*.

    Returns:
        An 11-tuple ``(mstar, objecttype, objecttypeorigin, goid, title,
        location, date, publisher, newstitle, subject, text)``. Each entry is
        the text of the matching XML element, or ``None`` when the element is
        absent. If the document cannot be read or parsed, the error is printed
        and every entry that was not extracted stays ``None``; the function
        does not raise.
    """
    # Pre-bind every field so the return below is always valid, including
    # after a parse failure (previously this path hit unbound names).
    mstar = objecttype = objecttypeorigin = goid = title = None
    location = date = publisher = newstitle = subject = text = None
    try:
        # os.path.join works whether or not corpus ends with a path separator.
        root = etree.parse(os.path.join(corpus, file)).getroot()
        mstar = _first_text(root, "mstar")
        # NOTE(review): the original returned `objecttype` without ever
        # assigning it. The tag name "ObjectType" is an assumption made by
        # analogy with the other fields -- confirm against the XML schema.
        objecttype = _first_text(root, "ObjectType")
        objecttypeorigin = _first_text(root, "ObjectTypeOrigin")
        goid = _first_text(root, "GOID")
        title = _first_text(root, "Title")
        date = _first_text(root, "NumericDate")
        publisher = _first_text(root, "PublisherName")
        newstitle = _first_text(root, "SortTitle")
        location = _first_text(root, "Qualifier")
        subject = _first_text(root, "PublicationSubject")
        # Body text: use the first of these elements that exists.
        text = _first_text(root, "FullText", "HiddenText", "Text")
    except Exception as e:
        # Deliberately broad: one unreadable or malformed file should be
        # reported and skipped rather than abort a run over the whole corpus.
        print(f"Error while parsing file {file}: {e}")
    return (
        mstar,
        objecttype,
        objecttypeorigin,
        goid,
        title,
        location,
        date,
        publisher,
        newstitle,
        subject,
        text,
    )