Dissertation-Code/a0-format_data.py at main · fdobkin59/Dissertation-Code · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#Author: Finn Dobkin
#Description: The goal of this script is to use a multiprocessing pool to parse a series (n>1.4 million) of XML files. We will create a function for parsing, create a pool object, and then call the function using that pool object to run via multiprocessing.
#Finally, we will filter the dataset to create a list of unique newspaper titles.

#Import libraries
#Libraries for parsing data
from lxml import etree
from bs4 import BeautifulSoup
import pandas as pd
import os
#Libraries for multiprocessing
import multiprocessing as mp
from multiprocessing import Pool

#Load data
#Note: I had to execute this command multiple times for all documents to load.
#Folder that holds the corpus
corpus = '/home/ec2-user/SageMaker/data/full-sf-dataframe/'

#Path the output is written to
#NOTE(review): this path ends in '/', which is unusual for a .csv file path -- confirm it is intended.
output_file = '/home/ec2-user/SageMaker/CA-Housing-Regs/Data/analytical_sample.csv/'

#List every entry in the corpus folder and report how many were found
input_files = os.listdir(corpus)
print(f"Loaded {len(input_files)} documents.")

#Retrieve metadata from XML document
def _element_text(root, path):
    """Return the ``.text`` of the first element matching *path*, or None if no element matches."""
    node = root.find(path)
    return node.text if node is not None else None


def getxmlcontent(corpus, file):
    """Extract metadata fields and body text from a single XML document.

    Args:
        corpus: Directory that contains the XML files (with or without a
            trailing path separator).
        file: Name of the XML file inside ``corpus``.

    Returns:
        An 11-tuple, in this order:
        ``(mstar, objecttype, objecttypeorigin, goid, title, location, date,
        publisher, newstitle, subject, text)``.
        Each entry is the text of the first matching element, or ``None`` when
        the element is absent (or has no text). If the file cannot be parsed,
        the error is printed and every field that was not yet extracted stays
        ``None``; no exception is propagated to the caller.
    """
    # Pre-fill every field so the return statement is always well-defined,
    # even when parsing fails before (or part-way through) extraction.
    mstar = objecttype = objecttypeorigin = goid = title = None
    location = date = publisher = newstitle = subject = text = None

    try:
        root = etree.parse(os.path.join(corpus, file)).getroot()

        mstar = _element_text(root, './/mstar')
        # NOTE(review): the original returned `objecttype` without ever
        # assigning it; the './/ObjectType' tag is inferred from the naming of
        # the other fields -- confirm against the XML schema.
        objecttype = _element_text(root, './/ObjectType')
        objecttypeorigin = _element_text(root, './/ObjectTypeOrigin')
        goid = _element_text(root, './/GOID')
        title = _element_text(root, './/Title')
        date = _element_text(root, './/NumericDate')
        publisher = _element_text(root, './/PublisherName')
        newstitle = _element_text(root, './/SortTitle')
        location = _element_text(root, './/Qualifier')
        subject = _element_text(root, './/PublicationSubject')

        # Body text: the first tag that is *present* wins, even if it is
        # empty -- later tags are only consulted when earlier ones are absent.
        for tag in ('.//FullText', './/HiddenText', './/Text'):
            node = root.find(tag)
            if node is not None:
                text = node.text
                break

    except Exception as e:
        # Best-effort: one bad file must not abort a run over the whole corpus.
        print(f"Error while parsing file {file}: {e}")

    return (mstar, objecttype, objecttypeorigin, goid, title, location,
            date, publisher, newstitle, subject, text)