-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdfdates.py
More file actions
114 lines (94 loc) · 3.5 KB
/
pdfdates.py
File metadata and controls
114 lines (94 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
#from pdfminer.pdfparser import PDFParser, PDFDocument
from datetime import datetime, time, date
from lxml import etree
from pprint import pprint
import json
from msulrepo import repo
class DocumentDates:
    """
    Look up dates associated with a single file on disk.

    Covers the creation date recorded inside a PDF's document information
    and the modification / ctime stamps reported by the filesystem.
    """

    def __init__(self, f):
        """
        Store the path of the file to inspect.
        Args:
            f(str): Path to the file.
        """
        self.file = f

    def PDFCreationDate(self):
        """
        Read the 'CreationDate' entry from the PDF's document information.
        Returns:
            datetime.date: The creation date, or None when the file name
            does not end with ".pdf" or no string creation date is recorded.
        """
        if not self.file.endswith(".pdf"):
            print("The file doesn't appear to be a PDF.")
            return None

        # The context manager guarantees the handle is released even if
        # pdfminer raises while parsing.
        with open(self.file, 'rb') as fp:
            parser = PDFParser(fp)
            # NOTE(review): the no-argument PDFDocument() followed by
            # set_parser()/initialize() looks like the older pdfminer API --
            # confirm against the installed pdfminer version.
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            # A PDF may carry no info dictionary at all, or one without a
            # CreationDate key; treat both as "no creation date".
            info = doc.info[0] if doc.info else {}
            cdate = info.get('CreationDate')

        if isinstance(cdate, str):
            # Assumes the value is laid out as "D:YYYYMMDD..." (two-character
            # prefix, then year, month, day) -- TODO confirm for all inputs.
            return date(int(cdate[2:6]), int(cdate[6:8]), int(cdate[8:10]))

        print("No Creation Date for " + self.file)
        return None

    def FileModifiedDate(self):
        """
        Return the file's last-modified time (os.path.getmtime) as a date.
        Returns:
            datetime.date
        """
        mdate = os.path.getmtime(self.file)
        return date.fromtimestamp(mdate)

    def FileCtimeDate(self):
        """
        Return the file's ctime (os.path.getctime) as a date.
        Returns:
            datetime.date
        """
        cdate = os.path.getctime(self.file)
        return date.fromtimestamp(cdate)
class ETDData:
    """
    Inspect ETD submission XML files stored in a directory.

    Reads attributes of the root DISS_submission element and can collect
    repository PIDs for submissions whose third-party search value is
    "N" or "O".
    """

    def __init__(self, directory, xml=None):
        """
        Establishes class instance including file directory and xml file if provided.
        Args:
            directory(str): Directory containing the XML files.
            xml(str): Optional XML file name inside the directory.
        """
        self.directory = directory
        # self.xml is only set when a name is supplied; FindRestricted sets
        # it for each file it visits.
        if xml:
            self.xml = xml

    def _xml_path(self):
        """Return the path of the current XML file inside the directory."""
        # os.path.join works whether or not the directory ends with a
        # separator; plain concatenation silently built a wrong path.
        return os.path.join(self.directory, self.xml)

    def SearchAuth(self):
        """
        Look up third-party search value in the XML file.
        self.xml variable must be established.
        Returns:
            The first match for /DISS_submission/@third_party_search.
        Raises:
            IndexError: If the XPath matches nothing.
        """
        path_search = "/DISS_submission/@third_party_search"
        tree = etree.parse(self._xml_path())
        r_search = tree.xpath(path_search)
        return r_search[0]

    def SearchRestrictions(self):
        """
        Looks up publishing restriction codes for the XML document.
        Return the attributes of the DISS_submission element and their values.
        self.xml variable must be established.
        Raises:
            IndexError: If the document has no /DISS_submission element.
        """
        path_r = "/DISS_submission"
        tree = etree.parse(self._xml_path())
        r = tree.xpath(path_r)
        return r[0].attrib

    def FindRestricted(self, date, server, jsonize=False):
        """
        Finds pids for restricted ETDs and returns them in a list.

        Every ".xml" file in the directory is checked; self.xml is
        overwritten with each file name as it is visited.
        Args:
            date(str): Date (YYYYMMDD) after which ETD authors could select to restrict 3-p search.
            server(str): Server to use (must be specified in configuration file).
            jsonize(bool): Write the list to "pids.json" in the working directory if True.
        Returns:
            list: all PIDs from a given server which should be restricted.
        """
        xml_files = [x for x in os.listdir(self.directory) if x.endswith(".xml")]
        marker = datetime.strptime(date, "%Y%m%d").date()
        pids = []
        for xfile in xml_files:
            self.xml = xfile
            filedate = DocumentDates(self._xml_path()).FileModifiedDate()
            thirdparty = self.SearchAuth()
            if filedate > marker and thirdparty in ("N", "O"):
                # NOTE(review): a connection is opened per matching file;
                # if Repo_Connect results are reusable this could be done once.
                repocls = repo.Repo_Connect(server)
                pid = repo.Get_Pid(xfile, repocls)
                if pid is not None:
                    pids.append(pid)
        if jsonize:
            with open("pids.json", "w") as f:
                json.dump(pids, f)
        return pids