-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDateParser.py
More file actions
executable file
·136 lines (127 loc) · 5.48 KB
/
DateParser.py
File metadata and controls
executable file
·136 lines (127 loc) · 5.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import time
import argparse
import os
import ntpath
""" This module aims at converting all occurrences of a date to
a valid xsd:dateTimeStamp.
:Example:
>>> # in python
>>> datestring = '20-02-2018 blasdjwbc 20-04-2018'
>>> DateFormatter.fromString(datestring)
'2018-02-20T12:00:00Z blasdjwbc 2018-04-20T12:00:00Z'
"""
# Module metadata: authorship, licence, version, maintainer contact and
# development status of this script.
__author__ = "Guilhem Heinrich"
__license__ = "GPL"
__version__ = "1.0.1"
__maintainer__ = "Guilhem Heinrich"
__email__ = "guilhem.heinrich@inra.fr"
__status__ = "Prototype"
class DateFormatter:
    """Rewrite dates found in free text as xsd:dateTimeStamp values.

    An instance holds the textual components of one date (year, month,
    day, hour, minute, second) plus a timezone designator and renders
    them with :meth:`asXsdDateTimeStamp`.  The static helpers
    :meth:`fromString` and :meth:`fromFile` locate every date matching
    one of ``date_format_regex`` and replace it in place.

    Recognised layouts (``S`` is any single non-digit separator, which
    must be the same on both sides of the month; ``s`` likewise for the
    time):

    * ``YYYYSMMSDD`` and ``DDSMMSYYYY``
    * either of the above followed by at most two characters, then
      ``HHsMMsSS`` and an optional ``Z`` / ``+HH:MM`` / ``-HH:MM``.
    """

    # Marker temporarily written in front of every converted date so that
    # later patterns do not re-convert it; removed before returning.
    token = 'MATCH'

    # Building blocks for the patterns below.  Separators exclude digits
    # (otherwise a plain run of digits such as '2018022012' is read as a
    # date) and newlines (which the original '.' did not match either).
    _DATE_YMD = (r'(?P<year>\d{4})(?P<sepD>[^\d\n])(?P<mounth>\d{2})'
                 r'(?P=sepD)(?P<day>\d{2})')
    _DATE_DMY = (r'(?P<day>\d{2})(?P<sepD>[^\d\n])(?P<mounth>\d{2})'
                 r'(?P=sepD)(?P<year>\d{4})')
    # The optional trailing designator is captured under the name of the
    # constructor argument, so an already converted stamp keeps its zone
    # and is re-emitted unchanged instead of being corrupted.
    _TIME = (r'.{0,2}(?P<hour>\d{2})(?P<sepT>[^\d\n])(?P<minute>\d{2})'
             r'(?P=sepT)(?P<second>\d{2})'
             r'(?P<timezoneShift>Z|[+-]\d{2}:\d{2})?')

    # Order matters: patterns are applied one after the other, and the
    # date+time forms must run before the date-only forms, otherwise the
    # date part is converted first and the time is left behind.
    date_format_regex = [
        _DATE_YMD + _TIME,
        _DATE_DMY + _TIME,
        _DATE_YMD,
        _DATE_DMY,
    ]

    def __init__(self,
                 year='2000',
                 mounth='01',
                 day='01',
                 hour='12',
                 minute='00',
                 second='00',
                 timezoneShift=None,
                 **kwargs):
        """Store the components of one date.

        :param year: four-digit year, as text.
        :param mounth: two-digit month, as text (historical spelling,
            kept because it is also the regex group name).
        :param day: two-digit day, as text.
        :param hour: two-digit hour, as text.
        :param minute: two-digit minute, as text.
        :param second: two-digit second, as text.
        :param timezoneShift: either a whole number of hours between -14
            and 14 (``0`` means UTC), or a ready-made designator string
            such as ``'Z'`` or ``'+02:00'``.  ``None`` or ``''`` selects
            the default described below.
        :param kwargs: ignored; lets callers pass a regex ``groupdict()``
            that contains extra groups (the separators).
        """
        self.year = year
        self.mounth = mounth
        self.day = day
        self.hour = hour
        self.minute = minute
        self.second = second
        if timezoneShift is None or timezoneShift == '':
            # NOTE(review): tm_isdst is the daylight-saving flag
            # (0, 1 or -1), not the UTC offset in hours.  It is kept as
            # the historical default so existing output does not change;
            # confirm whether the real local offset was intended.
            self.timezone_shift = time.localtime().tm_isdst
        else:
            # 'is None' rather than truthiness, so that an explicit 0
            # (UTC) is honoured instead of being replaced by the default.
            self.timezone_shift = timezoneShift

    def asXsdDateTimeStamp(self):
        """Return the date as ``YYYY-MM-DDTHH:MM:SS`` plus a timezone.

        :returns: the stamp followed by ``Z`` when the shift is 0, by
            ``+HH:00`` / ``-HH:00`` when it is a non-zero integer, or by
            the shift itself when it is a string.
        :raises ValueError: if an integer shift is outside -14..14.
        :raises TypeError: if the shift is neither a number nor a string.
        """
        stamp = '{0}-{1}-{2}T{3}:{4}:{5}'.format(
            self.year, self.mounth, self.day,
            self.hour, self.minute, self.second)
        shift = self.timezone_shift

        if shift == 0:
            return stamp + 'Z'

        if isinstance(shift, int):
            if not -14 <= shift <= 14:
                raise ValueError(
                    'timezone shift must be between -14 and 14 hours, '
                    'got {0!r}'.format(shift))
            sign = '+' if shift > 0 else '-'
            # Always two digits and always an explicit sign.
            return '{0}{1}{2:02d}:00'.format(stamp, sign, abs(shift))

        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3
        if not isinstance(shift, string_types):
            raise TypeError(
                'timezone shift must be an integer or a string, '
                'got {0!r}'.format(shift))
        # A string is taken to be a complete designator already.
        return stamp + shift

    @staticmethod
    def compute(stringOrMatch, token=''):
        """Convert one regex match into its xsd:dateTimeStamp text.

        :param stringOrMatch: a match object produced by one of
            ``date_format_regex``; anything else is returned untouched.
        :param token: text to prepend to the converted date.
        :returns: ``token`` followed by the formatted date, or the
            argument itself when it is not a match object.
        """
        # Duck-typing instead of comparing the class name: the match
        # class is called 'SRE_Match' on old interpreters and 'Match' on
        # recent ones.
        if not hasattr(stringOrMatch, 'groupdict'):
            return stringOrMatch
        # Groups that did not take part in the match are None; dropping
        # them lets the constructor defaults apply.
        fields = {name: value
                  for name, value in stringOrMatch.groupdict().items()
                  if value is not None}
        return token + DateFormatter(**fields).asXsdDateTimeStamp()

    @staticmethod
    def _defaultOutputPath(infile):
        """Return ``<dir>/parsed/<name>_parsed<ext>`` for ``infile``.

        The ``parsed`` directory is created next to the input file when
        it does not exist yet.
        """
        name, extension = os.path.splitext(os.path.basename(infile))
        directory = os.path.join(os.path.dirname(infile), 'parsed')
        if not os.path.isdir(directory):
            os.makedirs(directory)
        return os.path.join(directory, name + '_parsed' + extension)

    @staticmethod
    def fromFile(infile, outfile=None):
        """Copy ``infile`` to ``outfile``, converting dates line by line.

        :param infile: path of the text file to read.
        :param outfile: path of the file to write.  When omitted, the
            result goes to ``parsed/<name>_parsed<ext>`` next to the
            input file (previously this case failed inside ``open``).
        :raises ValueError: if both paths designate the same file, since
            opening it for writing would empty it before it is read.
        """
        if outfile is None:
            outfile = DateFormatter._defaultOutputPath(infile)
        if os.path.abspath(infile) == os.path.abspath(outfile):
            raise ValueError(
                'input and output must be different files: '
                '{0!r}'.format(infile))
        with open(infile) as inputFile:
            with open(outfile, 'w') as outputFile:
                for line in inputFile:
                    outputFile.write(DateFormatter.fromString(line))

    @staticmethod
    def fromString(stringToParse):
        """Return ``stringToParse`` with every recognised date converted.

        Each pattern of ``date_format_regex`` is applied in turn.  A
        converted date is prefixed with ``token`` and the patterns refuse
        to match right after ``token``, which stops a later pattern from
        converting the same date twice.
        """
        marker = re.escape(DateFormatter.token)
        guard = '(?<!' + marker + ')'

        def convert(match):
            return DateFormatter.compute(match, DateFormatter.token)

        line = stringToParse
        for pattern in DateFormatter.date_format_regex:
            line = re.sub(guard + pattern, convert, line)

        # Remove only the markers inserted above, i.e. those directly
        # followed by a formatted stamp, so that the word itself is not
        # deleted from the user's text.
        return re.sub(
            marker + r'(?=\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})', '', line)
if __name__ == '__main__':
    # Command-line entry point: convert the dates of one or more files.
    import sys

    parser = argparse.ArgumentParser(description=""" This utility converts all dates to
    the xsd:DateTimeStamp format (As described in http://www.datypic.com/sc/xsd11/t-xsd_dateTimeStamp.html).
    If only one file is supplied as input, the output option (-o) can specify the path/to/newfile.
    By default, '_parsed' is appended to the input filename and written in the directory child '/parsed/'.
    It will convert a string like '20-02-2018 blasdjwbc 20-04-2018' in '2018-02-20T12:00:00Z blasdjwbc 2018-04-20T12:00:00Z'
    """)
    parser.add_argument('inputs', metavar='inputs', nargs='+',
                        help="""Path to the input(s) file""")
    parser.add_argument('-o', dest='output', nargs='?',
                        help="""Path to the output file""")
    args = parser.parse_args()
    inputs_filename = args.inputs

    if args.output is not None:
        # An explicit output path only makes sense for a single input:
        # the first file is converted, the others are reported instead of
        # being dropped silently.
        if len(inputs_filename) > 1:
            sys.stderr.write(
                'warning: -o given with several inputs; only {0!r} is '
                'converted\n'.format(inputs_filename[0]))
        DateFormatter.fromFile(infile=inputs_filename[0], outfile=args.output)
    else:
        for fullname in inputs_filename:
            # Test the path as given; stripping the extension first could
            # point at a different filesystem entry.
            if os.path.isdir(fullname):
                continue
            directory_name, basename = os.path.split(fullname)
            filename, file_extension = os.path.splitext(basename)
            # os.path.join keeps the output next to the input even when
            # the input is a bare filename; plain concatenation produced
            # '/parsed/' at the filesystem root in that case.
            parsed_directory = os.path.join(directory_name, 'parsed')
            if not os.path.isdir(parsed_directory):
                os.mkdir(parsed_directory)
            outfile = os.path.join(
                parsed_directory, filename + '_parsed' + file_extension)
            DateFormatter.fromFile(infile=fullname, outfile=outfile)