-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeltasTXT_hashing.py
More file actions
108 lines (85 loc) · 2.65 KB
/
deltasTXT_hashing.py
File metadata and controls
108 lines (85 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python
import sys,gzip,zlib
import hashlib
import datetime
# Separators are byte strings so the script behaves the same on Python 2 and
# Python 3: gzip streams yield bytes, and b'...' is a plain str on Python 2.
dataFieldSeparator = b'\t'
headerSeparator = b','


def _iter_id_rows(fileName):
    """Yield the ID rows of a gzip file, each without its trailing newline."""
    with gzip.open(fileName, 'rb') as fileIn:
        for rawLine in fileIn:
            # Only rows whose first character is a digit carry an ID;
            # everything else (headers, blank lines, ...) is skipped.
            if rawLine[:1].isdigit():
                yield rawLine.rstrip(b'\n')


def _row_id(row):
    """Return the ID of a row: its first separator-delimited field."""
    return row.split(dataFieldSeparator, 1)[0]


def _collect_events(oldFileName, newFileName):
    """Compare two gzip dumps and return (events, rowData, updData).

    events  maps ID -> b'D' (only in old), b'I' (only in new) or
            b'U' (in both, content differs). Unchanged IDs are absent.
    rowData maps ID -> zlib-compressed old row for b'D'/b'U' events and
            zlib-compressed new row for b'I' events.
    updData maps ID -> zlib-compressed new row for b'U' events.

    NOTE: IDs are assumed to be unique within the new file. As in the
    original script, an ID that is absent from the old file and repeated in
    the new file raises KeyError.
    """
    events = {}
    oldHash = {}
    rowData = {}
    updData = {}

    # Every old row starts out as a deletion until the new file proves
    # otherwise.
    for row in _iter_id_rows(oldFileName):
        rowId = _row_id(row)
        # Hash the newline-stripped row so that a final row without a line
        # terminator still compares equal to the same row followed by '\n'.
        oldHash[rowId] = hashlib.md5(row).hexdigest()
        rowData[rowId] = zlib.compress(row)
        events[rowId] = b'D'

    for row in _iter_id_rows(newFileName):
        rowId = _row_id(row)
        if rowId not in events:
            events[rowId] = b'I'
            rowData[rowId] = zlib.compress(row)
        elif oldHash[rowId] == hashlib.md5(row).hexdigest():
            # Same content in both files: not an event, release the data.
            del events[rowId]
            del rowData[rowId]
        else:
            events[rowId] = b'U'
            updData[rowId] = zlib.compress(row)

    return events, rowData, updData


def _read_header(headerFileName):
    """Return the column names from the first line of the header file."""
    with open(headerFileName, 'rb') as headerFile:
        return headerFile.readline().rstrip(b'\n').split(headerSeparator)


def _format_fields(headerList, compressedRow):
    """Render a compressed row as 'name=value<TAB>' for each header column.

    Raises IndexError when the row has fewer fields than the header; extra
    fields beyond the header are ignored.
    """
    fields = zlib.decompress(compressedRow).split(dataFieldSeparator)
    return b''.join(
        [name + b'=' + fields[idx] + b'\t'
         for idx, name in enumerate(headerList)]
    )


def main(argv=None, out=None):
    """Print the delete/insert/update events between two gzip table dumps.

    argv -- command line: [program, tableName, oldFile, newFile, headerFile].
            Defaults to sys.argv. tableName is accepted for command-line
            compatibility but is not used. Arguments past the fifth are
            ignored.
    out  -- binary stream the report is written to. Defaults to standard
            output.

    One line is written per event:
        D <id>
        I <id> <TAB>NEW  name=value<TAB>...
        U <id> <TAB>OLD  name=value<TAB>...<TAB>NEW  name=value<TAB>...
    The spacing reproduces what the former Python 2 print statements
    emitted, so consumers of the output see the same layout.

    Returns 0 on success, 2 when too few arguments are given.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 5:
        sys.stderr.write(
            'usage: deltasTXT_hashing.py tableName oldFile.gz newFile.gz '
            'headerFile\n'
        )
        return 2
    oldFileName, newFileName, headerFileName = argv[2:5]

    if out is None:
        # Python 3 exposes the byte stream as sys.stdout.buffer; on Python 2
        # sys.stdout itself accepts byte strings.
        out = getattr(sys.stdout, 'buffer', sys.stdout)

    events, rowData, updData = _collect_events(oldFileName, newFileName)
    headerList = _read_header(headerFileName)

    for rowId, action in events.items():
        parts = [action, b' ', rowId]
        if action != b'D':
            parts.append(b' \tNEW  ' if action == b'I' else b' \tOLD  ')
            parts.append(_format_fields(headerList, rowData[rowId]))
            if action == b'U':
                parts.append(b'\tNEW  ')
                # pop() releases the compressed row as soon as it is rendered.
                parts.append(_format_fields(headerList, updData.pop(rowId)))
        del rowData[rowId]
        parts.append(b'\n')
        out.write(b''.join(parts))
    return 0


if __name__ == '__main__':
    sys.exit(main())