-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathremoveDoubles.py
More file actions
executable file
·50 lines (44 loc) · 1.5 KB
/
removeDoubles.py
File metadata and controls
executable file
·50 lines (44 loc) · 1.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python3 -W all
"""
removeDoubles.py: select data from file2 that is not in file1
usage: removeDoubles.py file1 file2
note: used for selecting new linked data not in erik-fasttext.txt
20180831 erikt(at)xs4all.nl
"""
import re
import sys
# Remove the script name from sys.argv and keep it for use in error messages;
# after this statement sys.argv holds only the command-line arguments.
COMMAND = sys.argv.pop(0)
def processDate(date):
    """Normalize a date token of the form ``DATE=MM/DD/YY`` or ``DATE=MM/DD/YYYY``.

    The optional ``DATE=`` prefix is stripped from the month field, a single
    leading zero is removed from both the month and the day, and a year
    shorter than four characters is prefixed with ``19``.

    Args:
        date: token containing exactly two ``/`` separators.

    Returns:
        The normalized token, always starting with ``DATE=``.

    Raises:
        ValueError: if the token does not split into exactly three fields.
    """
    month, day, year = date.split("/")
    # The prefix is attached to the first field (the month), not to the day.
    month = re.sub(r"^DATE=", "", month)
    month = re.sub(r"^0", "", month)
    day = re.sub(r"^0", "", day)
    # Four or more characters: the year is already written in full.
    if re.search(r"....", year):
        return "DATE=" + month + "/" + day + "/" + year
    return "DATE=" + month + "/" + day + "/19" + year
def readFile(fileName):
    """Read a labelled data file and return its normalized lines and labels.

    Each input line is split on whitespace. The first token is taken as the
    label, the second token is normalized with processDate(), and the
    normalized date plus the remaining tokens are joined with single spaces.
    A warning is printed for every normalized line that already occurred
    earlier in the same file; duplicates are still kept in the result.

    Args:
        fileName: path of the file to read.

    Returns:
        A tuple ``(lines, labels)`` of two parallel lists: the normalized
        lines (without their label) and the corresponding labels.

    Exits:
        Any error (unreadable file, a line with fewer than two tokens, a
        malformed date) terminates the program via sys.exit() with a message
        naming the command and the file.
    """
    lines = []
    labels = []
    seen = set()  # normalized lines read so far, for O(1) duplicate checks
    try:
        # The context manager closes the file even when a line fails to parse.
        with open(fileName, "r") as inFile:
            for line in inFile:
                tokens = line.strip().split()
                labels.append(tokens.pop(0))
                date = processDate(tokens.pop(0))
                tokens.insert(0, date)
                line = " ".join(tokens)
                if line in seen:
                    print("warning: duplicate line in file " + fileName)
                seen.add(line)
                lines.append(line)
    except Exception as e:
        sys.exit(COMMAND + ": error processing file " + fileName + ": " + str(e))
    return (lines, labels)
def main(argv):
    """Print the entries of the second file that do not occur in the first.

    Both files are read with readFile(). Every normalized line of the second
    file that is absent from the first file is printed, preceded by its label.

    Args:
        argv: iterable holding exactly two items, the paths of file1 and file2
            (the script name must already have been removed).

    Exits:
        With a usage message when argv does not hold exactly two items.
    """
    try:
        fileName1, fileName2 = argv
    except ValueError:
        # Wrong number of arguments: report usage instead of a traceback.
        sys.exit("usage: removeDoubles.py file1 file2")
    lines1, _ = readFile(fileName1)
    lines2, labels2 = readFile(fileName2)
    inLines1 = set(lines1)
    for label, line in zip(labels2, lines2):
        if line not in inLines1:
            print(label, line)
# Script entry point: run main() on the command-line arguments and use its
# return value as the process exit status.
if __name__ == "__main__":
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)