-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathncds.py
More file actions
executable file
·87 lines (66 loc) · 1.8 KB
/
ncds.py
File metadata and controls
executable file
·87 lines (66 loc) · 1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3
import sys
import os
import lzma
import io
import csv
from multiprocessing import Pool
import multiprocessing
def progress(s):
    """Write the message ``s`` to standard error, followed by a newline."""
    err = sys.stderr
    # Message first, then the line terminator, as two separate writes.
    for piece in (s, '\n'):
        err.write(piece)
# Only non-empty files named on the command line take part; sorting them
# fixes the row and column order used by everything that follows.
files = sorted(f for f in sys.argv[1:] if os.path.getsize(f) > 0)
if not files:
    # Without this check the later max(sizes) fails with an opaque
    # "max() arg is an empty sequence" ValueError.
    sys.exit("ncds: need at least one non-empty input file")
sizes = [os.path.getsize(f) for f in files]
progress("Reading all files")
# Map each file name to its raw bytes. Reading inside a context manager
# closes every descriptor promptly instead of leaving it to the garbage
# collector.
contents = {}
for f in files:
    with open(f, "rb") as handle:
        contents[f] = handle.read()
# liblzma rejects LZMA2 dictionary sizes outside 4 KiB .. 1.5 GiB, so the
# size derived from the inputs is clamped into that range.
_LZMA_DICT_MIN = 4 * 1024
_LZMA_DICT_MAX = 1536 * 1024 * 1024

# Single-filter raw LZMA2 chain used for every compression in this script.
lzma_filters = my_filters = [
    {
        "id": lzma.FILTER_LZMA2,
        "preset": 9 | lzma.PRESET_EXTREME,
        # A big enough dictionary (ten times the largest input), but not more
        # than needed, saves memory. Clamped so tiny or huge inputs do not
        # make lzma.compress raise.
        "dict_size": min(max(max(sizes) * 10, _LZMA_DICT_MIN), _LZMA_DICT_MAX),
        "lc": 3,
        "lp": 0,
        "pb": 0,  # assume ascii
        "mode": lzma.MODE_NORMAL,
        "nice_len": 273,
        "mf": lzma.MF_BT4,
    }
]
def Z(contents):
    """Return the size in bytes of ``contents`` after raw LZMA compression.

    The data is compressed with the module-level ``lzma_filters`` chain in
    ``FORMAT_RAW``; only the length of the result is returned.
    """
    packed = lzma.compress(
        contents,
        format=lzma.FORMAT_RAW,
        filters=lzma_filters,
    )
    return len(packed)
progress("Compressing all files")
# Compressed size of every input on its own, keyed by file name.
compressed = dict((name, Z(contents[name])) for name in files)

# A single input has nothing to be compared with: print its size and stop.
if len(compressed) == 1:
    print(compressed)
    sys.exit()
def ncd(fa, fb):
    """Return the normalized compression distance between files ``fa`` and ``fb``.

    Computed as ``(Z(a + b) - min(Z(a), Z(b))) / max(Z(a), Z(b))``, where the
    individual sizes come from ``compressed`` and the joint size is obtained
    by compressing the concatenated contents.
    """
    size_a, size_b = compressed[fa], compressed[fb]
    joint = Z(contents[fa] + contents[fb])
    smaller, larger = sorted((size_a, size_b))
    return (joint - smaller) / larger
# Exactly two distinct inputs: print their single distance and stop.
if len(compressed) == 2:
    print(ncd(*files[:2]))
    sys.exit()

progress("Calculating ncd's")
def calculate_row(f):
    """Build one output row for file ``f``.

    The row maps ``"file"`` to ``f`` and every name in ``files`` to its
    distance from ``f``. Only entries whose name does not sort after ``f``
    are computed; the rest are left as empty strings.
    """
    row = {"file": f}
    row.update(
        (other, ncd(f, other) if f >= other else '')
        for other in files
    )
    return row
# Finished rows, keyed by the file each row belongs to.
data = dict()


def add_result(row):
    """Store a finished ``row`` in ``data``, reporting progress every tenth row."""
    done = len(data)
    if not done % 10:
        percent = (float(done) / len(files)) * 100
        progress("progress: %f %%" % percent)
    data[row["file"]] = row
# Guard the pool start-up: with the "spawn" and "forkserver" start methods
# every worker re-imports this script, and an unguarded Pool() would be
# created again inside each worker.
if __name__ == "__main__":
    with Pool(multiprocessing.cpu_count()) as p:
        # One task per file; rows arrive in completion order. Iterating the
        # results re-raises any worker exception here instead of silently
        # losing the row and failing later with a KeyError.
        for row in p.imap_unordered(calculate_row, files):
            add_result(row)

    # Emit the collected rows as CSV on stdout, one row per file, in the
    # same order as the header columns.
    output = csv.DictWriter(sys.stdout, ["file"] + files)
    output.writeheader()
    for f in files:
        output.writerow(data[f])