ncd/ncds.py at master · DavyLandman/ncd · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python3

import sys
import os
import lzma
import io
import csv
from multiprocessing import Pool
import multiprocessing

def progress(s):
  """Write the status message `s` to stderr, followed by a newline.

  Status output goes to stderr so that it never mixes with the results
  the script prints on stdout.
  """
  for piece in (s, '\n'):
    sys.stderr.write(piece)

# Inputs are the command-line arguments; empty files are dropped.
# The list is sorted so rows and columns of the output have a stable order.
files = sorted(f for f in sys.argv[1:] if os.path.getsize(f) > 0)
# On-disk size of every kept file, in the same order as `files`.
sizes = [os.path.getsize(f) for f in files]

progress("Reading all files")
# Raw bytes of every input, keyed by the path given on the command line.
contents = {}
for f in files:
  # Open through a context manager so each handle is closed right after the
  # read instead of lingering until the garbage collector gets to it.
  with open(f, "rb") as fh:
    contents[f] = fh.read()


# LZMA2 only accepts dictionary sizes between 4 KiB and 1.5 GiB (inclusive).
# The size derived from the inputs is clamped into that range, otherwise very
# small or very large inputs make the compressor reject the filter options.
_DICT_SIZE_MIN = 4 * 1024            # 4 KiB
_DICT_SIZE_MAX = 1536 * 1024 * 1024  # 1.5 GiB

# Filter chain used for every raw LZMA compression done by Z().
lzma_filters = my_filters = [
    {
      "id": lzma.FILTER_LZMA2,
      "preset": 9 | lzma.PRESET_EXTREME,
      # a big enough dictionary, but not more than needed, saves memory
      # (default=0 keeps an empty file list from raising in max())
      "dict_size": min(max(max(sizes, default=0) * 10, _DICT_SIZE_MIN), _DICT_SIZE_MAX),
      "lc": 3,
      "lp": 0,
      "pb": 0, # assume ascii
      "mode": lzma.MODE_NORMAL,
      "nice_len": 273,
      "mf": lzma.MF_BT4
    }
]

def Z(contents):
  """Return the compressed size of `contents` in bytes.

  The data is compressed as a raw LZMA stream (no container header) using
  the module-level `lzma_filters` chain; only the resulting length is kept.

  Note: inside this function the parameter hides the module-level
  `contents` dictionary of the same name.
  """
  packed = lzma.compress(
    contents,
    format=lzma.FORMAT_RAW,
    filters=lzma_filters,
  )
  return len(packed)

progress("Compressing all files")
# Compressed size (in bytes) of every input on its own, keyed by file name.
compressed = {}
for path in files:
  compressed[path] = Z(contents[path])

# With a single input there is no pair to compare: print the size mapping
# and stop.
if len(compressed) == 1:
  print(compressed)
  sys.exit()


def ncd(fa,fb):
  """Return the normalized compression distance between files `fa` and `fb`.

  Both arguments are keys of the module-level `compressed` and `contents`
  dictionaries. The distance is

      (Z(a + b) - min(Z(a), Z(b))) / max(Z(a), Z(b))

  where Z is the compressed size and `a + b` is the byte concatenation of
  the two files.
  """
  # The individual sizes were computed up front; only the concatenation
  # has to be compressed here.
  smaller, larger = sorted((compressed[fa], compressed[fb]))
  joint = Z(contents[fa] + contents[fb])
  return (joint - smaller) / larger

# Exactly two inputs: print their single distance instead of a CSV matrix.
if len(compressed) == 2:
  print(ncd(*files[:2]))
  sys.exit()

progress("Calculating ncd's")


def calculate_row(f):
  """Build the output row for file `f`.

  The row is a dict with the file name under "file" and one entry per name
  in `files`. Only entries whose column name sorts at or before `f` get a
  distance (the lower triangle including the diagonal); the remaining
  columns are left as empty strings.
  """
  row = {"file" : f}
  row.update(
    (other, ncd(f, other) if f >= other else '')
    for other in files
  )
  return row

# Finished rows, keyed by the file name stored in each row's "file" entry.
data = {}


def add_result(row):
  """Store a finished `row` in `data`, reporting progress along the way.

  A percentage line is written through progress() whenever the number of
  rows stored so far (before adding this one) is a multiple of ten.
  """
  finished = len(data)
  if finished % 10 == 0:
    percent = (float(finished) / len(files)) * 100
    progress("progress: %f %%" % percent)
  data[row["file"]] = row

# Only the process that was started as the script may create the pool.
# Worker processes launched with the "spawn" or "forkserver" start methods
# import this module again, and without this guard each of them would try
# to start a pool of its own.
if __name__ == "__main__":
  with Pool(multiprocessing.cpu_count()) as p:
    # One task per row; add_result() collects each finished row into `data`.
    pending = [
      p.apply_async(calculate_row, args = (f,), callback = add_result)
      for f in files
    ]
    p.close()
    p.join()

  # apply_async() swallows worker exceptions unless the result is fetched.
  # All tasks are finished here, so get() returns immediately or re-raises
  # the worker's error instead of failing later with a KeyError on `data`.
  for result in pending:
    result.get()

  # Emit the matrix as CSV: a "file" column followed by one column per input.
  output =  csv.DictWriter(sys.stdout, ["file"] + files)
  output.writeheader()
  for f in files:
    output.writerow(data[f])