-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_folder.py
More file actions
74 lines (64 loc) · 2.5 KB
/
process_folder.py
File metadata and controls
74 lines (64 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import sys
from bs4 import BeautifulSoup as BS
from pyhOCR.parse import hOCR_file_to_page
from pyhOCR.analytics import generate_page_stats
from pyhOCR.objects import Text
import re
from PIL import Image
from scipy.cluster import vq
import numpy as np
directory = sys.argv[1]
def create_initial_text(source_dir):
""" Takes a directory path and returns a Text object """
num = re.compile('\d\d\d\d')
for root, dirs, files in os.walk(source_dir):
pages = []
for f in files:
# Create a page object from the file
page = hOCR_file_to_page(root + f)
# Find the page number in the file name
page.number = int(num.search(f).group())
pages.append(page)
# Sort the pages into the correct order
sorted_pages = sorted(pages, key=lambda p: p.number)
text = Text(sorted_pages)
return text
def normalise_vectors(vector_list):
""" Takes a list of vectors and normalises all values
in those vectors between 0 and 1 """
for i in range(0, len(vector_list[0])):
max_value = max([vector[i] for vector in vector_list])
for vector in vector_list:
vector[i] = float(vector[i]) / max_value
return vector_list
def text_statistics(text):
""" Generates a list of page statistics vectors for a text """
page_stats = []
for page in text.pages:
# Taking only the value from the value, label tuples.
page_stats.append(list(stat[1] for stat in generate_page_stats(page)))
return page_stats
def cluster_pages(vector_spaces, no_of_clusters=5):
""" Use k-means clustering """
scaled_vectors = vq.whiten(np.array(vector_spaces))
centroids, labels = vq.kmeans2(scaled_vectors, no_of_clusters, minit='points')
return labels
def page_centroids(vector_spaces, no_of_clusters=5):
""" Use k-means clustering """
scaled_vectors = vq.whiten(np.array(vector_spaces))
codebook, distortion = vq.kmeans(scaled_vectors, 5)
return codebook
def relabel_types(labels):
""" Modifies labels to ascend in order of appearance """
mapping_dict = dict()
i = 1
for label in labels:
if not mapping_dict.get(label):
mapping_dict[label] = i
i += 1
return list([mapping_dict[label] for label in labels ])
text = create_initial_text(directory)
numbers = [page.number for page in text.pages]
for n, t in zip(numbers, relabel_types(cluster_pages(normalise_vectors(text_statistics(text))))):
print(n, t )