-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtesserocr_batch.py
More file actions
executable file
·123 lines (113 loc) · 5.62 KB
/
tesserocr_batch.py
File metadata and controls
executable file
·123 lines (113 loc) · 5.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
from __future__ import absolute_import
import os, logging
import pickle
import multiprocessing as mp
import click
def _init_worker(worker, tessdata_prefix, language, textsuf, probsuf, choicesuf):
    """Prepare one pool process by attaching its OCR state to *worker*.

    This is a module-level function (not a closure) so that it can be
    pickled and sent to child processes under the "spawn" and "forkserver"
    start methods as well as under "fork".

    Args:
        worker: the function the pool will run; it reads back the attributes
            ``log``, ``language``, ``tessapi``, ``text``, ``probabilities``
            and ``choices`` that are set here.
        tessdata_prefix: path handed to ``PyTessBaseAPI`` as ``path``.
        language: language specification handed to ``PyTessBaseAPI`` as ``lang``.
        textsuf: file name suffix for text output (empty: stdout).
        probsuf: file name suffix for character probabilities (empty: disabled).
        choicesuf: file name suffix for alternative characters (empty: disabled).
    """
    from tesserocr import PyTessBaseAPI

    # A forked child inherits the parent's logging setup, in which case
    # basicConfig() is a no-op; a freshly started interpreter does not, so
    # configure the root logger here the same way the parent does.
    logging.basicConfig()
    log = logging.getLogger('')
    log.setLevel(logging.INFO)

    worker.log = log
    worker.language = language
    worker.tessapi = PyTessBaseAPI(path=tessdata_prefix, lang=language)
    worker.text = textsuf
    worker.probabilities = probsuf
    worker.choices = choicesuf
    worker.tessapi.SetVariable("lstm_choice_mode", "2")


@click.command(context_settings={'help_option_names': ['-h', '--help']})
@click.option('-l', '--language', default='eng', type=str, help='specify language(s) used for OCR')
@click.option('-x', '--textsuf', default='.txt', type=str, help='file name suffix to use for text output (empty to use stdout)')
@click.option('-P', '--probsuf', default='.prob', type=str, help='file name suffix to use for characters probabilities (empty to disable)')
@click.option('-C', '--choicesuf', default='.confmat', type=str, help='file name suffix to use for alternative characters and their probabilities (empty to disable)')
@click.option('-Q', '--nprocs', default=1, type=int, help='number of processes to run in parallel')
@click.argument('input_files', nargs=-1, type=click.Path(exists=True, dir_okay=False))
def process(language, textsuf, probsuf, choicesuf, nprocs, input_files):
    """Run Tesseract OCR on each of INPUT_FILES, optionally in parallel."""
    # NOTE: click shows the docstring above as the command's --help text,
    # so developer-facing details are kept in comments instead.
    #
    # Raises Exception when a requested language is not installed, and
    # SystemExit(1) when processing any input file failed.
    from tesserocr import get_languages

    logging.basicConfig()
    log = logging.getLogger('')
    log.setLevel(logging.INFO)

    # get_languages() yields (tessdata path, installed languages); query it
    # once instead of once per requested sub-language.
    default_path, installed = get_languages()
    if 'TESSDATA_PREFIX' in os.environ:
        tessdata_prefix = os.environ['TESSDATA_PREFIX']
    else:
        tessdata_prefix = default_path

    for sublanguage in language.split('+'):
        if sublanguage not in installed:
            raise Exception("configured language " + sublanguage + " is not installed")

    # Each pool process gets its own Tesseract API instance, stored as
    # attributes of the process_file function object by _init_worker.
    with mp.Pool(processes=nprocs,
                 initializer=_init_worker,
                 initargs=(process_file, tessdata_prefix, language,
                           textsuf, probsuf, choicesuf)) as pool:
        result = pool.map_async(process_file, input_files, error_callback=log.error)
        result.wait()
        if result.successful():
            log.info('all done')
        else:
            log.error('error during processing')
            # SystemExit is what exit() raises, without depending on the
            # `site` module having installed the exit() builtin.
            raise SystemExit(1)
def process_file(input_file):
    """Recognize one image file and write its text, confidences and alternatives.

    Per-process state is read from attributes of this function object
    (``log``, ``language``, ``tessapi``, ``text``, ``probabilities``,
    ``choices``), which the pool initializer must have set beforehand.

    Output files are named after *input_file* with its extension replaced
    by the configured suffix:

    * text suffix: the recognized text (written to stdout if the suffix is empty),
    * probabilities suffix: one ``symbol<TAB>confidence`` line per glyph,
      plus a ``" <TAB>1.0"`` line between words (skipped if the suffix is empty),
    * choices suffix: a pickled list with one entry per glyph, each a list
      of ``(symbol, confidence)`` alternatives, with ``[(' ', 1.0)]`` between
      words and ``[('\\n', 1.0)]`` at the end of a line (skipped if the suffix
      is empty).

    Args:
        input_file: path of the image to recognize.
    """
    import sys
    from contextlib import ExitStack
    from io import open
    from tesserocr import RIL, PSM

    CHOICE_THRESHOLD_NUM = 10  # maximum number of choices to query and annotate
    CHOICE_THRESHOLD_CONF = 1.0  # maximum score drop from best choice to query and annotate

    # get per-process state set up by the pool initializer
    log = process_file.log
    language = process_file.language
    tessapi = process_file.tessapi
    text = process_file.text
    prob = process_file.probabilities
    alts = process_file.choices

    basename = os.path.splitext(input_file)[0]

    # recognize
    tessapi.SetImageFile(input_file)
    # RAW_LINE fails with Tesseract 3 models and is worse with Tesseract 4 models
    psm = PSM.SINGLE_LINE if language == 'deu-frak' else PSM.RAW_LINE
    tessapi.SetPageSegMode(psm)

    # ExitStack closes exactly the files that were opened here. Disabled
    # outputs stay None (None is not a context manager), and stdout is not
    # registered so that it is still open for the next file.
    with ExitStack() as stack:
        if text:
            text_output = stack.enter_context(
                open(basename + text, mode='w', encoding='utf-8'))
        else:
            text_output = sys.stdout
        prob_output = None
        if prob:
            prob_output = stack.enter_context(
                open(basename + prob, mode='w', encoding='utf-8'))
        alts_output = None
        if alts:
            alts_output = stack.enter_context(
                open(basename + alts, mode='wb'))

        log.info('processing "%s"', input_file)
        tessapi.Recognize()
        text_output.write(tessapi.GetUTF8Text().rstrip(u"\f"))
        if not text:
            # stdout is never closed here, so push the text out explicitly
            text_output.flush()

        line = []
        result_it = tessapi.GetIterator()
        for _ in iterate_level(result_it, RIL.WORD):
            # word-level results (e.g. result_it.BoundingBox(RIL.WORD),
            # result_it.WordFontAttributes()) could be used here
            for _ in iterate_level(result_it, RIL.SYMBOL):
                glyph_symb = result_it.GetUTF8Text(RIL.SYMBOL)
                glyph_conf = result_it.Confidence(RIL.SYMBOL) / 100
                # glyph-level results (e.g. result_it.BoundingBox(RIL.SYMBOL))
                # could be used here
                if alts_output:
                    glyph = []
                    for choice_no, choice in enumerate(result_it.GetChoiceIterator()):
                        alt_symb = choice.GetUTF8Text()
                        alt_conf = choice.Confidence() / 100
                        # NOTE(review): `>` lets choice numbers 0..10 through,
                        # i.e. up to 11 alternatives -- confirm whether the
                        # intended maximum is 10 (`>=`).
                        if (glyph_conf - alt_conf > CHOICE_THRESHOLD_CONF or
                                choice_no > CHOICE_THRESHOLD_NUM):
                            break
                        glyph.append((alt_symb, alt_conf))
                    line.append(glyph)
                if prob_output:
                    prob_output.write("%s\t%f\n" % (glyph_symb, glyph_conf))
                if result_it.IsAtFinalElement(RIL.WORD, RIL.SYMBOL):
                    # last glyph of a word: add a separator entry
                    if not result_it.IsAtFinalElement(RIL.TEXTLINE, RIL.WORD):
                        line.append([(' ', 1.0)])
                        if prob_output:
                            prob_output.write(" \t1.0\n")
                    else:
                        line.append([('\n', 1.0)])
        if alts_output:
            pickle.dump(line, alts_output)
    tessapi.Clear()
def iterate_level(it, ril, parent=None):
    """Yield *it* once per element at level *ril* within the current parent.

    Improves over tesserocr.iterate_level by honouring multi-level
    semantics, so iterators can be combined across levels: for ``ril > 0``
    iteration stops after the element that is final within its *parent*
    level, instead of running on into the next parent.

    Args:
        it: result iterator providing ``Empty``, ``IsAtFinalElement`` and
            ``Next``; a falsy value yields nothing.
        ril: iterator level to step through.
        parent: enclosing level used for the final-element check; defaults
            to ``ril - 1``.

    Yields:
        The same iterator object *it*, positioned at each element in turn.
    """
    enclosing = ril - 1 if parent is None else parent
    while True:
        if not it or it.Empty(ril):
            return
        yield it
        # stop at the parent's last element; level 0 has no parent to respect
        if ril > 0 and it.IsAtFinalElement(enclosing, ril):
            return
        it.Next(ril)
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    process()