python hocrmod.py -d -l nld+eng+lat -f ~/175789293-f39ddfdb-6f3e-4598-8d16-80a1f4a88b36.jpg
sort through hocr paragraphs....................................................................!
block out recognized text....................................................................!
look for missed text blocks......!
work through contours.......................................................!
Traceback (most recent call last):
  File "/home/rmast/hocrmod/hocrmod.py", line 562, in <module>
    runThruPars(img_base,pars,orig_page,args.conf,args.lang)
  File "/home/rmast/hocrmod/hocrmod.py", line 299, in runThruPars
    sortOutElement(elem, par_regions, parent_node,lang)
  File "/home/rmast/hocrmod/hocrmod.py", line 216, in sortOutElement
    x0,y0,x1,y1,_ = getBBoxInfo(elem.attrib['title'])
                                ~~~~~~~~~~~^^^^^^^^^
KeyError: 'title'
conda list
# packages in environment at /home/rmast/miniconda3/envs/hocrmod:
#
# Name Version Build Channel
_libgcc_mutex 0.1 main
_openmp_mutex 5.1 1_gnu
blas 1.0 mkl
bzip2 1.0.8 h5eee18b_6
c-ares 1.19.1 h5eee18b_0
ca-certificates 2024.7.2 h06a4308_0
cairo 1.16.0 hb05425b_5
cyrus-sasl 2.1.28 h52b45da_1
dbus 1.13.18 hb2f20db_0
eigen 3.4.0 hdb19cb5_0
expat 2.6.2 h6a678d5_0
ffmpeg 4.2.2 h20bf706_0
fontconfig 2.14.1 h55d465d_3
freetype 2.12.1 h4a9f257_0
giflib 5.2.1 h5eee18b_3
glib 2.78.4 h6a678d5_0
glib-tools 2.78.4 h6a678d5_0
gmp 6.2.1 h295c915_3
gnutls 3.6.15 he1e5248_0
graphite2 1.3.14 h295c915_1
gst-plugins-base 1.14.1 h6a678d5_1
gstreamer 1.14.1 h5eee18b_1
harfbuzz 4.3.0 hf52aaf7_2
hdf5 1.12.1 h2b7332f_3
icu 73.1 h6a678d5_0
intel-openmp 2023.1.0 hdb19cb5_46306
jpeg 9e h5eee18b_3
krb5 1.20.1 h143b758_1
lame 3.100 h7b6447c_0
lcms2 2.12 h3be6417_0
ld_impl_linux-64 2.38 h1181459_1
leptonica 1.82.0 h42c8aad_2
lerc 3.0 h295c915_0
libabseil 20240116.2 cxx17_h6a678d5_0
libarchive 3.6.2 hfab0078_4
libclang 14.0.6 default_hc6dbbc7_1
libclang13 14.0.6 default_he11475f_1
libcups 2.4.2 h2d74bed_1
libcurl 8.7.1 h251f7ec_0
libdeflate 1.17 h5eee18b_1
libedit 3.1.20230828 h5eee18b_0
libev 4.33 h7f8727e_1
libffi 3.4.4 h6a678d5_1
libgcc-ng 11.2.0 h1234567_1
libgfortran-ng 11.2.0 h00389a5_1
libgfortran5 11.2.0 h1234567_1
libglib 2.78.4 hdc74915_0
libgomp 11.2.0 h1234567_1
libiconv 1.16 h5eee18b_3
libidn2 2.3.4 h5eee18b_0
libllvm14 14.0.6 hdb19cb5_3
libnghttp2 1.57.0 h2d74bed_0
libopus 1.3.1 h7b6447c_0
libpng 1.6.39 h5eee18b_0
libpq 12.17 hdbd6064_0
libprotobuf 4.25.3 he621ea3_0
libssh2 1.11.0 h251f7ec_0
libstdcxx-ng 11.2.0 h1234567_1
libtasn1 4.19.0 h5eee18b_0
libtiff 4.5.1 h6a678d5_0
libunistring 0.9.10 h27cfd23_0
libuuid 1.41.5 h5eee18b_0
libvpx 1.7.0 h439df22_0
libwebp 1.3.2 h11a3e52_0
libwebp-base 1.3.2 h5eee18b_0
libxcb 1.15 h7f8727e_0
libxkbcommon 1.0.1 h097e994_2
libxml2 2.13.1 hfdd30dd_2
lz4-c 1.9.4 h6a678d5_1
mkl 2023.1.0 h213fc3f_46344
mkl-service 2.4.0 py311h5eee18b_1
mkl_fft 1.3.8 py311h5eee18b_0
mkl_random 1.2.4 py311hdb19cb5_0
mysql 5.7.24 h721c034_2
ncurses 6.4 h6a678d5_0
nettle 3.7.3 hbbd107a_1
numpy 1.26.4 py311h08b1b3b_0
numpy-base 1.26.4 py311hf175353_0
opencv 4.10.0 py311hab19f7d_0
openh264 2.1.1 h4ff587b_0
openjpeg 2.5.2 he7f1fd0_0
openssl 3.0.14 h5eee18b_0
packaging 24.1 py311h06a4308_0
pcre2 10.42 hebb0a14_1
pillow 10.4.0 py311h5eee18b_0
pip 24.0 py311h06a4308_0
pixman 0.40.0 h7f8727e_1
pytesseract 0.3.10 py311h06a4308_0
python 3.11.9 h955ad1f_0
qt-main 5.15.2 h53bd1ea_10
readline 8.2 h5eee18b_0
setuptools 72.1.0 py311h06a4308_0
sqlite 3.45.3 h5eee18b_0
tbb 2021.8.0 hdb19cb5_0
tesseract 5.2.0 h6a678d5_0
tk 8.6.14 h39e8969_0
tzdata 2024a h04d1e81_0
wheel 0.43.0 py311h06a4308_0
x264 1!157.20191217 h7b6447c_0
xz 5.4.6 h5eee18b_1
zlib 1.2.13 h5eee18b_1
zstd 1.5.5 hc292b87_2
The image used is from internetarchive/archive-pdf-tools#55.