Skip to content

Commit 612eded

Browse files
authored
Merge pull request #696 from alephdata/release/4.1.0
Release/4.1.0
2 parents c014ac9 + fa740d2 commit 612eded

File tree

9 files changed: 23 additions (+23) and 19 deletions (-19).

.bumpversion.cfg

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
[bumpversion]
2-
current_version = 4.0.2
2+
current_version = 4.1.0
33
tag_name = {new_version}
44
commit = True
55
tag = True

Dockerfile

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
2222
# image processing, djvu
2323
mdbtools djvulibre-bin \
2424
libtiff5-dev \
25-
libtiff-tools ghostscript librsvg2-bin jbig2dec \
25+
libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
2626
pst-utils libgif-dev \
2727
### tesseract
2828
tesseract-ocr-eng \
@@ -118,10 +118,11 @@ RUN groupadd -g 1000 -r app \
118118

119119
# Download the ftm-typepredict model
120120
RUN mkdir /models/ && \
121-
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
121+
curl --keepalive-time 2 -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
122122

123123
COPY requirements.txt /tmp/
124-
RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
124+
RUN pip install --upgrade pip setuptools
125+
RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt
125126

126127
# Install spaCy models
127128
RUN python3 -m spacy download en_core_web_sm \
@@ -147,11 +148,10 @@ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep5
147148
RUN chown -R app:app /ingestors
148149

149150
ENV ARCHIVE_TYPE=file \
150-
ARCHIVE_PATH=/data \
151-
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
152-
REDIS_URL=redis://redis:6379/0 \
153-
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
154-
LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
151+
ARCHIVE_PATH=/data \
152+
FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
153+
REDIS_URL=redis://redis:6379/0 \
154+
TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
155155

156156
# USER app
157157
CMD ingestors process

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
INGEST=ghcr.io/alephdata/ingest-file
22
COMPOSE=docker compose
3-
DOCKER=$(COMPOSE) run --rm ingest-file
3+
DOCKER=$(COMPOSE) run --rm -e LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 ingest-file
44

55
.PHONY: build
66

ingestors/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import logging
44

5-
__version__ = "4.0.2"
5+
__version__ = "4.1.0"
66

77
logging.getLogger("chardet").setLevel(logging.INFO)
88
logging.getLogger("PIL").setLevel(logging.INFO)

ingestors/media/image.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
import logging
22
from io import BytesIO
3-
from PIL import Image, ExifTags
3+
from PIL import Image, ExifTags, ImageFile
44
from followthemoney import model
55

66
from ingestors.ingestor import Ingestor
@@ -10,6 +10,9 @@
1010

1111
log = logging.getLogger(__name__)
1212

13+
# from https://stackoverflow.com/a/47958486
14+
ImageFile.LOAD_TRUNCATED_IMAGES = True
15+
1316

1417
class ImageIngestor(Ingestor, OCRSupport, TimestampSupport):
1518
"""Image file ingestor class. Extracts the text from images using OCR."""

ingestors/support/ocr.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
import threading
44
from hashlib import sha1
55
from normality import stringify
6-
from PIL import Image
6+
from PIL import Image, ImageFile
77
from io import BytesIO
88
from languagecodes import list_to_alpha3 as alpha3
99

@@ -13,6 +13,7 @@
1313

1414
log = logging.getLogger(__name__)
1515
TESSERACT_LOCALE = "C"
16+
ImageFile.LOAD_TRUNCATED_IMAGES = True
1617

1718

1819
class OCRSupport(CacheSupport):

ingestors/support/shell.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
import os
2+
import shutil
23
import subprocess
34
from servicelayer import env
4-
from distutils.spawn import find_executable
55

66
from ingestors.util import path_string
77
from ingestors.exc import ProcessingException
@@ -17,7 +17,7 @@ class ShellSupport(object):
1717
def find_command(self, name):
1818
config_name = "%s_BIN" % name
1919
config_name = config_name.replace("-", "_").upper()
20-
return env.get(config_name, find_executable(name))
20+
return env.get(config_name, shutil.which(name))
2121

2222
def exec_command(self, command, *args):
2323
binary = self.find_command(command)

requirements.txt

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ normality==2.5.0
33
pantomime==0.6.1
44
followthemoney==3.5.9
55
followthemoney-store[postgresql]==3.1.0
6-
servicelayer[google,amazon]==1.23.2
6+
servicelayer[google,amazon]==1.24.0
77
languagecodes==1.1.1
88
countrytagger==0.1.2
99
pyicu==2.12
@@ -31,7 +31,7 @@ odfpy==1.4.1
3131
cchardet==2.1.7
3232
lxml==5.0.0
3333
olefile==0.47
34-
Pillow==10.1.0
34+
Pillow==10.4.0
3535
vobject==0.9.6.1
3636
msglite==0.30.0
3737
icalendar==5.0.12
@@ -41,4 +41,4 @@ requests[security]==2.31.0
4141
pymupdf==1.21.1
4242

4343
prometheus-client==0.17.1
44-
sentry_sdk==2.0.1
44+
sentry_sdk==2.19.2

setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="ingest",
8-
version="4.0.2",
8+
version="4.1.0",
99
author="Organized Crime and Corruption Reporting Project",
1010
packages=find_packages(exclude=["tests"]),
1111
package_dir={"ingestors": "ingestors"},

0 commit comments

Comments (0)