-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathDockerfile
More file actions
172 lines (142 loc) · 6.74 KB
/
Dockerfile
File metadata and controls
172 lines (142 loc) · 6.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Single-stage image built on Ubuntu noble.
FROM ubuntu:noble

# Python versions used later in this file:
#   APP_PYTHON_VERSION - interpreter used to create the application virtualenv
#   LO_PYTHON_VERSION  - system interpreter used for LibreOffice/UNO
ARG APP_PYTHON_VERSION=3.13
ARG LO_PYTHON_VERSION=3.12

# Proxy settings may be supplied in either upper- or lower-case form.
ARG HTTP_PROXY
ARG HTTPS_PROXY
ARG NO_PROXY
ARG http_proxy
ARG https_proxy
ARG no_proxy

# Export the proxy settings under both spellings.
# The upper-case build argument wins when it is given (unchanged behaviour);
# when only the lower-case argument is passed it is now used instead of being
# overwritten with an empty value.
ENV HTTP_PROXY=${HTTP_PROXY:-$http_proxy}
ENV HTTPS_PROXY=${HTTPS_PROXY:-$https_proxy}
ENV NO_PROXY=${NO_PROXY:-$no_proxy}
ENV http_proxy=$HTTP_PROXY
ENV https_proxy=$HTTPS_PROXY
ENV no_proxy=$NO_PROXY

# Keep apt/debconf from prompting during the package installation steps.
ENV DEBIAN_FRONTEND=noninteractive
ENV DEBIAN_PRIORITY=critical

# NVIDIA container runtime settings.
ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility,display

# Let Python generate .pyc files in the container.
# PYTHONDONTWRITEBYTECODE is deliberately left unset: CPython treats ANY
# non-empty value (including "0") as "do not write bytecode", so the previous
# `PYTHONDONTWRITEBYTECODE=0` disabled what it was meant to enable.

# Turns off buffering for easier container logging
ENV PYTHONUNBUFFERED=1

# ENV SETUPTOOLS_USE_DISTUTILS=stdlib

# The build steps that follow run as root.
USER root
# AppStream/dep11 metadata downloads are flaky while mirrors sync, so drop the
# apt hook that requests them and write a small tuning file that sets
# Acquire::Retries and Acquire::Languages.
RUN rm -f /etc/apt/apt.conf.d/50appstream \
    && printf 'Acquire::Retries "5";\nAcquire::Languages "none";\n' \
        > /etc/apt/apt.conf.d/99apt-tuning
# Refresh the package index, upgrade the base image packages and install
# software-properties-common (provides add-apt-repository used below).
RUN apt-get update -yq -o Acquire::Retries=5 \
    && apt-get upgrade -y \
    && apt-get install -y software-properties-common
# Register the additional package sources. Each repository is added with -n
# (no per-repository index refresh); the index is refreshed once afterwards
# and the installed packages are upgraded against the new sources.
RUN apt-add-repository -y -n multiverse \
    && apt-add-repository -y -n universe \
    && add-apt-repository -y -n ppa:deadsnakes/ppa \
    && add-apt-repository -y -n ppa:graphics-drivers/ppa \
    && apt-get update -yq -o Acquire::Retries=5 \
    && apt-get upgrade -y
# Install the Python toolchains: the distribution python3 packages (incl. the
# UNO bindings), the interpreter used for LibreOffice (LO_PYTHON_VERSION) and
# the interpreter used for the application (APP_PYTHON_VERSION).
# The package index is refreshed in the same layer as the install so that a
# cached earlier layer can never leave this step working from a stale index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        python3-all-dev \
        python3-dev \
        python3-pip \
        python3-uno \
        python${LO_PYTHON_VERSION} \
        libpython${LO_PYTHON_VERSION}-dev \
        python${LO_PYTHON_VERSION}-dev \
        python${LO_PYTHON_VERSION}-venv \
        python${APP_PYTHON_VERSION} \
        libpython${APP_PYTHON_VERSION}-dev \
        python${APP_PYTHON_VERSION}-dev \
        python${APP_PYTHON_VERSION}-venv
# Bring the system fully up to date, then install the download/crypto helpers
# and the compiler toolchain.
#   --force-confold / --force-confdef: keep existing config files without prompting
#   -f: fix broken dependencies, -u: list the upgraded packages
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index. Duplicate flags and the duplicated
# `gnupg` entry were removed; the package set is unchanged.
RUN apt-get update && \
    apt-get -y -f -u --no-install-recommends \
        -o Dpkg::Options::="--force-confold" \
        -o Dpkg::Options::="--force-confdef" \
        dist-upgrade && \
    apt-get install -y --no-install-recommends \
        apt-transport-https \
        build-essential \
        ca-certificates \
        curl \
        dirmngr \
        fonts-dejavu \
        g++ \
        gcc \
        gfortran \
        gnupg \
        gnupg-agent \
        libssl-dev \
        wget
##### utils for python and TESSERACT
# Pre-answer the ttf-mscorefonts-installer EULA question via debconf so the
# package can be installed without a prompt.
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
# Fonts, Ghostscript, image tooling, OpenCL ICDs and Tesseract/Leptonica.
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index. Packages are listed one per line in
# alphabetical order; the duplicated `fonts-noto-cjk` entry was removed and
# the package set is otherwise unchanged.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        fontconfig \
        fonts-croscore \
        fonts-crosextra-caladea \
        fonts-crosextra-carlito \
        fonts-dejavu \
        fonts-deva \
        fonts-ebgaramond \
        fonts-font-awesome \
        fonts-gfs-didot \
        fonts-gfs-didot-classic \
        fonts-hack \
        fonts-ibm-plex \
        fonts-inconsolata \
        fonts-junicode \
        fonts-liberation \
        fonts-liberation2 \
        fonts-mononoki \
        fonts-noto \
        fonts-noto-cjk \
        fonts-noto-core \
        fonts-noto-extra \
        fonts-open-sans \
        fonts-takao-gothic \
        fonts-urw-base35 \
        fonts-vlgothic \
        ghostscript \
        ghostscript-x \
        gsfonts \
        gsfonts-other \
        gsfonts-x11 \
        gstreamer1.0-libav \
        imagemagick \
        libcairo2-dev \
        libimage-exiftool-perl \
        liblept5 \
        libleptonica-dev \
        libpcre3 \
        libpcre3-dev \
        libsm6 \
        libtcnative-1 \
        libtesseract-dev \
        libtesseract5 \
        libvips \
        libvips-dev \
        libvips-tools \
        libxext6 \
        mesa-opencl-icd \
        pocl-opencl-icd \
        tesseract-ocr \
        tesseract-ocr-all \
        ttf-mscorefonts-installer \
        xfonts-terminus
# Tesseract language and script packages.
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index. The duplicated `tesseract-ocr-osd`
# entry was removed; the package set is otherwise unchanged.
RUN apt-get update && \
    apt-get install -y --no-install-recommends --fix-missing \
        tesseract-ocr-ces \
        tesseract-ocr-dan \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-enm \
        tesseract-ocr-fra \
        tesseract-ocr-frk \
        tesseract-ocr-ita \
        tesseract-ocr-lat \
        tesseract-ocr-nld \
        tesseract-ocr-nor \
        tesseract-ocr-osd \
        tesseract-ocr-ron \
        tesseract-ocr-script-grek \
        tesseract-ocr-script-latn \
        tesseract-ocr-slk \
        tesseract-ocr-spa \
        tesseract-ocr-swe
# Pillow package requirements (development headers and runtime libraries).
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libfreetype6-dev \
        libfribidi-dev \
        libgl1 \
        libglib2.0-dev \
        libharfbuzz-dev \
        libjpeg8-dev \
        liblcms2-dev \
        libopenjp2-7-dev \
        libtiff5-dev \
        libwebp-dev \
        libxcb1-dev \
        tcl8.6-dev \
        tk8.6-dev \
        zlib1g-dev
# python3 poppler requirement
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends poppler-utils
# LibreOffice, its Python script provider and a Java runtime.
# The index is refreshed in the same layer so a cached earlier layer cannot
# leave this step with a stale index.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        default-jre \
        libreoffice \
        libreoffice-java-common \
        libreoffice-script-provider-python
# build font cache
RUN fc-cache -f -v
# there is a bug in the blinker package that causes issues with uwsgi
# (this removes software-properties-common)
# `apt-get` is used instead of `apt`, whose command-line interface is not
# meant for scripts. Removal, autoremove and cache/index cleanup run in one
# layer; `apt-get clean` already covers what `autoclean` would delete.
RUN apt-get remove -y python3-blinker && \
    apt-get autoremove --purge -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# other openCL packages (not installed):
# beignet-opencl-icd
# Application directory: WORKDIR creates /ocr_service and makes it the
# working directory for the remaining steps; the build context is then copied
# into it.
WORKDIR /ocr_service
COPY ./ /ocr_service
# --- System-wide unoserver for the LibreOffice/UNO interpreter ---
# The UNO Python bindings belong to the system Python and do not work inside
# a virtualenv, so unoserver is installed globally for
# /usr/bin/python${LO_PYTHON_VERSION} (hence --break-system-packages).
# This is done before the virtualenv is created.
# The version is taken from an exact `unoserver==X` pin in requirements.txt
# so both installations match; without such a pin the latest release is used.
RUN UNOSERVER_PIN="$(awk -F'==' '/^unoserver==/ {print $2; exit}' requirements.txt || true)" && \
    if [ -z "$UNOSERVER_PIN" ]; then \
        echo "No exact pin found for unoserver in requirements.txt; installing latest system unoserver"; \
        UNOSERVER_SPEC="unoserver"; \
    else \
        echo "Installing system unoserver==$UNOSERVER_PIN"; \
        UNOSERVER_SPEC="unoserver==$UNOSERVER_PIN"; \
    fi && \
    /usr/bin/python${LO_PYTHON_VERSION} -m pip install --no-cache-dir --break-system-packages "$UNOSERVER_SPEC"
# --- end unoserver system install ---
# Use a virtual environment for Python deps (single-stage build).
# Putting its bin directory first on PATH makes the venv's python/pip the
# default for later steps and at runtime.
ENV VIRTUAL_ENV=/opt/venv
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Interpreter used for LibreOffice/UNO; follows LO_PYTHON_VERSION instead of
# hard-coding 3.12, so overriding the build argument keeps the two in sync.
ENV LIBRE_OFFICE_PYTHON_PATH=/usr/bin/python${LO_PYTHON_VERSION}
# Install the application into a virtual environment created with
# python${APP_PYTHON_VERSION} while keeping LibreOffice/UNO on the system Python.
# The middle step previously invoked the venv interpreter with no arguments,
# which does nothing useful in a build; it now upgrades pip inside the venv
# before the requirements are installed.
RUN python${APP_PYTHON_VERSION} -m venv "$VIRTUAL_ENV" && \
    "$VIRTUAL_ENV/bin/python" -m pip install --no-cache-dir --upgrade pip && \
    "$VIRTUAL_ENV/bin/pip" install --no-cache-dir -r ./requirements.txt
# Byte-compile the application sources with the venv interpreter.
RUN "$VIRTUAL_ENV/bin/python" -m compileall /ocr_service
# Run as a non-root user by default (for Kubernetes restricted policies).
# The numeric UID/GID can be overridden at build time.
ARG OCR_SERVICE_UID=10001
ARG OCR_SERVICE_GID=10001
# Create the ocrsvc group and user (home /home/ocrsvc, no login shell), make
# sure the tmp and log directories exist, and hand those directories plus the
# home directory to ocrsvc. The rest of /ocr_service keeps its ownership.
RUN groupadd --system --gid "$OCR_SERVICE_GID" ocrsvc \
    && useradd \
        --system \
        --uid "$OCR_SERVICE_UID" \
        --gid "$OCR_SERVICE_GID" \
        --create-home \
        --home-dir /home/ocrsvc \
        --shell /usr/sbin/nologin \
        ocrsvc \
    && mkdir -p /ocr_service/tmp /ocr_service/log \
    && chown -R ocrsvc:ocrsvc /ocr_service/tmp /ocr_service/log /home/ocrsvc
ENV HOME=/home/ocrsvc
USER ocrsvc
# Start the service; the script path is relative to the working directory.
CMD ["/bin/bash", "start_service_production.sh"]