Skip to content

Commit

Permalink
Merge pull request #6 from bertsky/fat-docker
Browse files Browse the repository at this point in the history
update, add workflow-configuration, deps-ubuntu and docker

Signed-off-by: Stefan Weil <[email protected]>
  • Loading branch information
stweil committed Nov 29, 2019
2 parents 0542d18 + aa0d831 commit bc0c425
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 69 deletions.
13 changes: 7 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,12 @@ WORKDIR /build
# so we must rely on .dockerignore here)
COPY . .

ENV BUILDDEPS="build-essential automake autoconf libtool pkg-config git"
ENV PIP_INSTALL="pip install --timeout=300 -q"
ENV PIP_INSTALL="pip install --timeout=3000 -q"

# start a shell script (so we can comment individual steps here)
RUN echo "set -x" > docker.sh
RUN echo "set -ex" > docker.sh
# get packages for build
RUN echo "apt-get -y install $BUILDDEPS make" >> docker.sh
RUN echo "apt-get -y install automake autoconf libtool pkg-config g++ git make" >> docker.sh
# create git repo just so the (unconditional) submodule update recipes don't fail
RUN echo "git init" >> docker.sh
# we want to use PREFIX as venv
Expand All @@ -61,15 +60,17 @@ RUN echo "make -j install-tesseract" >> docker.sh
RUN echo "make ${OCRD_EXECUTABLES}" >> docker.sh
# post-install fixup against conflicting requirements
RUN echo "make fix-pip" >> docker.sh
# remove build pkgs, but keep `make` for makefile-based workflow processing
RUN echo "apt-get -y remove $BUILDDEPS" >> docker.sh
# remove unneeded automatic deps and clear pkg cache
RUN echo "apt-get -y autoremove && apt-get clean" >> docker.sh
# remove source directories from image
RUN echo "rm -fr /build" >> docker.sh
# run the script in one layer/step (to minimise image size)
RUN bash docker.sh

# remove (dated) security workaround preventing use of
# ImageMagick's convert on PDF/PS/EPS/XPS:
RUN rm /etc/ImageMagick-6/policy.xml

ENV DEBIAN_FRONTEND teletype

WORKDIR /data
Expand Down
29 changes: 16 additions & 13 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Python version (python3 required).
PYTHON := python3
PIP_OPTIONS :=
PIP_INSTALL ?= pip3 install

# directory for virtual Python environment
# (but re-use if already active):
Expand All @@ -20,8 +20,8 @@ PKG_CONFIG_PATH := $(VIRTUAL_ENV)/lib/pkgconfig
export PKG_CONFIG_PATH

OCRD_EXECUTABLES = $(BIN)/ocrd # add more CLIs below
CUSTOM_INSTALL := $(BIN)/ocrd # add more non-pip installation targets below
CUSTOM_DEPS := core # add more modules which need deps-ubuntu below
CUSTOM_INSTALL = $(BIN)/ocrd # add more non-pip installation targets below
CUSTOM_DEPS = core # add more modules which need deps-ubuntu below

OCRD_MODULES := $(shell git submodule status | while read commit dir ref; do echo $$dir; done)

Expand Down Expand Up @@ -52,7 +52,7 @@ Targets:
Variables:
VIRTUAL_ENV: path to (re-)use for the virtual environment
PYTHON: name of the Python binary
PIP_OPTIONS: extra options to pass pip install like -q or -v
PIP_INSTALL: `pip install` command, optionally with extra options like `-q` or `-v`
EOF
endef
export HELP
Expand All @@ -74,13 +74,13 @@ $(OCRD_MODULES): always-update

$(ACTIVATE_VENV) $(VIRTUAL_ENV):
$(PYTHON) -m venv $(VIRTUAL_ENV)
. $(ACTIVATE_VENV) && pip install --upgrade pip
. $(ACTIVATE_VENV) && $(PIP_INSTALL) --upgrade pip

# Get Python modules.

# avoid making this .PHONY so it does not have to be repeated
$(SHARE)/numpy: | $(ACTIVATE_VENV)
. $(ACTIVATE_VENV) && pip install numpy
. $(ACTIVATE_VENV) && $(PIP_INSTALL) numpy
@touch $@

OCRD_EXECUTABLES += $(OCRD_KRAKEN)
Expand All @@ -99,12 +99,14 @@ $(OCRD_OCROPY): ocrd_ocropy
.PHONY: ocrd
ocrd: $(BIN)/ocrd
$(BIN)/ocrd: core
. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="pip install --force-reinstall $(PIP_OPTIONS)"
. $(ACTIVATE_VENV) && cd $< && make install PIP_INSTALL="$(PIP_INSTALL) --force-reinstall"
# workaround for core#351:
. $(ACTIVATE_VENV) && cd $< && make install PIP_INSTALL="$(PIP_INSTALL) --no-deps"

.PHONY: wheel
wheel: $(BIN)/wheel
$(BIN)/wheel: | $(ACTIVATE_VENV)
. $(ACTIVATE_VENV) && pip install --force-reinstall $(PIP_OPTIONS) wheel
. $(ACTIVATE_VENV) && $(PIP_INSTALL) --force-reinstall wheel

# Install Python modules from local code.

Expand Down Expand Up @@ -249,13 +251,13 @@ $(BIN)/ocrd-make: workflow-configuration
# install again forcefully without deps (to ensure
# the binary itself updates):
$(filter-out $(CUSTOM_INSTALL),$(OCRD_EXECUTABLES)):
. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
. $(ACTIVATE_VENV) && cd $< && pip install --no-deps --force-reinstall $(PIP_OPTIONS) .
. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) --no-deps --force-reinstall .

# avoid making these .PHONY so they do not have to be repeated:
# clstm tesserocr
$(SHARE)/%: % | $(ACTIVATE_VENV)
. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
@touch $@

# At last, add venv dependency (must not become first):
Expand All @@ -269,7 +271,7 @@ $(OCRD_EXECUTABLES): | $(BIN)/wheel
# - tensorflow>=2.0, tensorflow_gpu in another version
# - pillow==5.4.1 instead of >=6.2
# fix-pip: inside the venv, force-reinstall the packages whose requirements
# conflict between modules.
# - 'pillow>=6.2.0' is quoted so the shell does not treat `>` as an output
#   redirection (which would install an unpinned pillow and create a file
#   named `=6.2.0`).
# - `$$( ... )` hands the command substitution to the shell; with a single `$`
#   Make would expand it as an undefined variable and pass nothing.
#   The pipeline rewrites each `tensorflow-gpu <version>` line of `pip list`
#   to `tensorflow==<version>`; it yields nothing if tensorflow-gpu is absent.
fix-pip:
	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --force-reinstall \
		opencv-python-headless \
		'pillow>=6.2.0' \
		$$(pip list | grep tensorflow-gpu | sed -E 's/-gpu +/==/')
Expand Down Expand Up @@ -351,9 +353,10 @@ endef
# (mainly intended for docker, not recommended for live systems)
# FIXME: we should find a way to filter based on the actual executables required
deps-ubuntu: CLSTM_DEPS = scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig
deps-ubuntu: TESSERACT_DEPS = g++ make automake
deps-ubuntu: $(CUSTOM_DEPS)
set -e; for dir in $^; do $(MAKE) -C $$dir deps-ubuntu; done
apt-get -y install wget python3-venv $(CLSTM_DEPS)
apt-get -y install wget python3-venv $(TESSERACT_DEPS) $(CLSTM_DEPS)

.PHONY: docker
docker: DOCKER_TAG ?= ocrd/all
Expand Down
170 changes: 120 additions & 50 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,31 +1,52 @@
# OCR-D/ocrd_all

This is a project which gets the source of all OCR-D modules.
It also includes a Makefile for their installation into a virtual environment (venv).
This controls installation of all OCR-D modules from source.

It includes a Makefile for their installation into a virtual environment (venv) or Docker container.

(A venv is a local user directory with shell scripts to load/unload itself
in the current shell environment via PATH and PYTHONHOME.)

## Preconditions

Make sure that there is enough free disk space. 3 GiB or more is recommended for
### Space

Make sure that there is enough free disk space. 7 GiB or more is recommended for
the required submodules, build data, temporary data, installed virtual environment
and pip cache.

Install GNU git and make:
### Locale

Next, the (shell) environment must have a Unicode-based localization. (Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) This is true for most installations today, and can be verified by:

locale | fgrep .UTF-8

This should show several `LC_*` variables. Otherwise, either select another localization globally...

sudo dpkg-reconfigure locales

# Debian / Ubuntu packages.
sudo apt install git make
... or use the Unicode-based POSIX locale temporarily:

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

### System packages

Install GNU make and git, and wget if you want to download Tesseract models.

# on Debian / Ubuntu:
sudo apt install make git wget

Install the packages for Python3 development and for Python3 virtual environments
for your operating system / distribution.

# Debian / Ubuntu packages.
# on Debian / Ubuntu:
sudo apt install python3-dev python3-venv

Some modules use the Tesseract library. If your distribution provides Tesseract 4.1
or newer, install the development package:

# Debian / Ubuntu package.
# on Debian / Ubuntu:
sudo apt install libtesseract-dev

Ubuntu packages for Tesseract 5.0.0 (alpha) are available at the PPA
Expand All @@ -35,7 +56,7 @@ Otherwise or for the latest Tesseract code it can also be built locally.

Other modules will have additional system dependencies.

System dependencies for all modules on Ubuntu 18.04 (or similar) can also be automatically installed by running:
System dependencies **for all modules** on Ubuntu 18.04 (or similar) can also be installed **automatically** by running:

# on Debian / Ubuntu:
sudo apt install make git
Expand All @@ -46,49 +67,72 @@ System dependencies for all modules on Ubuntu 18.04 (or similar) can also be aut

Run `make` with optional parameters for _variables_ and _targets_ like so:

make [PYTHON=python3] [VIRTUAL_ENV=./venv] [target(s)]
make [PYTHON=python3] [VIRTUAL_ENV=./venv] [TARGET]...

### Targets

<dl>
<dt>ocrd</dt>
<dd>(default goal) Install OCR-D/core and its CLI `ocrd` into the venv.</dd>
<dt>all</dt>
<dd>Install executables from all modules into the venv. (Depends on _modules_ and _ocrd_.)</dd>
<dt>modules</dt>
<dd>Download/update all modules, but do not install anything.</dd>
<dt>deps-ubuntu</dt>
<dd>Install system packages for all modules</dd>
<dt>fix-pip</dt>
<dd>Fix incompatible/inconsistent pip requirements between all modules</dd>
<dt>docker</dt>
<dd>(Re-)build a docker image for all modules/executables.</dd>
<dt>clean</dt>
<dd>Remove the venv.</dd>
<dt>show</dt>
<dd>Print the venv directory, the module directories, and the executable names.</dd>
<dt>help</dt>
<dd>Print available targets and variables.</dd>
</dl>
#### _ocrd_

(default goal) Install OCR-D/core and its CLI `ocrd` into the venv.

#### _all_

Install executables from all modules into the venv. (Depends on _modules_ and _ocrd_.)

#### _modules_

Download/update all modules, but do not install anything.

#### _deps-ubuntu_

Install system packages for all modules.

#### _fix-pip_

Fix incompatible/inconsistent pip requirements between all modules.

#### _docker_

(Re-)build a docker image for all modules/executables.

#### _clean_

Remove the venv.

#### _show_

Print the venv directory, the module directories, and the executable names.

#### _help_

Print available targets and variables.

---

Further targets:
<dl>
<dt>[any module name]</dt>
<dd>Download/update that module, but do not install anything.</dd>
<dt>[any executable name]</dt>
<dd>Install that CLI into the venv. (Depends on that module and on _ocrd_.)</dd>
</dl>
#### _[any module name]_

Download/update that module, but do not install anything.

#### _[any executable name]_

Install that CLI into the venv. (Depends on that module and on _ocrd_.)

### Variables

<dl>
<dt>PYTHON</dt>
<dd>name of the Python binary to use (at least python3 required)</dd>
<dt>VIRTUAL_ENV</dt>
<dd>Directory prefix to use for local installation. (This is set automatically when activating a virtual environment on the shell. The build system will re-use existing venvs.)</dd>
<dt>PIP_OPTIONS</dt>
<dd>Extra options to pass to `pip install` (e.g. -q or -v)</dd>
</dl>
#### _PYTHON_

Name of the Python binary to use (Python 3 is required).

#### _VIRTUAL_ENV_

Directory prefix to use for local installation.

(This is set automatically when activating a virtual environment on the shell. The build system will re-use existing venvs.)

#### _PIP_INSTALL_

The `pip install` command to use (default: `pip3 install`), optionally with extra options like `-q` or `-v`.

### Examples

Expand All @@ -115,7 +159,24 @@ Running `make modules` downloads/updates all modules.

Running `make all` additionally installs the executables from all modules.

## Issues
### Results

To use the built executables, simply activate the virtual environment:

. ${VIRTUAL_ENV:-venv}/bin/activate
ocrd --help
ocrd-...

For the Docker image, run it as your own user with your data path mounted as a volume:

docker run -it -u $(id -u):$(id -g) -v $PWD:/data ocrd/all
ocrd --help
ocrd-...


## Challenges

This repo offers solutions to the following problems with OCR-D integration.

### No published/recent version on PyPI

Expand All @@ -125,32 +186,41 @@ The following Python modules need an installation from code for different reason
- cor-asv-ann (not available in PyPI)
- cor-asv-fst (not available in PyPI)
- dinglehopper (not available in PyPI)
- ocrd_cis (not available in PyPI; needs `ocrd>=2.0.0`)
- ocrd_tesserocr (too old in PyPI; needs `ocrd>=2.0.0`)
- ocrd_cis (not available in PyPI)
- tesserocr (too old in PyPI)

_(Solved by installation from source.)_

### Conflicting requirements

Merging all packages into one venv does not always work.
Modules may require mutually exclusive sets of dependent packages.

_pip does not stop or resolve conflicts – it merely warns._
`pip` does not even stop or resolve conflicts – it merely warns!

- `Pillow`:
* `==5.4.1` (required by ocrd_typegroups_classifier)
* `>=6.2.0` (required by all others)
- Tensorflow:
* `tensorflow-gpu==1.14.0` (required by ocrd_calamari and OCR-D-LAYoutERkennung)
* `tensorflow` (which pulls in `>=2.0` which is incompatible; required by cor-asv-ann and ocrd_keraslm)
* `tensorflow` (required by cor-asv-ann and ocrd_keraslm)

Both can be installed in parallel in different versions, but may depend on a mutually exclusive set of `tensorboard` and `tensorflow_estimator`.

Moreover, in the future, some modules (but not others) may depend on `tensorflow>=2.0`, which again is incompatible.
- OpenCV:
* `opencv-python-headless` (required by core and others, avoids pulling in X11 libraries)
* `opencv-python` (required by OCR-D-LAYoutERkennung and segmentation-runner)
* `opencv-python` (required by OCR-D-LAYoutERkennung)
* custom build on ARM...

- ...

_(Solved temporarily by post-installation `fix-pip`.)_

### System requirements

Not all modules advertise their system package requirements via `make deps-ubuntu`.

- clstm: depends on `scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig`

_(Solved by maintaining these requirements under `deps-ubuntu` here.)_

0 comments on commit bc0c425

Please sign in to comment.