Merge pull request #6 from bertsky/fat-docker

update, add workflow-configuration, deps-ubuntu and docker
OCR-D · Nov 29, 2019 · 380894d · 380894d
2 parents 0542d18 + aa0d831
commit 380894d
Show file tree

Hide file tree

Showing 3 changed files with 143 additions and 69 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -42,13 +42,12 @@ WORKDIR /build
 #  so we must rely on .dockerignore here)
 COPY . .
 
-ENV BUILDDEPS="build-essential automake autoconf libtool pkg-config git"
-ENV PIP_INSTALL="pip install --timeout=300 -q"
+ENV PIP_INSTALL="pip install --timeout=3000 -q"
 
 # start a shell script (so we can comment individual steps here)
-RUN echo "set -x" > docker.sh
+RUN echo "set -ex" > docker.sh
 # get packages for build
-RUN echo "apt-get -y install $BUILDDEPS make" >> docker.sh
+RUN echo "apt-get -y install automake autoconf libtool pkg-config g++ git make" >> docker.sh
 # create git repo just so the (unconditional) submodule update recipes don't fail
 RUN echo "git init" >> docker.sh
 # we want to use PREFIX as venv
@@ -61,15 +60,17 @@ RUN echo "make -j install-tesseract" >> docker.sh
 RUN echo "make ${OCRD_EXECUTABLES}" >> docker.sh
 # post-install fixup against conflicting requirements
 RUN echo "make fix-pip" >> docker.sh
-# remove build pkgs, but keep `make` for makefile-based workflow processing
-RUN echo "apt-get -y remove $BUILDDEPS" >> docker.sh
 # remove unneeded automatic deps and clear pkg cache
 RUN echo "apt-get -y autoremove && apt-get clean" >> docker.sh
 # remove source directories from image
 RUN echo "rm -fr /build" >> docker.sh
 # run the script in one layer/step (to minimise image size)
 RUN bash docker.sh
 
+# remove (dated) security workaround preventing use of
+# ImageMagick's convert on PDF/PS/EPS/XPS:
+RUN rm /etc/ImageMagick-6/policy.xml
+
 ENV DEBIAN_FRONTEND teletype
 
 WORKDIR /data

diff --git a/Makefile b/Makefile
@@ -2,7 +2,7 @@
 
 # Python version (python3 required).
 PYTHON := python3
-PIP_OPTIONS :=
+PIP_INSTALL ?= pip3 install
 
 # directory for virtual Python environment
 # (but re-use if already active):
@@ -20,8 +20,8 @@ PKG_CONFIG_PATH := $(VIRTUAL_ENV)/lib/pkgconfig
 export PKG_CONFIG_PATH
 
 OCRD_EXECUTABLES = $(BIN)/ocrd # add more CLIs below
-CUSTOM_INSTALL := $(BIN)/ocrd # add more non-pip installation targets below
-CUSTOM_DEPS := core # add more modules which need deps-ubuntu below
+CUSTOM_INSTALL = $(BIN)/ocrd # add more non-pip installation targets below
+CUSTOM_DEPS = core # add more modules which need deps-ubuntu below
 
 OCRD_MODULES := $(shell git submodule status | while read commit dir ref; do echo $$dir; done)
 
@@ -52,7 +52,7 @@ Targets:
 Variables:
 	VIRTUAL_ENV: path to (re-)use for the virtual environment
 	PYTHON: name of the Python binary
-	PIP_OPTIONS: extra options to pass pip install like -q or -v
+	PIP_INSTALL: `pip install` command, optionally with extra options like `-q` or `-v`
 EOF
 endef
 export HELP
@@ -74,13 +74,13 @@ $(OCRD_MODULES): always-update
 
 $(ACTIVATE_VENV) $(VIRTUAL_ENV):
 	$(PYTHON) -m venv $(VIRTUAL_ENV)
-	. $(ACTIVATE_VENV) && pip install --upgrade pip
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --upgrade pip
 
 # Get Python modules.
 
 # avoid making this .PHONY so it does not have to be repeated
 $(SHARE)/numpy: | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && pip install numpy
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) numpy
 	@touch $@
 
 OCRD_EXECUTABLES += $(OCRD_KRAKEN)
@@ -99,12 +99,14 @@ $(OCRD_OCROPY): ocrd_ocropy
 .PHONY: ocrd
 ocrd: $(BIN)/ocrd
 $(BIN)/ocrd: core
-	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="pip install --force-reinstall $(PIP_OPTIONS)"
+	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="$(PIP_INSTALL) --force-reinstall"
+	# workaround for core#351:
+	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="$(PIP_INSTALL) --no-deps"
 
 .PHONY: wheel
 wheel: $(BIN)/wheel
 $(BIN)/wheel: | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && pip install --force-reinstall $(PIP_OPTIONS) wheel
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --force-reinstall wheel
 
 # Install Python modules from local code.
 
@@ -249,13 +251,13 @@ $(BIN)/ocrd-make: workflow-configuration
 # install again forcefully without deps (to ensure
 # the binary itself updates):
 $(filter-out $(CUSTOM_INSTALL),$(OCRD_EXECUTABLES)):
-	. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
-	. $(ACTIVATE_VENV) && cd $< && pip install --no-deps --force-reinstall $(PIP_OPTIONS) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) --no-deps --force-reinstall .
 
 # avoid making these .PHONY so they do not have to be repeated:
 # clstm tesserocr
 $(SHARE)/%: % | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
 	@touch $@
 
 # At last, add venv dependency (must not become first):
@@ -269,7 +271,7 @@ $(OCRD_EXECUTABLES): | $(BIN)/wheel
 # - tensorflow>=2.0, tensorflow_gpu in another version
 # - pillow==5.4.1 instead of >=6.2
 fix-pip:
-	pip install $(PIP_OPTIONS) --force-reinstall \
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --force-reinstall \
 		opencv-python-headless \
-		pillow>=6.2.0 \
-		$(pip list | grep tensorflow-gpu | sed -E 's/-gpu +/==/')
+		'pillow>=6.2.0' \
+		$$(pip list | grep tensorflow-gpu | sed -E 's/-gpu +/==/')
@@ -351,9 +353,10 @@ endef
 # (mainly intended for docker, not recommended for live systems)
 # FIXME: we should find a way to filter based on the actual executables required
 deps-ubuntu: CLSTM_DEPS = scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig
+deps-ubuntu: TESSERACT_DEPS = g++ make automake
 deps-ubuntu: $(CUSTOM_DEPS)
 	set -e; for dir in $^; do $(MAKE) -C $$dir deps-ubuntu; done
-	apt-get -y install wget python3-venv $(CLSTM_DEPS)
+	apt-get -y install wget python3-venv $(TESSERACT_DEPS) $(CLSTM_DEPS)
 
 .PHONY: docker
 docker: DOCKER_TAG ?= ocrd/all

diff --git a/README.md b/README.md
@@ -1,31 +1,52 @@
 # OCR-D/ocrd_all
 
-This is a project which gets the source of all OCR-D modules.
-It also includes a Makefile for their installation into a virtual environment (venv).
+This controls installation of all OCR-D modules from source.
+
+It includes a Makefile for their installation into a virtual environment (venv) or Docker container.
+
 (A venv is a local user directory with shell scripts to load/unload itself
 in the current shell environment via PATH and PYTHONHOME.)
 
 ## Preconditions
 
-Make sure that there is enough free disk space. 3 GiB or more is recommended for
+### Space
+
+Make sure that there is enough free disk space. 7 GiB or more is recommended for
 the required submodules, build data, temporary data, installed virtual environment
 and pip cache.
 
-Install GNU git and make:
+### Locale
+
+Next, the (shell) environment must have a Unicode-based localization. (Otherwise Python code based on `click` will not work, i.e. most OCR-D CLIs.) This is true for most installations today, and can be verified by:
+
+    locale | fgrep .UTF-8
+
+This should show several `LC_*` variables. Otherwise, either select another localization globally...
+
+    sudo dpkg-reconfigure locales
 
-    # Debian / Ubuntu packages.
-    sudo apt install git make
+... or use the Unicode-based POSIX locale temporarily:
+
+    export LC_ALL=C.UTF-8
+    export LANG=C.UTF-8
+
+### System packages
+
+Install GNU make and git, and wget if you want to download Tesseract models.
+
+    # on Debian / Ubuntu:
+    sudo apt install make git wget
 
 Install the packages for Python3 development and for Python3 virtual environments
 for your operating system / distribution.
 
-    # Debian / Ubuntu packages.
+    # on Debian / Ubuntu:
     sudo apt install python3-dev python3-venv
 
 Some modules use the Tesseract library. If your distribution provides Tesseract 4.1
 or newer, install the development package:
 
-    # Debian / Ubuntu package.
+    # on Debian / Ubuntu:
     sudo apt install libtesseract-dev
 
 Ubuntu packages for Tesseract 5.0.0 (alpha) are available at the PPA
@@ -35,7 +56,7 @@ Otherwise or for the latest Tesseract code it can also be built locally.
 
 Other modules will have additional system dependencies.
 
-System dependencies for all modules on Ubuntu 18.04 (or similar) can also be automatically installed by running:
+System dependencies **for all modules** on Ubuntu 18.04 (or similar) can also be installed **automatically** by running:
 
     # on Debian / Ubuntu:
     sudo apt install make git
@@ -46,49 +67,72 @@ System dependencies for all modules on Ubuntu 18.04 (or similar) can also be aut
 
 Run `make` with optional parameters for _variables_ and _targets_ like so:
 
-    make [PYTHON=python3] [VIRTUAL_ENV=./venv] [target(s)]
+    make [PYTHON=python3] [VIRTUAL_ENV=./venv] [TARGET]...
 
 ### Targets
 
-<dl>
-  <dt>ocrd</dt>
-  <dd>(default goal) Install OCR-D/core and its CLI `ocrd` into the venv.</dd>
-  <dt>all</dt>
-  <dd>Install executables from all modules into the venv. (Depends on _modules_ and _ocrd_.)</dd>
-  <dt>modules</dt>
-  <dd>Download/update all modules, but do not install anything.</dd>
-  <dt>deps-ubuntu</dt>
-  <dd>Install system packages for all modules</dd>
-  <dt>fix-pip</dt>
-  <dd>Fix incompatible/inconsistent pip requirements between all modules</dd>
-  <dt>docker</dt>
-  <dd>(Re-)build a docker image for all modules/executables.</dd>
-  <dt>clean</dt>
-  <dd>Remove the venv.</dd>
-  <dt>show</dt>
-  <dd>Print the venv directory, the module directories, and the executable names.</dd>
-  <dt>help</dt>
-  <dd>Print available targets and variables.</dd>
-</dl>
+#### _ocrd_
+
+(default goal) Install OCR-D/core and its CLI `ocrd` into the venv.
+
+#### _all_
+
+Install executables from all modules into the venv. (Depends on _modules_ and _ocrd_.)
+
+#### _modules_
+
+Download/update all modules, but do not install anything.
+
+#### _deps-ubuntu_
+
+Install system packages for all modules
+
+#### _fix-pip_
+
+Fix incompatible/inconsistent pip requirements between all modules
+
+#### _docker_
+
+(Re-)build a docker image for all modules/executables.
+
+#### _clean_
+
+Remove the venv.
+
+#### _show_
+
+Print the venv directory, the module directories, and the executable names.
+
+#### _help_
+
+Print available targets and variables.
+
+---
 
 Further targets:
-<dl>
-  <dt>[any module name]</dt>
-  <dd>Download/update that module, but do not install anything.</dd>
-  <dt>[any executable name]</dt>
-  <dd>Install that CLI into the venv. (Depends on that module and on _ocrd_.)</dd>
-</dl>
+#### _[any module name]_
+
+Download/update that module, but do not install anything.
+
+#### _[any executable name]_
+
+Install that CLI into the venv. (Depends on that module and on _ocrd_.)
 
 ### Variables
 
-<dl>
-  <dt>PYTHON</dt>
-  <dd>name of the Python binary to use (at least python3 required)</dd>
-  <dt>VIRTUAL_ENV</dt>
-  <dd>Directory prefix to use for local installation. (This is set automatically when activating a virtual environment on the shell. The build system will re-use existing venvs.)</dd>
-  <dt>PIP_OPTIONS</dt>
-  <dd>Extra options to pass to `pip install` (e.g. -q or -v)</dd>
-</dl>
+#### _PYTHON_
+
+name of the Python binary to use (at least python3 required)
+
+#### _VIRTUAL_ENV_
+
+Directory prefix to use for local installation. 
+
+(This is set automatically when activating a virtual environment on the shell. The build system will re-use existing venvs.)
+
+#### _PIP_INSTALL_
+
+Command to use for `pip install`, optionally with extra options (e.g. `-q` or `-v`)
 
 ### Examples
 
@@ -115,7 +159,24 @@ Running `make modules` downloads/updates all modules.
 
 Running `make all` additionally installs the executables from all modules.
 
-## Issues
+### Results
+
+To use the built executables, simply activate the virtual environment:
+
+    . ${VIRTUAL_ENV:-venv}/bin/activate
+    ocrd --help
+    ocrd-...
+
+For the Docker image, run it with your data path mounted as a user:
+
+    docker run -it -u $(id -u):$(id -g) -v $PWD:/data ocrd/all
+    ocrd --help
+    ocrd-...
+
+
+## Challenges
+
+This repo offers solutions to the following problems with OCR-D integration.
 
 ### No published/recent version on PyPI
 
@@ -125,32 +186,41 @@ The following Python modules need an installation from code for different reason
 - cor-asv-ann (not available in PyPI)
 - cor-asv-fst (not available in PyPI)
 - dinglehopper (not available in PyPI)
-- ocrd_cis (not available in PyPI; needs `ocrd>=2.0.0`)
-- ocrd_tesserocr (too old in PyPI; needs `ocrd>=2.0.0`)
+- ocrd_cis (not available in PyPI)
 - tesserocr (too old in PyPI)
 
+_(Solved by installation from source.)_
+
 ### Conflicting requirements
 
 Merging all packages into one venv does not always work.
 Modules may require mutually exclusive sets of dependent packages.
 
-_pip does not stop or resolve conflicts – it merely warns._
+`pip` does not even stop or resolve conflicts – it merely warns!
 
 - `Pillow`:
    * `==5.4.1` (required by ocrd_typegroups_classifier)
    * `>=6.2.0` (required by all others)
 - Tensorflow:
    * `tensorflow-gpu==1.14.0` (required by ocrd_calamari and OCR-D-LAYoutERkennung)
-   * `tensorflow` (which pulls in `>=2.0` which is incompatible; required by cor-asv-ann and ocrd_keraslm)
+   * `tensorflow` (required by cor-asv-ann and ocrd_keraslm)
+
+   Both can be installed in parallel in different versions, but may depend on a mutually exclusive set of `tensorboard` and `tensorflow_estimator`.
+
+   Moreover, in the future, some modules (but not others) may depend on `tensorflow>=2.0`, which again is incompatible.
 - OpenCV:
    * `opencv-python-headless` (required by core and others, avoids pulling in X11 libraries)
-   * `opencv-python` (required by OCR-D-LAYoutERkennung and segmentation-runner)
+   * `opencv-python` (required by OCR-D-LAYoutERkennung)
    * custom build on ARM...
 
 - ...
 
+_(Solved temporarily by post-installation `fix-pip`.)_
+
 ### System requirements
 
 Not all modules advertise their system package requirements via `make deps-ubuntu`.
 
 - clstm: depends on `scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig`
+
+_(Solved by maintaining these requirements under `deps-ubuntu` here.)_