OCR-D · stweil · Nov 29, 2019 · Nov 18, 2019 · Nov 19, 2019 · Nov 19, 2019
diff --git a/Dockerfile b/Dockerfile
@@ -42,13 +42,12 @@ WORKDIR /build
 #  so we must rely on .dockerignore here)
 COPY . .
 
-ENV BUILDDEPS="build-essential automake autoconf libtool pkg-config git"
-ENV PIP_INSTALL="pip install --timeout=300 -q"
+ENV PIP_INSTALL="pip install --timeout=3000 -q"
 
 # start a shell script (so we can comment individual steps here)
-RUN echo "set -x" > docker.sh
+RUN echo "set -ex" > docker.sh
 # get packages for build
-RUN echo "apt-get -y install $BUILDDEPS make" >> docker.sh
+RUN echo "apt-get -y install build-essential automake autoconf libtool pkg-config git make" >> docker.sh
 # create git repo just so the (unconditional) submodule update recipes don't fail
 RUN echo "git init" >> docker.sh
 # we want to use PREFIX as venv
@@ -61,15 +60,17 @@ RUN echo "make -j install-tesseract" >> docker.sh
 RUN echo "make ${OCRD_EXECUTABLES}" >> docker.sh
 # post-install fixup against conflicting requirements
 RUN echo "make fix-pip" >> docker.sh
-# remove build pkgs, but keep `make` for makefile-based workflow processing
-RUN echo "apt-get -y remove $BUILDDEPS" >> docker.sh
 # remove unneeded automatic deps and clear pkg cache
 RUN echo "apt-get -y autoremove && apt-get clean" >> docker.sh
 # remove source directories from image
 RUN echo "rm -fr /build" >> docker.sh
 # run the script in one layer/step (to minimise image size)
 RUN bash docker.sh
 
+# remove (dated) security workaround preventing use of
+# ImageMagick's convert on PDF/PS/EPS/XPS:
+RUN rm /etc/ImageMagick-6/policy.xml
+
 ENV DEBIAN_FRONTEND teletype
 
 WORKDIR /data

diff --git a/Makefile b/Makefile
@@ -2,11 +2,12 @@
 
 # Python version (python3 required).
 PYTHON := python3
-PIP_OPTIONS :=
+PIP_INSTALL ?= pip3 install
 
 # directory for virtual Python environment
 # (but re-use if already active):
 VIRTUAL_ENV ?= $(CURDIR)/venv
+export VIRTUAL_ENV
 
 BIN := $(VIRTUAL_ENV)/bin
 SHARE := $(VIRTUAL_ENV)/share
@@ -20,8 +21,8 @@ PKG_CONFIG_PATH := $(VIRTUAL_ENV)/lib/pkgconfig
 export PKG_CONFIG_PATH
 
 OCRD_EXECUTABLES = $(BIN)/ocrd # add more CLIs below
-CUSTOM_INSTALL := $(BIN)/ocrd # add more non-pip installation targets below
-CUSTOM_DEPS := core # add more modules which need deps-ubuntu below
+CUSTOM_INSTALL = $(BIN)/ocrd # add more non-pip installation targets below
+CUSTOM_DEPS = core # add more modules which need deps-ubuntu below
 
 OCRD_MODULES := $(shell git submodule status | while read commit dir ref; do echo $$dir; done)
 
@@ -52,7 +53,7 @@ Targets:
 Variables:
 	VIRTUAL_ENV: path to (re-)use for the virtual environment
 	PYTHON: name of the Python binary
-	PIP_OPTIONS: extra options to pass pip install like -q or -v
+	PIP_INSTALL: pass extra options to "pip install" like -q or -v
 EOF
 endef
 export HELP
@@ -74,13 +75,13 @@ $(OCRD_MODULES): always-update
 
 $(ACTIVATE_VENV) $(VIRTUAL_ENV):
 	$(PYTHON) -m venv $(VIRTUAL_ENV)
-	. $(ACTIVATE_VENV) && pip install --upgrade pip
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --upgrade pip
 
 # Get Python modules.
 
 # avoid making this .PHONY so it does not have to be repeated
 $(SHARE)/numpy: | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && pip install numpy
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) numpy
 	@touch $@
 
 OCRD_EXECUTABLES += $(OCRD_KRAKEN)
@@ -99,12 +100,14 @@ $(OCRD_OCROPY): ocrd_ocropy
 .PHONY: ocrd
 ocrd: $(BIN)/ocrd
+# Install the ocrd executable by running "make install" in the core directory
+# (with the venv activated). The sub-make is invoked through the MAKE variable
+# so that options such as -n and the -j jobserver are passed on to it.
 $(BIN)/ocrd: core
-	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="pip install --force-reinstall $(PIP_OPTIONS)"
+	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="$(PIP_INSTALL) --force-reinstall"
+	# workaround for core#351 (second pass, this time with --no-deps):
+	. $(ACTIVATE_VENV) && cd $< && $(MAKE) install PIP_INSTALL="$(PIP_INSTALL) --no-deps"
 
 .PHONY: wheel
 wheel: $(BIN)/wheel
 $(BIN)/wheel: | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && pip install --force-reinstall $(PIP_OPTIONS) wheel
+	. $(ACTIVATE_VENV) && $(PIP_INSTALL) --force-reinstall wheel
 
 # Install Python modules from local code.
 
@@ -249,13 +252,13 @@ $(BIN)/ocrd-make: workflow-configuration
 # install again forcefully without deps (to ensure
 # the binary itself updates):
 $(filter-out $(CUSTOM_INSTALL),$(OCRD_EXECUTABLES)):
-	. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
-	. $(ACTIVATE_VENV) && cd $< && pip install --no-deps --force-reinstall $(PIP_OPTIONS) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) --no-deps --force-reinstall .
 
 # avoid making these .PHONY so they do not have to be repeated:
 # clstm tesserocr
 $(SHARE)/%: % | $(ACTIVATE_VENV)
-	. $(ACTIVATE_VENV) && cd $< && pip install $(PIP_OPTIONS) .
+	. $(ACTIVATE_VENV) && cd $< && $(PIP_INSTALL) .
 	@touch $@
 
 # At last, add venv dependency (must not become first):
@@ -269,7 +272,7 @@ $(OCRD_EXECUTABLES): | $(BIN)/wheel
 # - tensorflow>=2.0, tensorflow_gpu in another version
 # - pillow==5.4.1 instead of >=6.2
 fix-pip:
-	pip install $(PIP_OPTIONS) --force-reinstall \
+	$(PIP_INSTALL) --force-reinstall \
 		opencv-python-headless \
 		pillow>=6.2.0 \
 		$(pip list | grep tensorflow-gpu | sed -E 's/-gpu +/==/')
@@ -351,9 +354,10 @@ endef
 # (mainly intended for docker, not recommended for live systems)
 # FIXME: we should find a way to filter based on the actual executables required
 deps-ubuntu: CLSTM_DEPS = scons libprotobuf-dev protobuf-compiler libpng-dev libeigen3-dev swig
+deps-ubuntu: TESSERACT_DEPS = g++ make automake
 deps-ubuntu: $(CUSTOM_DEPS)
 	set -e; for dir in $^; do $(MAKE) -C $$dir deps-ubuntu; done
-	apt-get -y install wget python3-venv $(CLSTM_DEPS)
+	apt-get -y install wget python3-venv $(TESSERACT_DEPS) $(CLSTM_DEPS)
 
 .PHONY: docker
 docker: DOCKER_TAG ?= ocrd/all

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 # OCR-D/ocrd_all
 
 This is a project which gets the source of all OCR-D modules.
-It also includes a Makefile for their installation into a virtual environment (venv).
+It also includes a Makefile for their installation into a virtual environment (venv) or Docker container.
 (A venv is a local user directory with shell scripts to load/unload itself
 in the current shell environment via PATH and PYTHONHOME.)
 
@@ -11,21 +11,21 @@ Make sure that there is enough free disk space. 3 GiB or more is recommended for
 the required submodules, build data, temporary data, installed virtual environment
 and pip cache.
 
-Install GNU git and make:
+Install GNU make and git, plus wget if you want to download Tesseract models:
 
-    # Debian / Ubuntu packages.
-    sudo apt install git make
+    # on Debian / Ubuntu:
+    sudo apt install make git wget
 
 Install the packages for Python3 development and for Python3 virtual environments
 for your operating system / distribution.
 
-    # Debian / Ubuntu packages.
+    # on Debian / Ubuntu:
     sudo apt install python3-dev python3-venv
 
 Some modules use the Tesseract library. If your distribution provides Tesseract 4.1
 or newer, install the development package:
 
-    # Debian / Ubuntu package.
+    # on Debian / Ubuntu:
     sudo apt install libtesseract-dev
 
 Ubuntu packages for Tesseract 5.0.0 (alpha) are available at the PPA
@@ -35,12 +35,25 @@ Otherwise or for the latest Tesseract code it can also be built locally.
 
 Other modules will have additional system dependencies.
 
-System dependencies for all modules on Ubuntu 18.04 (or similar) can also be automatically installed by running:
+System dependencies for all modules on Ubuntu 18.04 (or similar) can also be installed _automatically_ by running:
 
     # on Debian / Ubuntu:
     sudo apt install make git
     sudo make deps-ubuntu
 
+Moreover, the (shell) environment must have a Unicode-based localization. (Otherwise Python code based on `click`, i.e. most OCR-D CLIs, will not work.) This is true for most installations today, and can be verified by:
+
+    locale | fgrep .UTF-8
+
+This should show several `LC_*` variables. Otherwise, either select another localization globally...
+
+    sudo dpkg-reconfigure locales
+
+... or use the Unicode-based POSIX locale temporarily:
+
+    export LC_ALL=C.UTF-8
+    export LANG=C.UTF-8
+
 
 ## Usage
 
@@ -115,6 +128,21 @@ Running `make modules` downloads/updates all modules.
 
 Running `make all` additionally installs the executables from all modules.
 
+### Results
+
+To use the built executables, simply activate the virtual environment:
+
+    . ${VIRTUAL_ENV:-venv}/bin/activate
+    ocrd --help
+    ocrd-...
+
+For the Docker image, run it as your own user with your data directory mounted at `/data`:
+
+    docker run -it -u $(id -u):$(id -g) -v $PWD:/data ocrd/all
+    ocrd --help
+    ocrd-...
+
+
 ## Issues
 
 ### No published/recent version on PyPI
@@ -125,8 +153,7 @@ The following Python modules need an installation from code for different reason
 - cor-asv-ann (not available in PyPI)
 - cor-asv-fst (not available in PyPI)
 - dinglehopper (not available in PyPI)
-- ocrd_cis (not available in PyPI; needs `ocrd>=2.0.0`)
-- ocrd_tesserocr (too old in PyPI; needs `ocrd>=2.0.0`)
+- ocrd_cis (not available in PyPI)
 - tesserocr (too old in PyPI)
 
 ### Conflicting requirements
@@ -141,10 +168,14 @@ _pip does not stop or resolve conflicts – it merely warns._
    * `>=6.2.0` (required by all others)
 - Tensorflow:
    * `tensorflow-gpu==1.14.0` (required by ocrd_calamari and OCR-D-LAYoutERkennung)
-   * `tensorflow` (which pulls in `>=2.0` which is incompatible; required by cor-asv-ann and ocrd_keraslm)
+   * `tensorflow` (required by cor-asv-ann and ocrd_keraslm)
+
+   Both can be installed side by side in different versions, but they may depend on mutually incompatible versions of `tensorboard` and `tensorflow_estimator`.
+
+   Moreover, in the future, some modules (but not others) may depend on `tensorflow>=2.0`, which again is incompatible.
 - OpenCV:
    * `opencv-python-headless` (required by core and others, avoids pulling in X11 libraries)
-   * `opencv-python` (required by OCR-D-LAYoutERkennung and segmentation-runner)
+   * `opencv-python` (required by OCR-D-LAYoutERkennung)
    * custom build on ARM...
 
 - ...

diff --git a/ocrd_im6convert b/ocrd_im6convert
diff --git a/opencv-python b/opencv-python
diff --git a/tesseract b/tesseract
+22 −23		CMakeLists.txt
+17 −49		Makefile.am
+20 −5		configure.ac
+6 −1		src/api/tesseractmain.cpp
+19 −18		src/arch/simddetect.cpp
+4 −4		unittest/Makefile.am
+14 −14		unittest/intsimdmatrix_test.cc