OCR: Tesseract 4.1.1 / Ghostscript 9.54.0

With tesseract v4.0.0-beta.3 we often observe crashes with:

```
contains_unichar_id(unichar_id):Error:Assert failed:in file ../../src/ccutil/unicharset.h, line 511
```

This seems to have been fixed by https://github.com/tesseract-ocr/tesseract/pull/1954

Still, even after updating to 4.1.1, text recognition from PDF in ERP5 is too expensive. We also update Ghostscript to 9.54.0, because this version has built-in OCR, which does not need to convert the PDF to PNG then TIFF as we currently do in ERP5.

See merge request !985

OCR: Tesseract 4.1.1 / Ghostscript 9.54.0
With tesseract v4.0.0-beta.3 we often observe crashes with:

```
contains_unichar_id(unichar_id):Error:Assert failed:in file ../../src/ccutil/unicharset.h, line 511
```

This seems to have been fixed by https://github.com/tesseract-ocr/tesseract/pull/1954

Still, even after updating to 4.1.1, text recognition from PDF in ERP5 is too expensive. We also update Ghostscript to 9.54.0, because this version has built-in OCR, which does not need to convert the PDF to PNG then TIFF as we currently do in ERP5.

See merge request !985
ec129b70 · Jérome Perrin · 582b0b03 · 1b291415 · ec129b70 · ec129b70
Commit ec129b70 authored Jun 04, 2021 by Jérome Perrin
7 changed files
--- a/component/ghostscript/buildout.cfg
+++ b/component/ghostscript/buildout.cfg
@@ -2,17 +2,22 @@
 extends =
  ../fontconfig/buildout.cfg
  ../freetype/buildout.cfg
+  ../libjpeg/buildout.cfg
  ../libtiff/buildout.cfg
  ../libxml2/buildout.cfg
  ../pkgconfig/buildout.cfg
+  ../tesseract/buildout.cfg
  ../xz-utils/buildout.cfg
 parts = ghostscript
-[ghostscript-common]
+[ghostscript]
 recipe = slapos.recipe.cmmi
 shared = true
-pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}
+url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs9540/ghostscript-9.54.0.tar.gz
+md5sum = 5d571792a8eb826c9f618fb69918d9fc
+pkg_config_depends = ${libtiff:location}/lib/pkgconfig:${libjpeg:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}
+# XXX --with-tessdata works around a slaprunner bug where software is installed in a path containing //
 configure-options =
  --disable-cups
  --disable-threadsafe
@@ -20,18 +25,18 @@ configure-options =
  --without-libidn
  --without-x
  --with-drivers=FILES
-# it seems that parallel build sometimes fails for ghostscript.
+  --with-tessdata=$(python -c 'print("""${:tessdata-location}""".replace("//", "/"))')
-make-options = -j1
 environment =
  PATH=${pkgconfig:location}/bin:${xz-utils:location}/bin:%(PATH)s
  PKG_CONFIG_PATH=${:pkg_config_depends}
-  LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib
+  CFLAGS=-I${libjpeg:location}/include
+  LDFLAGS=-Wl,-rpath=${fontconfig:location}/lib -Wl,-rpath=${freetype:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib
  LD_LIBRARY_PATH=${fontconfig:location}/lib:${freetype:location}/lib:${libtiff:location}/lib:${libxml2:location}/lib
-[ghostscript]
+# configure gives priority to the local jpeg library and refuses to mix the local libjpeg with the "system" libtiff.
-<= ghostscript-9
+# We remove this local jpeg library source folder so that configure picks up the slapos versions of these libraries.
+pre-configure = rm -r jpeg
-[ghostscript-9]
+post-make-hook = ${tesseract-download-traineddata:post-make-hook}
-<= ghostscript-common
+tessdata-location = @@LOCATION@@/share/tessdata/
-url = https://github.com/ArtifexSoftware/ghostpdl-downloads/releases/download/gs950/ghostscript-9.50.tar.xz
+tessdata-urls = ${tesseract-download-traineddata:urls}
-md5sum = 6cea6bae4a7cdfac6ccb09f07f0caf8c
--- a/component/leptonica/buildout.cfg
+++ b/component/leptonica/buildout.cfg
@@ -7,17 +7,14 @@ extends =
  ../libtiff/buildout.cfg
  ../webp/buildout.cfg
  ../giflib/buildout.cfg
-  ../patch/buildout.cfg
 [leptonica]
 recipe = slapos.recipe.cmmi
-url = http://www.leptonica.com/source/leptonica-1.76.0.tar.gz
-md5sum = a263a5e4f7e8f8a661fb121a265d2d20
 shared = true
+url = http://www.leptonica.org/source/leptonica-1.80.0.tar.gz
+md5sum = d640d684234442a84c9e8902f0b3ff36
 configure-options =
  --disable-static
 environment =
  CPPFLAGS=-I${zlib:location}/include -I${libjpeg:location}/include -I${libpng:location}/include -I${libtiff:location}/include -I${webp:location}/include -I${giflib:location}/include
  LDFLAGS=-L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib -L${libjpeg:location}/lib -Wl,-rpath=${libjpeg:location}/lib -L${libpng:location}/lib -Wl,-rpath=${libpng:location}/lib -L${libtiff:location}/lib -Wl,-rpath=${libtiff:location}/lib -L${webp:location}/lib -Wl,-rpath=${webp:location}/lib -L${giflib:location}/lib -Wl,-rpath=${giflib:location}/lib
-  PATH=${patch:location}/bin:%(PATH)s
--- a/component/leptonica/leptonica-1.69-zlib-include.patch
+++ b/component/leptonica/leptonica-1.69-zlib-include.patch
-This patch is originally taken from:
-http://leptonica.googlecode.com/issues/attachment?aid=560001000&name=zlib-include.patch&token=m2sugSYxB4xwAuNgrKXyHTxBYNg%3A1337345966091
-To fix the following issue with leptonica:
-http://code.google.com/p/leptonica/issues/detail?id=56
-diff -Nurd -x'*~' leptonica-1.68.orig/src/pngio.c leptonica-1.68/src/pngio.c
--- leptonica-1.68.orig/src/pngio.c	2011-02-01 00:41:12.000000000 -0500
-+++ leptonica-1.68/src/pngio.c	2011-07-09 09:17:17.000000000 -0400
-@@ -108,6 +108,10 @@
- #include "png.h"
-+#ifdef HAVE_LIBZ
-+#include "zlib.h"
-+#endif
-+
- /* ----------------Set defaults for read/write options ----------------- */
-     /* strip 16 bpp --> 8 bpp on reading png; default is for stripping */
- static l_int32   var_PNG_STRIP_16_TO_8 = 1;
--- a/component/tesseract/buildout.cfg
+++ b/component/tesseract/buildout.cfg
@@ -10,43 +10,34 @@ extends =
  ../fontconfig/buildout.cfg
  ../lcms/buildout.cfg
  ../pkgconfig/buildout.cfg
+  ./buildout.hash.cfg
 parts =
  tesseract
-  tesseract-traineddata
-  tesseract-eng-traineddata
-  tesseract-osd-traineddata
 [tesseract]
 recipe = slapos.recipe.cmmi
-url = https://github.com/tesseract-ocr/tesseract/archive/6b250b58121a9858d3e3019a78a6f7d421bd0fc7.tar.gz
+shared = true
-md5sum = fdc38148ad8eb1bd0485a217503dd6d5
+url = https://github.com/tesseract-ocr/tesseract/archive/refs/tags/4.1.1.tar.gz
+md5sum = 51fe2bcbff1bbce77a25d180fd247f7d
 pkg_config_depends = ${leptonica:location}/lib/pkgconfig:${fontconfig:location}/lib/pkgconfig:${fontconfig:pkg_config_depends}:${lcms2:location}/lib/pkgconfig:${xz-utils:location}/lib/pkgconfig
 pre-configure =
  autoreconf -ivf -I${pkgconfig:location}/share/aclocal -I${libtool:location}/share/aclocal -Wno-portability
-configure-options =
-  --disable-static
-  --datarootdir=${tesseract-traineddata:location}
-# XXX: tesseract seems not easily configurable at runtime about where to find
-# its trained data, so we set its datarootdir above to a controlled location
 environment =
  PATH=${pkgconfig:location}/bin:${autoconf:location}/bin:${automake:location}/bin:${libtool:location}/bin:${m4:location}/bin:${patch:location}/bin:%(PATH)s
  PKG_CONFIG_PATH=${:pkg_config_depends}
  LDFLAGS=-L${leptonica:location}/lib -Wl,-rpath=${leptonica:location}/lib -L${jbigkit:location}/lib -Wl,-rpath=${jbigkit:location}/lib -L${zlib:location}/lib -Wl,-rpath=${zlib:location}/lib
-[tesseract-traineddata]
+post-make-hook = ${tesseract-download-traineddata:post-make-hook}
-location = ${buildout:parts-directory}/${:_buildout_section_name_}
+tessdata-urls = ${tesseract-download-traineddata:urls}
+tessdata-location = @@LOCATION@@/share/tessdata/
-[tesseract-eng-traineddata]
-recipe = slapos.recipe.build:download
-destination = ${tesseract-traineddata:location}/tessdata/eng.traineddata
-url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/eng.traineddata
-md5sum = 57e0df3d84fed9fbf8c7a8e589f8f012
-[tesseract-osd-traineddata]
+[tesseract-download-traineddata]
-recipe = slapos.recipe.build:download
+post-make-hook = ${:_profile_base_location_}/${download-tessdata.py:filename}#${download-tessdata.py:md5sum}:post_make_hook
-destination = ${tesseract-traineddata:location}/tessdata/osd.traineddata
+urls =
-url = https://github.com/tesseract-ocr/tessdata/raw/590567f20dc044f6948a8e2c61afc714c360ad0e/osd.traineddata
+    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/eng.traineddata#57e0df3d84fed9fbf8c7a8e589f8f012
-md5sum = 7611737524efd1ce2dde67eff629bbcf
+    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/osd.traineddata#7611737524efd1ce2dde67eff629bbcf
+    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/fra.traineddata#a73e70c872f262895d93976febeb1638
+    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/jpn.traineddata#af3a30a9bec904e106aa8521e7caaeca
+    https://raw.githubusercontent.com/tesseract-ocr/tessdata/4.1.0/chi_sim.traineddata#6965cb3213edd961cb16264e2ea45f5c
--- a/component/tesseract/buildout.hash.cfg
+++ b/component/tesseract/buildout.hash.cfg
+[download-tessdata.py]
+filename = download-tessdata.py
+md5sum = 2d283a6d8662d6bb8c9de7b26162b702
--- a/component/tesseract/download-tessdata.py
+++ b/component/tesseract/download-tessdata.py
+# This is a post-make hook script to download tesseract training data.
+#
+# This script uses the following buildout options:
+#   - tessdata-urls: list of URLs and their expected md5sum as URL fragments
+#   - tessdata-location: path where to install the data.
+import zc.buildout
+import os
+def post_make_hook(options, buildout, env):
+  if not os.path.exists(options['tessdata-location']):
+    os.makedirs(options['tessdata-location'])
+  download = zc.buildout.download.Download(
+      buildout['buildout'],
+      hash_name=True,
+  )
+  for url in options['tessdata-urls'].splitlines():
+    url, _, md5sum = url.partition('#')
+    if url:
+      download(
+          url,
+          md5sum=md5sum,
+          path=os.path.join(options['tessdata-location'],
+                            os.path.basename(url)),
+      )
--- a/stack/erp5/buildout.cfg
+++ b/stack/erp5/buildout.cfg
@@ -7,6 +7,7 @@ extends =
  buildout.hash.cfg
  ../../component/fonts/buildout.cfg
  ../../component/git/buildout.cfg
+  ../../component/ghostscript/buildout.cfg
  ../../component/graphviz/buildout.cfg
  ../../component/gzip/buildout.cfg
  ../../component/xz-utils/buildout.cfg
@@ -65,8 +66,6 @@ parts +=
  slapos-cookbook
  mroonga-mariadb
  tesseract
-  tesseract-eng-traineddata
-  tesseract-osd-traineddata
  zabbix-agent
 # Buildoutish
@@ -252,6 +251,7 @@ link-binary =
  ${graphviz:location}/bin/dot
  ${grep:location}/bin/grep
  ${imagemagick:location}/bin/convert
+  ${ghostscript:location}/bin/gs
  ${imagemagick:location}/bin/identify
  ${jpegoptim:location}/bin/jpegoptim
  ${jsl:location}/bin/jsl