tesseract: Reintroduce enableLanguages

I've removed that attribute in 68bc260ca2,
because the language files no longer were distributed as seperate files,
but if we for example only want to use the English training data, the
closure size of Tesseract gets quite large (around 1.2 GB), which is a
bit much just to be able to run NixOS VM tests.

For this reason I've also switched the VM tests back to using only the
English language.

Tested using the following VM tests (the ones that have OCR enabled) on
x86_64-linux:

 * nixos/tests/chromium.nix -A stable
 * nixos/tests/emacs-daemon.nix
 * nixos/tests/installer.nix -A luksroot
 * nixos/tests/lightdm.nix
 * nixos/tests/plasma5.nix
 * nixos/tests/sddm.nix

Signed-off-by: aszlig <aszlig@redmoonstudios.org>
This commit is contained in:
aszlig 2017-04-07 23:05:04 +02:00
parent c8c340b05a
commit 288a79187c
No known key found for this signature in database
GPG key ID: 1DE8E48E57DB5436
2 changed files with 28 additions and 2 deletions

View file

@ -93,7 +93,7 @@ rec {
vms = map (m: m.config.system.build.vm) (lib.attrValues nodes);
ocrProg = tesseract;
ocrProg = tesseract.override { enableLanguages = [ "eng" ]; };
# Generate onvenience wrappers for running the test driver
# interactively with the specified network, and for starting the

View file

@ -1,5 +1,8 @@
{ stdenv, fetchFromGitHub, pkgconfig, leptonica, libpng, libtiff
, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
}:
stdenv.mkDerivation rec {
@ -25,7 +28,30 @@ stdenv.mkDerivation rec {
LIBLEPT_HEADERSDIR = "${leptonica}/include";
postInstall = "cp -Rt \"$out/share/tessdata\" \"$tessdata/\"*";
# Copy the .traineddata files of the languages specified in enableLanguages
# into `$out/share/tessdata' and check afterwards if copying was successful.
postInstall = let
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
findLangArgs = if enableLanguages != null
then "\\( ${mkFindArgs enableLanguages} \\)"
else "-iname '*.traineddata'";
in ''
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
${if enableLanguages != null then ''
expected=${toString (builtins.length enableLanguages)}
'' else ''
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
''}
if [ "$numLangs" -ne "$expected" ]; then
echo "Expected $expected languages, but $numLangs" \
"were copied to \`$out/share/tessdata'" >&2
exit 1
fi
'';
meta = {
description = "OCR engine";