Merge pull request #52379 from erikarvstedt/tesseract

Major tesseract improvements
This commit is contained in:
Michael Raskin 2018-12-20 14:41:48 +00:00 committed by GitHub
commit ede54f9144
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 472 additions and 139 deletions

View file

@ -116,7 +116,7 @@ in rec {
vms = map (m: m.config.system.build.vm) (lib.attrValues nodes);
ocrProg = tesseract_4.override { enableLanguages = [ "eng" ]; };
ocrProg = tesseract4.override { enableLanguages = [ "eng" ]; };
imagemagick_tiff = imagemagick_light.override { inherit libtiff; };

View file

@ -1,61 +0,0 @@
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
}:
stdenv.mkDerivation rec {
name = "tesseract-${version}";
version = "4.0.0";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tesseract";
rev = version;
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
};
tessdata = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = version;
sha256 = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
};
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
# Copy the .traineddata files of the languages specified in enableLanguages
# into `$out/share/tessdata' and check afterwards if copying was successful.
postInstall = let
mkArg = lang: "-iname ${stdenv.lib.escapeShellArg "${lang}.traineddata"}";
mkFindArgs = stdenv.lib.concatMapStringsSep " -o " mkArg;
findLangArgs = if enableLanguages != null
then "\\( ${mkFindArgs enableLanguages} \\)"
else "-iname '*.traineddata'";
in ''
numLangs="$(find "$tessdata" -mindepth 1 -maxdepth 1 -type f \
${findLangArgs} -exec cp -t "$out/share/tessdata" {} + -print | wc -l)"
${if enableLanguages != null then ''
expected=${toString (builtins.length enableLanguages)}
'' else ''
expected="$(ls -1 "$tessdata/"*.traineddata | wc -l)"
''}
if [ "$numLangs" -ne "$expected" ]; then
echo "Expected $expected languages, but $numLangs" \
"were copied to \`$out/share/tessdata'" >&2
exit 1
fi
'';
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [viric];
platforms = with stdenv.lib.platforms; linux;
};
}

View file

@ -1,67 +1,18 @@
{ stdenv, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers
# Supported list of languages or `null' for all available languages
, enableLanguages ? null
# if you want just a specific list of languages, optionally specify a hash
# to make tessdata a fixed output derivation.
, enableLanguagesHash ? (if enableLanguages == null # all languages
then "1h48xfzabhn0ldbx5ib67cp9607pr0zpblsy8z6fs4knn0zznfnw"
else null)
}:
{ callPackage, lowPrio }:
let tessdata = stdenv.mkDerivation ({
name = "tessdata";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
# when updating don't forget to update the default value fo enableLanguagesHash
sha256 = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
};
buildCommand = ''
cd $src;
for lang in ${if enableLanguages==null then "*.traineddata" else stdenv.lib.concatMapStringsSep " " (x: x+".traineddata") enableLanguages} ; do
install -Dt $out/share/tessdata $src/$lang ;
done;
'';
preferLocalBuild = true;
} // (stdenv.lib.optionalAttrs (enableLanguagesHash != null) {
# when a hash is given, we make this a fixed output derivation.
outputHashMode = "recursive";
outputHashAlgo = "sha256";
outputHash = enableLanguagesHash;
}));
let
base3 = callPackage ./tesseract3.nix {};
base4 = callPackage ./tesseract4.nix {};
languages = callPackage ./languages.nix {};
in
stdenv.mkDerivation rec {
name = "tesseract-${version}";
version = "3.05.00";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tesseract";
rev = version;
sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
{
tesseract3 = callPackage ./wrapper.nix {
tesseractBase = base3;
languages = languages.v3;
};
enableParallelBuilding = true;
nativeBuildInputs = [ pkgconfig autoreconfHook ];
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
LIBLEPT_HEADERSDIR = "${leptonica}/include";
postInstall = ''
for i in ${tessdata}/share/tessdata/*; do
ln -s $i $out/share/tessdata;
done
'';
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [viric];
platforms = with stdenv.lib.platforms; linux ++ darwin;
};
tesseract4 = lowPrio (callPackage ./wrapper.nix {
tesseractBase = base4;
languages = languages.v4;
});
}

View file

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Usage:
# ./fetch-language-hashes <tessdataRev> [<language code>…]
#
# Fetches all languages if no language codes are given.
#
# Example:
# ./fetch-language-hashes 4.0.0 eng spa
#
# Output:
# eng = "0iy0...";
# spa = "15kw...";
set -e
(( $# >= 1 )) || exit 1
tessdataRev=$1
shift
if (( $# > 0 )); then
langCodes="$@"
else
repoPage=$(curl -fs https://github.com/tesseract-ocr/tessdata/tree/$tessdataRev || {
>&2 echo "Invalid tessdataRev: $tessdataRev"
exit 1
})
langCodes=$(echo $(echo "$repoPage" | grep -ohP "(?<=/)[^/]+?(?=\.traineddata)" | sort))
fi
for lang in $langCodes; do
url=https://github.com/tesseract-ocr/tessdata/raw/$tessdataRev/$lang.traineddata
hash=$(nix-prefetch-url $url 2>/dev/null)
echo "$lang = \"$hash\";"
done

View file

@ -0,0 +1,289 @@
{ stdenv, lib, fetchurl, fetchFromGitHub }:
rec {
makeLanguages = { tessdataRev, tessdata ? null, all ? null, languages ? {} }:
let
tessdataSrc = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tessdata";
rev = tessdataRev;
sha256 = tessdata;
};
languageFile = lang: sha256: fetchurl {
url = "https://github.com/tesseract-ocr/tessdata/raw/${tessdataRev}/${lang}.traineddata";
inherit sha256;
};
in
{
# Use a simple fixed-output derivation for all languages to increase nix eval performance
all = stdenv.mkDerivation {
name = "all";
buildCommand = ''
mkdir $out
cd ${tessdataSrc}
cp *.traineddata $out
'';
outputHashMode = "recursive";
outputHashAlgo = "sha256";
outputHash = all;
};
} // (lib.mapAttrs languageFile languages);
v3 = makeLanguages {
tessdataRev = "3cf1e2df1fe1d1da29295c9ef0983796c7958b7d";
tessdata = "1v4b63v5nzcxr2y3635r19l7lj5smjmc9vfk0wmxlryxncb4vpg7";
all = "0yj6h9n6h0kzzcqsn3z87vsi8pa60szp0yiayb0znd0v9my0dqhn";
# Run `./fetch-language-hashes <tessdataRev>` to generate these hashes
languages = {
afr = "15dsnzy4i9ai26ilm73gkfj4ck039raa88i6w443c4b1fnay2akf";
amh = "1wbcsdq3svxga3j1alk61xs72a9fhsfsyjxhp3cwxfaqfhrzg7h4";
ara = "0nk495gki6jbbnwcl2ybsx4nd02d6qykcjncq0d2g8pbgapqmj91";
asm = "0c3wq15yphq7x74s2sn3f90k6z1cf5j7ic62z0dynidrv99bddfh";
aze = "0pz073hxqkx1a1cshlgg5k11lj73s52sdxa7k3020drc314lhaxw";
aze_cyrl = "0djbfgx28ykcjsn2p0766qrmj256g7vhc7valc3ivsva8b906lxq";
bel = "04zqy8vik0fcakq6apfp8wjhkkhlg0yn9kmag1lk7s8fy9ax3ws2";
ben = "0q7812kn5xjm47hcgdcg911lhbgqr7hbvqckfxxm8qw0yjx2cy0m";
bod = "0rwq7539zzfs8xs0bf1535z1cwkm0yk1ni25f5gjav7nm6qpiaan";
bos = "1qr04dj7lx347gxpin5nfprbggmxq2mwx8kf3pcc3vb5x3pa57g4";
bul = "0cyyqgi3i4y9bfzwls0lwljzgd0r8ayfqb4bbvdh4qmbni9x42ya";
cat = "0kgw8f5pdw9lfbn6cfp5n1s0j8pj3418yx6rsbagzcf1gr36gbr9";
ceb = "1g1n4np4vhar7wfwx2km5k6kldb600rrl7npfbf75229rar068f1";
ces = "0zxkkyhpd74i6321nv86pkjb0k7p9cp6m174rbn42nl7jz6qxib0";
chi_sim = "0k250xr0gk9yh22yqxd0zpxdsrqfzs164kdv5n9rxx1g996yffij";
chi_tra = "03nxqpd546p0gwfj6pqzbdbv5zjpdddzlpa10xn4nvmks1mmckbp";
chr = "1k1sg3hap0kd5aa36ysvmhp7r3fynxf0f7lzz814h6p3g250zclb";
cym = "0d6wbf9cmrrzf66mhcckwdfy3xh2i38r0by9nk6isw9rl7bf7j07";
dan = "1s1yj56rpzmif3ir3qs4iab744cgpflk7y8812z2665bh61illpr";
dan_frak = "1bxi53ymib5g0139vfd2pflh7nl5925vqznq3sfgaqx7gdx630vi";
deu = "0fna7fqk1a8ivd7q2k38vx37qm3vbn183zh4z5zfqb4pgqmb8znb";
deu_frak = "1y4krkvarg7jxhcq49fgybg4phbn58y9c0z2bm8mnp28jkih1cnb";
dzo = "1fcz0imi7zxi99762pxfcm5iz2jcbqj3s742magka4ihrxnz07xm";
ell = "0r0f71jy4y29bg055qvvy93wchi3lh08zz0k9c8l7466b03yvq5v";
eng = "0vghah8kqcv0n5fnjb88w6siz156ysrc41fckw3f2y8c3sgmqlf0";
enm = "10y61xv3w1ypgqz5rgb22y5hh1i4zx03cwiqw21ifqvg4xdrln46";
epo = "1y5lh55mbcx33cm7qlf1dcah8ffycxmlcpzjzx9r6ij14fdd4964";
equ = "1nqrd0a9jqqh6byy8snfhad1hisrc92dcx44wsy7v4nf40j3mx1s";
est = "12ll8lq1hjcsq9hh93020w78r7f1rcxcwlvrjqw8j5p3k9jg5a4g";
eus = "034s9mp7lw1a4yvf2cmbbj2fbqbaq6xnjqh30yn0wq0c0jck96nw";
fas = "0m61p4byc0kzf75cdn6g18s8hcg9r8ifs34wr85lbsb65kil4ijx";
fin = "1wac333k0lcd5jwprzg99b10bq8sdc96b9d6275kg9imyqjwcc7q";
fra = "1ax7i0nw1lwkz4sbrvn4z0lcrcai77ymdpla7qk7yij6s4xb5bw6";
frk = "16nmr71p93724vk1x5mq4r8vxpwnm448p6dwqv8scg8asch1cidp";
frm = "00yz3hz7wcralq8wbx1ap4c6b37ac6vnz5bgmxmgdx0kqzibiddn";
gle = "1n8z8kmn5m628rlzgz5v0iw6h46aalflq5asa1wj5rygx1y2azpa";
glg = "0fdniayplc3iwmlmvhblarh1gm97dp8rqhhkb8b0clwfd9cj342z";
grc = "04r2193qcxqyab5998xn8bf7197wiccmjm7iakij8d0c7l61dnxb";
guj = "0dp8mlxmf0x9wb8dg0c508sdwz03icq94z8ji8jhwgdqgv8hw1al";
hat = "0793mmlxbb09c8103jhdvlczz647nyn4ykkgd3gwgavncmjh72v8";
heb = "16za9ff1i3ya6hz75l9v3v7j4039kscxxw21g3i2w5p9zn52hyag";
hin = "1vnn5wpc724kgib8jbx0kpnnp4al60ivqir72gnbyh6cpnflb6bf";
hrv = "15rqd6xiv2bdmalb5s6rxvw0yk6w9agn9fli3bvi703q6vpj2yn3";
hun = "19zzwdxwi3h3vdsgr271i1m87gfpdirk6b1ljw2j8qmfilp4sw56";
iku = "1v1yvc1194qycjgb4ihh5hpj6472nlbp66dii183514g2dh9x0db";
ind = "120d4b41wvsgcd1sgy2mp78i9hvi7w03a63078dz1yds0yqdwf1p";
isl = "003ngk8dfv6dglkq8pmi6jsglrfkc65js5ywh3vvkg7qfqf6qsxz";
ita = "1lxklk3zc3x3k8yfpp6ygyv7fndgs57dfasc97rh8782ds16wkjs";
ita_old = "188gby1y51pa1ycyc8y17d16hs5w27yl5ch7xzni98bdjkwbkl1z";
jav = "1fjyjznjchls5ifbnx2b9xagisgxvgj9lsf39rr9d87sbzdbbwbp";
jpn = "1wmayj8wh3pfwznjhalad2qzv38mhrzw2sxl71mycvzvpdy9ag1w";
kan = "0hak4953whw9vd9dzl0hq076kzb19kk45kmfxk03af4k6gb206vg";
kat = "16k0057cvvdc6snm5svhdv3cr7cw71g74yy8215njjbsi838imi3";
kat_old = "02gl755d38plyvzwfjqxvjgfqkbjs9rvzx33qfhm2zvmgbwrfrfh";
kaz = "0hc36w7zz5waycsk220v0r83sg991gd5f5r937mvz44viql80sgm";
khm = "1gb2nv5qdq5fz9w9xq4fj68p46b62sd1m986ra5qbnskxqizr12s";
kir = "1b1ing6qqi8qqfh4xpk76rp4gxp69wdjdl5m777ayx3v02d7nhh3";
kor = "1rldj6f8h1nn5wpx57b0ci7p0fnivnwzgaf0d3576xls26z2wcgv";
kur = "1cp2pfd6g662gvxi7ywkxfbfq1lwbis888bf1gg8ynzy342mx1ic";
lao = "03bdaxakmxpbbr9vsnbzzfksvm6js0l5i0ijwl71piqyxqjj1gxf";
lat = "1q7v7drnwpna9k2l79jbdlxiv1j617rqzjc9d48h3lfrma5z97sj";
lav = "0fxzyvw7n67rmw2irvlghkf1bii4w47200zv26p0v3a9dwvhc7sg";
lit = "0f00ggjjqrl94kwwjmjqwajyfprsml0br8vhn2gvn11gaxvm52hm";
mal = "1i83plhin3m6sq8p92vzlyng5z59gvvqypyh7rnmvdmm9rranx8a";
mar = "0ay7q53yl3709crvn5l9c9jx7hw6m5d3x2crmvnvczsh83ayfdik";
mkd = "1q1wadcr4j1dzssyyqz43qmizc6vfqkbivr6xi2p7p4h9rl11x73";
mlt = "1qp4v6habak1l7xrw322wglvjjndrfp4j7bj8d4npwbzk1sh4s0h";
msa = "048p6mkx9zr40s9s5vbi0gnizhvqwn0g8i1hf1l8db7igbax5xyj";
mya = "17nyr5bd42kzvid3421n3mwckd49vzrjhjahd8rnfsmbsy1x382l";
nep = "154375r32sdmvcnp1ckvgbp3wxvb2xiiypb8bxbsvrabrz4wzjqc";
nld = "1clwbky71zkz55zd3f8r9hj8fhpnbkply80p1js4fvs7x12r715x";
nor = "1ynvrz6s0vmlq1xkjd8k2w6bx8770x6v29qgx83d4nl17ngjd459";
ori = "0dsakc8gnwhs6z5kxc2wdkbn31gkkiqk5vriw0swghychp164aac";
osd = "1zq0dfliavglmix7zzrqdxz1w01rm1f1x1352bqn8xf4zivdbxcw";
pan = "1fwdpwkydfmr6drwgkqzn89z12r2rdm02a75vvdxhxg2a9yiwmbv";
pol = "155z870ygzws476kp7qpzi8jcjcv3jb5px8rbzhnag1fklqr48hx";
por = "1814cff2rffpzlg4hyyrjzpf5ps2i95rmpa4c8ikblbvrlcv97q8";
pus = "1iz5nn1zfvn1l9gb1jriwx991d2hwwc7x4k1nvzjlwpzscplx25b";
ron = "11lr80zhvnnngvwwk01z1d3prfpbh3qbwpl1nl5fp7h09d6n3wzl";
rus = "1d6a8lg4bmd3np16jds1py3qpkaq4ahnhwghd5r0159y0jpxq00q";
san = "169f4ajgwn99yfdfrlwfvdgvv1abal7fpdp31sknvq8l7w2sak3g";
sin = "1411g18r6f6j6f4n0sn7ajgs4gkplb892s6ak0hi9nyyxwv3r1gm";
slk = "0bxfbrg1nf6px0xzkh6ihdi71fmr1rxxs99qb191k7pm16x2lpds";
slk_frak = "0zyqnn1y5cyx1y7wzgw743k4584ljl0rhvk2q1ni6jnjx9ciwzqy";
slv = "1kjn9m9hbwp0m0p2v8c3skpzr6f8x42hz8x48zl22550a7hq8n1h";
spa = "1npgl8ylvfm60hd4214z8a3lriy1hckhijschrbjpzmwdfcqafgj";
spa_old = "0w4ivkv8flyn7bjlyjcrcrdnslkvrrfs7l33mvird1jhhkyqd8sx";
sqi = "15wzvh6qm3yx7yf0k5j7g1imsaqxvq7r2xh6a0xgmkqbyypbbkdf";
srp = "05blqriv30x02c80ds3x7zhw0y21nc6lkqlv5jwgwnjgw4yfpgrm";
srp_latn = "0ss8s3q60aq8sd2a3sbnzvp13qqarxnjw4hij8hd9ab5gsjw0nwr";
swa = "1pwwhx7ldq21cv06cchws8gvwsmkwn5sjcy9z3nk3nbp9qjsf44f";
swe = "0l10iyn2cr7ibgk0akmpg8725mpwpydawgv3s77izsw7y6xhfr1a";
syr = "08bxil13wyp5h4hvbxjcys7ypgqgg46rrp653m7gyv5q94ycjgb0";
tam = "1g155kyba2wjfgzgy48g6yd2csinwbfjdi5r7vw0wm3dh1z39dvz";
tel = "0fydrcb54b6mmqazb337x4s36i2a64sb4xm7y7g3nqqmk9afsipv";
tgk = "0f6j37friywj7y132fv0jm6aj4sx8f0b7brspj3pbjqqpi4v5ws0";
tgl = "0f1r0gicif57qhyw8xaa1sqgny720q3z5cpd5srrn9i6fihaz577";
tha = "1y2hw55jfpidk95y8qbsiczgg2r2khabac97s1y3gl0v93a44jna";
tir = "1y7iryhjr83ca4yh5jjz7qlnrx4kbrp0a0p650whjvk2gnv8m98h";
tur = "0xqnq99b2jb4v74bj95py6wmg14dm31zp5s3l48dmcv6zdgcxg2w";
uig = "1sdddr15zlb33kd1d7hzi5lfd15bfhqn105d7x6snfpqp7vq4bxv";
ukr = "0cdwjnfnnmzz7jdn49l96vqgaimclfxcxaw09cm63f5my382r2rg";
urd = "10xcn1zs2lfswp5yai0ckyg7js587qhr5cf7qib3i35qjbw7nc18";
uzb = "1jkkd5j6vsx5jv5gwprbfwg1vwh714prm8j446wzvp74brmk949l";
uzb_cyrl = "1kdia38rgm2qd3ly80a412jyagxxryr09h1nz2d0iw71bmfn4855";
vie = "1ja18jxxaw282y4jljxpjf1gj15il61vc2ykpfy22vn88wvydxff";
yid = "1jddd0g8mm5v00z5kb8rbpfs7ppzgq9kzm1xlhhvv960yfdbi6fd";
};
};
v4 = makeLanguages {
tessdataRev = "4.0.0";
tessdata = "1chw1ya5zf8aaj2ixr9x013x7vwwwjjmx6f2ag0d6i14lypygy28";
all = "0dqgkp369rcvq72yhgnzj1pj8yrv7kqzc7y6sqs7nzcq7l5qazlg";
# Run `./fetch-language-hashes <tessdataRev>` to generate these hashes
languages = {
afr = "1a9f8pnrspfmcq9gpjnxn2kkhjlsmh912bnpx671fjizxpmiri2y";
amh = "0m1vdyxjx57kmf2qra0p31k509y1cqn4pyckzw00i5n3wx11d2j0";
ara = "0nswl6n0s94g900j5k1gwzp7m140c0yd9a2fdb2lzhdvg1krf190";
asm = "025d9vrjcrwyd6cc6hrw1x8xqhicgrb9wpvhhmlw71ql04dadslf";
aze = "01shcs78a6xn3my8p3y42x1c9f5hzfn83w2n2nwpffbgz4y2nsgf";
aze_cyrl = "1sbd89i5r7rnkjh2in8j0plrxnfiill9jl8pr68iw77ghih6q1vg";
bel = "0dhyymsxcyzwal8474q7ag3m2akv0b92hkdz7rka5z1cxry1cn8c";
ben = "0a7q9414k3frn37x2qcglz722ysg2iivj6kqaaa0ik7z14ibc8v0";
bod = "0rh7x54nlh6ir6ldccj8hi7g8hwlp13r3fkljw8gndvhwmgfkkar";
bos = "1szym4n605hlx12a9vpz4jjs76jscajh22rgkqwbv4qdsl0gi3nd";
bre = "070f4c84iznblsw4jkwpzh9dss8nfb678160szm5r8dlv2yinrrk";
bul = "03bg2yw79lg8rl43y9288313jrfh0h69vl4s4cmlgbmnbx8pvxwj";
cat = "19xs691aj8yy2ff07c3gzm07zicd5ha0gmcjxjh9pknqf2gfy7qv";
ceb = "1896vn41hqc4anm6hjvrnn022i0p8pmhwsp5rv9w2cvr6738l79r";
ces = "0fh2g47msfr91285rnccxcmcshihm126sqy496s4vrr0vk8ix1nf";
chi_sim = "0qxkvbpm5l7gzsshnn72wfx473pprf5nmw8hd4i4x2qxnfddh1gw";
chi_sim_vert = "1f75pzvxbda82vxa2zb1z9b9f13sh81kzaw45vg5118ncsklj8w7";
chi_tra = "056vjws1fir1v5iv44pzykkxs5q1dbb2j8blhj47i53w1zf6g42m";
chi_tra_vert = "10c9cdycg1a5kwlgg60sh8yp07w2fl4whinpxfhlzrzs56allql4";
chr = "19qq8a6c27973djsc4xpcklis92r58x21fg4mz5azdyka5i1n46l";
cos = "0z9kx1hw8h5n00pcahxla808wya50wrkk8cz7x676pd93ibyrlyx";
cym = "13pk9cpf43xxqbz3blfz2av2yd1ma6ds6jbdiqw8anhhj7l9ch2d";
dan = "1jirmahxvyyswhhyzhinvcqaycz7m3ixchqrj3lgfcdi3anvabr2";
dan_frak = "17wcgdqxmbzn7qchnx5gsa05aj4wmhbwk43w173bl3wr6h5ylmh0";
deu = "194rqsg4nlycca9bg2fqf15xgcl110rxp182l7dbjfjhar4knsw9";
deu_frak = "12hhhp32f15c7fw2jp05mwim9ps14kmamhh6vmalvm7r2033vbm7";
div = "09mm9r5hxhsc4qpyg10ym9mc2kdpawx8zk0aiv1xpgd35rzpyz41";
dzo = "1zk7crgcazgqy5zmslp6iw4jws07nja31qdxx0rpzhn3c0bjgw1b";
ell = "1hhym18a9411953j47xjk47jx9ij9xi2qwlx05c93zl41528nsqg";
eng = "0iy07z182lwhqfa0q288ha691scpsry330aynaizn68wcmywk86s";
enm = "1dhr1qvil38bil43wk5ci645sbm3my2y9y7qlcbnwz2p4pflayvm";
epo = "1jig4db7050vww32vxsqyig3j1b0vgz9ipxbsw0jpkjia84k44n9";
equ = "02qwg6s1z7pynwm0p6dvpwi04ivfkr1s7qgssbla1dx7v0ih6rlg";
est = "1jxygahy6by7fbirbmjmd68k6560q1a3h5mvpzdx15h5fw0q58gl";
eus = "0cai7nm7si8680avrrls8bf9ski980rvsj560fh9y6n9rz7mh9mp";
fao = "1n3434jf18bzakbylzyg3jaw2ad4h376g56dsql32bgh2yvyww8a";
fas = "17wjkfka9725rz32clgqgk9msmbz4axs59vz30jmhhxyrkliafqb";
fil = "0p713k8g27df9z384ns111xqxii5kq20m8brflsmd3yckw1mibhz";
fin = "1wc3y9nnm7rb2c2c5fkj7cv7jb27jlkb2bh0g8kaz57h6imfmb2g";
fra = "04qrfvi6irlaahh1pgn5azyfhbhavm12yyybza8603alf8firh7a";
frk = "05cqmxxxjqdl5hjyzi6dpmixnjpd6f3jr6741yapdmnxvkzxkiyp";
frm = "0a86yy6hd0lvlbzvnzjmyapzc0rn7mnkdadqycd65bw1b714cvy2";
fry = "0i84r8g9hlkr9nlhypl4lq6ncrhbcpskqkdcijgk88c2fdknh57h";
gla = "17idyhb505waz9dnb8dsk54faw7y0xvvb12yw71k0skq3i90akar";
gle = "1q87h5zzcva54pg364d3hl6q9hdlydlyj1qmq8n5k7hqk11msxmk";
glg = "01xssz1rhpy3a0sm4i43nba61wc2srz6wv327vdw1kg8ijm0s0g4";
grc = "00x0s3smx4wg5h12y2b9al0j2jk1y3f0yy2x6f2qf7ps831drgyl";
guj = "028v4fgn0zi2044vk6j2rlqklc9i0kj22s52vhifmx1g02kz9154";
hat = "1bca516pr2cnyjlwycc7pr6gfmdjb8565hp06pw9nwpr20ry0hss";
heb = "1qfkffjh29b21frs0mv6llsrchixl5kjkpj1if7fq816g9mym9kx";
hin = "1rkfam5c6qil2590lfffzndhq3bncdgf4ij0cyjcglgyljgx0xnc";
hrv = "0da7b6mk0rwc9zlbqkycwjpddp3qpy07l643i00ia5a1zq35fmgp";
hun = "0w2s4mn9p74zqzmp9hh2017zgsh5v43k4lid4pv29f4b0y5gj9xi";
hye = "0ifzm875wlbjh4vkpmj1n6f14m8i174413l6pc6i44y4p5fpgxrf";
iku = "19arnv82xbxhbcy8pf9fv1sl5zc5707mk34nh7w46dlz86qkidmn";
ind = "1d421hizwni4m6sr4f3nqqpr1g744hzn0krk130m7x8mhzgamba5";
isl = "1hjjw8k2r9qa990ziq5wxr36kyf16mnmrqfmq5vbcjprka9h08pq";
ita = "1qyrvlf7pjxzyb29sc7aq3gq61bww14sijka44scxggfw7134l3r";
ita_old = "1pf8461jbj0vpyry0b54crmkf2bk9mh4klxvmj09jvf0aq2vm9s6";
jav = "18vvbyimj0y462amjmwvqa6h9n8l122j9v0w3hfp63hlxpfprm0m";
jpn = "16hma9w32vdh41ihymp894jza72b0d235hwriv18r78j5n86nhbg";
jpn_vert = "0yca09l9sbpfjgb2slnpb9q7qd7vz3a1wb6bkln30d3nl0d9r1rn";
kan = "0lcmx37rjfxkbhhbrld1ndmkwkm9w9b3pzxhas0cv5dqsx2f84jd";
kat = "1b164bgwa7bbvw4177h8fxfh0fbh4bycfl9pkaa184dpjpaiqpia";
kat_old = "1mgff7sh93hdp3wh0ckikdggrdgf0syp75s39pickpbkp9ic41ai";
kaz = "0h37y0kb5lwsp5zpl7bvxg3ryqldl5hxfnardliwgyqgnag951vi";
khm = "0m7x1fynr18sid2kjjw8xa9ika0a0fc6a6hvc7ihizi47893hdfb";
kir = "09kxwqpqf6kxjii07qlqsiii83zk12rszp88xnzzjp8rjsnk78s3";
kor = "0nsr43fwrp9876ia1fc0zcviv2n8hw16n0wfh158vhygwglvy84m";
kor_vert = "1wmvdznmikk9fq7wdffvn22scxmcl26vjh26jhicqwxpc7kg4bh8";
kur = "0gbsf3ny3n5mgb30v54bz3crgnimdpg19jn633pbpzryzg3xhd25";
kur_ara = "1sbj0cczhi9q119fbzpi0m6zr9kjp3k76bv9w8szkv1wc5y4fng6";
lao = "1gvxlg8bw3a4c9izg3c2a2yl7q6rsy7z9y64axdw9a04pz2ndbl5";
lat = "0b7an3q3xrf9c55bhiqqh7l45ga88l0kwvkp1akmlr98piach3vr";
lav = "0fqsmy47cygamddxyjfrdgkfa9bvmrvf4csvppnkdvfzy6iiv0c2";
lit = "0wjgbkwc3bf5khdqali7ylnhhs4xvpx19m3zx2y9s27v2wjbb6kv";
ltz = "02zdxbniiqfl87fzsiaaqgldqfsv15z5hja1xhxnqpl0nds7shfc";
mal = "0a41ifz8i6lj2ywxjkwvymxzxahkz2cjv4apbrawdj1h42bn7frd";
mar = "00swhlh9bckvmlxanfmlw5j4n9qqhggl84bsq0827bmijsqwnl44";
mkd = "1bqfiwxlzfpz4fs4z5ci2wbv01qhrcayk1inmk3dxq7dsywx1ajg";
mlt = "1rmmga2aw88hr7q7cfr5cvhnsgnf1mi069d5k7z66zp4vzbl4zyz";
mon = "1jksvcavn9plsmjdmhg40mwq5rlvrd1b9gvghdjg7zkf6qqqynlh";
mri = "0jlfawx20s5clsnk82ndy3v2zidh4cfh4acrh8nindk21xmiwh5i";
msa = "0m7zs8anaa3l4z5f3xvbhs4syp41dp4all2yfpi1plyr0hy784an";
mya = "0hljm5haadlr4k5rhw4mvhkygcnrr709rvl7amz7av3nskmi8mb1";
nep = "1dhy0m2h6xfgwibf92iwxsn926dmrhfvkg9rafkdaqcr4pq6w563";
nld = "0bspf5bv1s7qzm6k4aqbpq91zvk4kxxhx5zv08w91xfsa1zpdxmi";
nor = "08majhc9m0fjvac50yq52ia2af9kscclimwkv403klnj4kgf8ndq";
oci = "1mzrw9gsdjrd1xj3zv7l5gzgjq5jrygxf8cfkz20d9lls0wj1xdv";
ori = "1sh42mjzb1hv6l6lljp3wifjmz7wrv818f9f16m8qjikwqxm0s78";
osd = "03mvfk1q1xp1klpf4bwna903rnp51bkqr3gl5hvxybvrc3l2m7z1";
pan = "0165kr94p6x5yxzs4p8sfppvg9cywp65ps0xaym5rqz9iashz32h";
pol = "0g0b71ms6ddgykmkna4mlavgzgmh9vj6s62fi8l4ja93nfpr37hp";
por = "132jbhzmcsq8skanm15bw2niyx9xpbrqr411wn7w9r5i3cvnlv01";
pus = "0iiglnkn478al11avigsav625pn7ifscycnxpj6fg8835vjww3xr";
que = "01vkmfi9idjwskv5pllmrxpil0v5h7f7rzv5viclxrzkmbvrz9b5";
ron = "0ag6vs0cn3sryavs1mfrallgdgi4h28114g7m61rhlhq0z484g0m";
rus = "1hippm3w5d73sh50r136x0xff2p6x128ry2x4fywf6xdpv1f46v8";
san = "1qlpqkr5c5wqcf1bvlipy72advqnvd4wm61vghmrj2sda8mx87sx";
sin = "097d2s4ma0zsq0ab5qs1ylgl9l5phw91fnpsvb7vjmz2mw3ic964";
slk = "0c97pp5iffhdzyma605x8q3rx1qq9pq2h6cai1kppaj92rz3ji9k";
slk_frak = "16ivsam1g18zlpw6pgidvzwb7h8rvw1s10nigs6yfwir8hjxsgki";
slv = "0644jlm55p0dg4zchgrashmbv36zb4x649ckmf2jkbss8bzx7wsf";
snd = "1i2mfi4414l3v9nznjy7959y2jcr8ymvf6w8zpyrw6nad4d1aak7";
spa = "15kwvr7cpcnlxm1ja1yyc022dmsd04gmk7h1p0df12aicsscn3qb";
spa_old = "1jq80c4mi3rmwnfhb3mbaaq0ci101mgbibkji9ala4l5dkcwjra3";
sqi = "19cvvixhz9906p4c9i2grpr386rbp5alp4fp14xm9nd81bmq4701";
srp = "1jd25n13h6vxsa3gzbj6q6mdh02rjl4qrd1bffr5psp33asqvw0l";
srp_latn = "1k7577mn3z0bm5ma9d8l14sn5wpvw50hq1nxwbc36yn3a5b3mhiz";
sun = "0lvlaw3jfvr7b5v09669kq8mm19jdsk9g5h09jsa2gr6fvsq11pa";
swa = "0qy9qc5pa1dzzqrh1z40gk845z1r4d2smywnzydknbb3n240lhz0";
swe = "1y56r7bgzw0pqkdylbah07r1f0v03sblkggiql8x5200rhaxvqi4";
syr = "1vfj5fsiv170jghryrxwyz0i9mdsaki1kglxrklkb2caal9kwy38";
tam = "0rhhdbnp0a2hpg00vpc0xyxcl2w36i1kn63mrvwx1f9q7m3y1fmf";
tat = "0a74rp8pyp4yivv2xcy2m8xgwch8scr3wmk1fzniwzf43fsrqp76";
tel = "0gcq8hxhxvilyh7x7kiikq07hllqysc8sfyr88gvpj4xi092h2bx";
tgk = "1458gk0k6gk49n8lr6fj7l7cwkhxn0lrhybzq10zl1ly7yzjhf67";
tgl = "12yscwckdy3l21mvsrj1021gxw2isjrg369r08rsf7lh96wn4wkn";
tha = "01f0j7gsc5slxaaql1gqbhk4wlwaxc29dlmfxwjzikxc46gjl0w8";
tir = "1q6w48b1jchv55713pq20inzjjdymh32fw8wxfaj1qi7bjqfb9fk";
ton = "06g60ga8rys8jaimqrvd4svh40qs1nz4bszdnf2hdv05ibryibdq";
tur = "0g9g1wvibp61qbriy8ys948yfkl88xk9g8f93bnq8w8dx029b6s8";
uig = "09sajx21lw3a3ph62dyqr10pjaq2mij10sdhkhvvjiydk34dn548";
ukr = "14q8ls8gkrg7c9pc6qzm6yf5ady3i3303vs1hz4d2idcl6yry334";
urd = "15vszhqraxqdcng1069p6i4xq3ck3904q207nkbap6dfpcpjig40";
uzb = "03hyw0vavmjirqs4wkd5r85g91w2avsyl14z624fhm3gc66pqg7n";
uzb_cyrl = "1433lrrp2lfgb1k0a4sc20b35b2jcl8f1z92vm2936y7w04xpaq7";
vie = "02k40d3wji74d1jgvkr3zrn9gpzlmp0lqhrrdmc48r2sgvnrnk8n";
yid = "0xnbvi04xv1qapqg72wa3bjwbw51pkdnyncjpjp37vn6dzh04l0z";
yor = "07w3aci52ng6i6nyp97q5zb2dqlj08w6im90y1h691qah1x44zlv";
};
};
}

View file

@ -0,0 +1,29 @@
{ stdenv, fetchurl, fetchFromGitHub, autoreconfHook, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
stdenv.mkDerivation rec {
name = "tesseract-${version}";
version = "3.05.00";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tesseract";
rev = version;
sha256 = "11wrpcfl118wxsv2c3w2scznwb48c4547qml42s2bpdz079g8y30";
};
enableParallelBuilding = true;
nativeBuildInputs = [ pkgconfig autoreconfHook ];
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
LIBLEPT_HEADERSDIR = "${leptonica}/include";
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
platforms = with stdenv.lib.platforms; linux ++ darwin;
};
}

View file

@ -0,0 +1,27 @@
{ stdenv, fetchFromGitHub, autoreconfHook, autoconf-archive, pkgconfig
, leptonica, libpng, libtiff, icu, pango, opencl-headers }:
stdenv.mkDerivation rec {
name = "tesseract-${version}";
version = "4.0.0";
src = fetchFromGitHub {
owner = "tesseract-ocr";
repo = "tesseract";
rev = version;
sha256 = "1b5fi2vibc4kk9b30kkk4ais4bw8fbbv24bzr5709194hb81cav8";
};
enableParallelBuilding = true;
nativeBuildInputs = [ pkgconfig autoreconfHook autoconf-archive ];
buildInputs = [ leptonica libpng libtiff icu pango opencl-headers ];
meta = {
description = "OCR engine";
homepage = https://github.com/tesseract-ocr/tesseract;
license = stdenv.lib.licenses.asl20;
maintainers = with stdenv.lib.maintainers; [ viric earvstedt ];
platforms = with stdenv.lib.platforms; linux ++ darwin;
};
}

View file

@ -0,0 +1,58 @@
{ stdenv, makeWrapper, tesseractBase, languages
# A list of languages like [ "eng" "spa" … ] or `null` for all available languages
, enableLanguages ? null
# A list of files or a directory containing files
, tessdata ? (if enableLanguages == null then languages.all
else map (lang: languages.${lang}) enableLanguages)
# This argument is obsolete
, enableLanguagesHash ? null
}:
let
passthru = { inherit tesseractBase languages tessdata; };
tesseractWithData = tesseractBase.overrideAttrs (_: {
inherit tesseractBase tessdata;
buildInputs = [ makeWrapper ];
buildCommand = ''
makeWrapper {$tesseractBase,$out}/bin/tesseract --set-default TESSDATA_PREFIX $out/share/tessdata
# Recursively link include, share
cp -rs --no-preserve=mode $tesseractBase/{include,share} $out
cp -r --no-preserve=mode $tesseractBase/lib $out
# Fixup the store paths in lib so that the tessdata from this derivation is used.
if (( ''${#tesseractBase} != ''${#out} )); then
echo "Can't replace store paths due to differing lengths"
exit 1
fi
find $out/lib -type f -exec sed -i "s|$tesseractBase|$out|g" {} \;
if [[ -d "$tessdata" ]]; then
ln -s $tessdata/* $out/share/tessdata
else
for lang in $tessdata; do
ln -s $lang $out/share/tessdata/''${lang#/nix/store*-}
done
fi
if [[ ! -e $out/share/tessdata/eng.traineddata ]]; then
# This is a bug in Tesseract's internal tessdata discovery mechanism
echo "eng.traineddata must be present in tessdata for Tesseract to work"
exit 1
fi
'';
});
tesseract = (if enableLanguages == [] then tesseractBase else tesseractWithData) // passthru;
in
if enableLanguagesHash == null then
tesseract
else
stdenv.lib.warn "Argument `enableLanguagesHash` is obsolete and can be removed."
tesseract

View file

@ -75,19 +75,21 @@ stdenv.mkDerivation rec {
cp ${src}/leptonica_mod/* src/
'';
});
tesseract_modded = tesseract.overrideAttrs (attrs: {
prePatch = ''
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
cp ${src}/tesseract_mod/dawg.cpp api/
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
cp ${src}/tesseract_mod/openclwrapper.h opencl/
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
cp ${src}/tesseract_mod/tesscapi.cpp api/
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
'';
patches = [ ./tesseract.patch ];
});
tesseract_modded = tesseract.override {
tesseractBase = tesseract.tesseractBase.overrideAttrs (_: {
prePatch = ''
cp ${src}/tesseract_mod/{ambigs.cpp,ccutil.h,ccutil.cpp} ccutil/
cp ${src}/tesseract_mod/dawg.cpp api/
cp ${src}/tesseract_mod/{imagedata.cpp,tessdatamanager.cpp} ccstruct/
cp ${src}/tesseract_mod/openclwrapper.h opencl/
cp ${src}/tesseract_mod/{tessedit.cpp,thresholder.cpp} ccmain/
cp ${src}/tesseract_mod/tess_lang_mod_edge.h cube/
cp ${src}/tesseract_mod/tesscapi.cpp api/
cp ${src}/include_mod/{tesseract.h,leptonica.h} api/
'';
patches = [ ./tesseract.patch ];
});
};
in
[ zlib libpng ] ++
optional enableGSL gsl ++

View file

@ -306,6 +306,7 @@ mapAliases ({
terraform-provider-ibm = terraform-providers.ibm; # added 2018-09-28
terraform-provider-libvirt = terraform-providers.libvirt; # added 2018-09-28
terraform-provider-nixos = terraform-providers.nixos; # added 2018-09-28
tesseract_4 = tesseract4; # added 2018-12-19
tex-gyre-bonum-math = tex-gyre-math.bonum; # added 2018-04-03
tex-gyre-pagella-math = tex-gyre-math.pagella; # added 2018-04-03
tex-gyre-schola-math = tex-gyre-math.schola; # added 2018-04-03

View file

@ -19523,8 +19523,10 @@ in
termtosvg = callPackage ../tools/misc/termtosvg { };
tesseract = callPackage ../applications/graphics/tesseract { };
tesseract_4 = lowPrio (callPackage ../applications/graphics/tesseract/4.x.nix { });
inherit (callPackage ../applications/graphics/tesseract {})
tesseract3
tesseract4;
tesseract = tesseract3;
tetraproc = callPackage ../applications/audio/tetraproc { };