diff --git a/pkgs/development/python-modules/unstructured/default.nix b/pkgs/development/python-modules/unstructured/default.nix
new file mode 100644
index 00000000000..542e6f0adc2
--- /dev/null
+++ b/pkgs/development/python-modules/unstructured/default.nix
@@ -0,0 +1,148 @@
+{ lib
+, buildPythonPackage
+, fetchFromGitHub
+# propagated build inputs
+, chardet
+, filetype
+, lxml
+, msg-parser
+, nltk
+, openpyxl
+, pandas
+, pdf2image
+, pdfminer-six
+, pillow
+, pypandoc
+, python-docx
+, python-pptx
+, python-magic
+, markdown
+, requests
+, tabulate
+, xlrd
+# optional-dependencies
+, langdetect
+, sacremoses
+, sentencepiece
+, torch
+, transformers
+, unstructured-inference
+, s3fs
+, fsspec
+, adlfs
+# , discord-py
+, pygithub
+, python-gitlab
+, praw
+, slack-sdk
+, wikipedia
+, google-api-python-client
+# , gcsfs
+, elasticsearch8
+, jq
+# , dropboxdrivefs
+, atlassian-python-api
+# test dependencies
+, pytestCheckHook
+, black
+, coverage
+, click
+, freezegun
+# , label-studio-sdk
+, mypy
+, pytest-cov
+, pytest-mock
+, vcrpy
+, grpcio
+}:
+let
+  version = "0.8.1";
+  # Optional extras keyed by extra name. Empty lists are placeholders for extras
+  # whose dependencies (named in the trailing comment) are not taken as arguments.
+  optional-dependencies = {
+    huggingface = [
+      langdetect
+      sacremoses
+      sentencepiece
+      torch
+      transformers
+    ];
+    local-inference = [ unstructured-inference ];
+    s3 = [ s3fs fsspec ];
+    azure = [ adlfs fsspec ];
+    discord = [ ]; # discord-py
+    github = [ pygithub ];
+    gitlab = [ python-gitlab ];
+    reddit = [ praw ];
+    slack = [ slack-sdk ];
+    wikipedia = [ wikipedia ];
+    google-drive = [ google-api-python-client ];
+    gcs = []; # gcsfs fsspec
+    elasticsearch = [ elasticsearch8 jq ];
+    dropbox = []; # dropboxdrivefs fsspec
+    confluence = [ atlassian-python-api ];
+  };
+in
+buildPythonPackage {
+  pname = "unstructured";
+  inherit version;
+  format = "setuptools";
+
+  src = fetchFromGitHub {
+    owner = "Unstructured-IO";
+    repo = "unstructured";
+    rev = version;
+    hash = "sha256-I9pRycg3uGn7Xfd4YGxic16SXi8+gslsIVarzDT8X2w=";
+  };
+
+  propagatedBuildInputs = [
+    chardet
+    filetype
+    lxml
+    msg-parser
+    nltk
+    openpyxl
+    pandas
+    pdf2image
+    pdfminer-six
+    pillow
+    pypandoc
+    python-docx
+    python-pptx
+    python-magic
+    markdown
+    requests
+    tabulate
+    xlrd
+  ];
+
+  pythonImportsCheck = [ "unstructured" ];
+
+  # tests try to download punkt from nltk
+  # figure out how to make it available to enable the tests
+  doCheck = false;
+
+  nativeCheckInputs = [
+    pytestCheckHook
+    black
+    coverage
+    click
+    freezegun
+    mypy
+    pytest-cov
+    pytest-mock
+    vcrpy
+    grpcio
+  ];
+
+  # expose the extras on the package; the let-binding alone is otherwise unused
+  passthru.optional-dependencies = optional-dependencies;
+
+  meta = with lib; {
+    description = "Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines";
+    homepage = "https://github.com/Unstructured-IO/unstructured";
+    changelog = "https://github.com/Unstructured-IO/unstructured/blob/${version}/CHANGELOG.md";
+    license = licenses.asl20;
+    maintainers = with maintainers; [ happysalada ];
+  };
+}
diff --git a/pkgs/top-level/python-packages.nix b/pkgs/top-level/python-packages.nix
index 08fba87746d..0465cbe4187 100644
--- a/pkgs/top-level/python-packages.nix
+++ b/pkgs/top-level/python-packages.nix
@@ -13100,6 +13100,8 @@ self: super: with self; {
 
   unrpa = callPackage ../development/python-modules/unrpa { };
 
+  unstructured = callPackage ../development/python-modules/unstructured { };
+
   unstructured-inference = callPackage ../development/python-modules/unstructured-inference { };
 
   untangle = callPackage ../development/python-modules/untangle { };