git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Bakes the NLTK data into the image (~60mb)
author: Trenton H <797416+stumpylog@users.noreply.github.com>
Fri, 2 Dec 2022 20:02:56 +0000 (12:02 -0800)
committer: Trenton H <797416+stumpylog@users.noreply.github.com>
Wed, 7 Dec 2022 17:36:32 +0000 (09:36 -0800)
Dockerfile
docker/docker-entrypoint.sh
src/paperless/settings.py

index 2cb25c0c37a2b98a6d39d7941f3371a8b468ce0c..a1f57724305036b18b256404de79205e16ea1655 100644 (file)
@@ -228,6 +228,10 @@ RUN set -eux \
     && python3 -m pip install --no-cache-dir --upgrade wheel \
   && echo "Installing Python requirements" \
     && python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \
+  && echo "Installing NLTK data" \
+    && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \
+    && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \
+    && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \
   && echo "Cleaning up image" \
     && apt-get -y purge ${BUILD_PACKAGES} \
     && apt-get -y autoremove --purge \
index 74e0806712486625e8e38cf31abc3ba4860329db..00be59add574de9234f4ecde0d05130cd9a42ab8 100755 (executable)
@@ -53,30 +53,6 @@ map_folders() {
        export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
 }
 
-nltk_data () {
-       # Store the NLTK data outside the Docker container
-       local -r nltk_data_dir="${DATA_DIR}/nltk"
-       local -r truthy_things=("yes y 1 t true")
-
-       # If not set, or it looks truthy
-       if [[ -z "${PAPERLESS_ENABLE_NLTK}" ]] || [[ "${truthy_things[*]}" =~ ${PAPERLESS_ENABLE_NLTK,} ]]; then
-
-               # Download or update the snowball stemmer data
-               python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data
-
-               # Download or update the stopwords corpus
-               python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords
-
-               # Download or update the punkt tokenizer data
-               python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt
-
-       else
-               echo "Skipping NLTK data download"
-
-       fi
-
-}
-
 custom_container_init() {
        # Mostly borrowed from the LinuxServer.io base image
        # https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d
@@ -157,8 +133,6 @@ initialize() {
        echo "Creating directory ${tmp_dir}"
        mkdir -p "${tmp_dir}"
 
-       nltk_data
-
        set +e
        echo "Adjusting permissions of paperless files. This may take a while."
        chown -R paperless:paperless ${tmp_dir}
@@ -191,10 +165,6 @@ install_languages() {
 
        for lang in "${langs[@]}"; do
                pkg="tesseract-ocr-$lang"
-               # English is installed by default
-               #if [[ "$lang" ==  "eng" ]]; then
-               #    continue
-               #fi
 
                if dpkg -s "$pkg" &>/dev/null; then
                        echo "Package $pkg already installed!"
index c11e434892982938a350009a1a8fe430e4b78adc..40c7a5c3b4e77360b65bb60b4ed80d2f687aa5cb 100644 (file)
@@ -123,7 +123,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
 
 DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
 
-NLTK_DIR = os.path.join(DATA_DIR, "nltk")
+NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
 
 TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")