&& python3 -m pip install --no-cache-dir --upgrade wheel \
&& echo "Installing Python requirements" \
&& python3 -m pip install --default-timeout=1000 --no-cache-dir --requirement requirements.txt \
+ && echo "Installing NLTK data" \
+ && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" snowball_data \
+ && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" stopwords \
+ && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/local/share/nltk_data" punkt \
&& echo "Cleaning up image" \
&& apt-get -y purge ${BUILD_PACKAGES} \
&& apt-get -y autoremove --purge \
export CONSUME_DIR="${PAPERLESS_CONSUMPTION_DIR:-/usr/src/paperless/consume}"
}
-nltk_data () {
- # Download the NLTK data sets into ${DATA_DIR}/nltk so they are stored outside the Docker container
- local -r nltk_data_dir="${DATA_DIR}/nltk"
- local -r truthy_things=("yes y 1 t true")
-
- # Download when PAPERLESS_ENABLE_NLTK is unset/empty, or when its value looks truthy
- if [[ -z "${PAPERLESS_ENABLE_NLTK}" ]] || [[ "${truthy_things[*]}" =~ ${PAPERLESS_ENABLE_NLTK,} ]]; then
- # NOTE(review): =~ is an unanchored regex match against "yes y 1 t true", and ${VAR,} lowercases only the first character, so e.g. "e" passes while "TRUE" presumably does not - confirm this is intended
- # Download or update the snowball stemmer data
- python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" snowball_data
-
- # Download or update the stopwords corpus
- python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" stopwords
-
- # Download or update the punkt tokenizer data
- python3 -W ignore::RuntimeWarning -m nltk.downloader -d "${nltk_data_dir}" punkt
-
- else
- echo "Skipping NLTK data download"
-
- fi
-
-}
-
custom_container_init() {
# Mostly borrowed from the LinuxServer.io base image
# https://github.com/linuxserver/docker-baseimage-ubuntu/tree/bionic/root/etc/cont-init.d
echo "Creating directory ${tmp_dir}"
mkdir -p "${tmp_dir}"
- nltk_data
-
set +e
echo "Adjusting permissions of paperless files. This may take a while."
chown -R paperless:paperless ${tmp_dir}
for lang in "${langs[@]}"; do
pkg="tesseract-ocr-$lang"
- # English is installed by default
- #if [[ "$lang" == "eng" ]]; then
- # continue
- #fi
if dpkg -s "$pkg" &>/dev/null; then
echo "Package $pkg already installed!"
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
-NLTK_DIR = os.path.join(DATA_DIR, "nltk")
+NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")