import warnings
from datetime import datetime
from hashlib import sha256
+from pathlib import Path
from typing import Iterator
from typing import List
from typing import Optional
self._stemmer = None
self._stop_words = None
- def load(self):
+ def load(self) -> None:
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
with open(settings.MODEL_FILE, "rb") as f:
raise IncompatibleClassifierVersionError
def save(self):
- target_file = settings.MODEL_FILE
- target_file_temp = settings.MODEL_FILE.with_suffix(".pickle.part")
+ target_file: Path = settings.MODEL_FILE
+ target_file_temp = target_file.with_suffix(".pickle.part")
with open(target_file_temp, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
+
pickle.dump(self.last_doc_change_time, f)
pickle.dump(self.last_auto_type_hash, f)
pickle.dump(self.data_vectorizer, f)
pickle.dump(self.tags_binarizer, f)
-
pickle.dump(self.tags_classifier, f)
+
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
return content
- def predict_correspondent(self, content: str):
+ def predict_correspondent(self, content: str) -> Optional[int]:
if self.correspondent_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
else:
return None
- def predict_document_type(self, content: str):
+ def predict_document_type(self, content: str) -> Optional[int]:
if self.document_type_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
else:
return None
- def predict_tags(self, content: str):
+ def predict_tags(self, content: str) -> List[int]:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
else:
return []
- def predict_storage_path(self, content: str):
+ def predict_storage_path(self, content: str) -> Optional[int]:
if self.storage_path_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
import logging
import re
+from documents.classifier import DocumentClassifier
from documents.models import Correspondent
+from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
logger = logging.getLogger("paperless.matching")
-def log_reason(matching_model, document, reason):
+def log_reason(matching_model: MatchingModel, document: Document, reason: str):
class_name = type(matching_model).__name__
logger.debug(
f"{class_name} {matching_model.name} matched on document "
)
-def match_correspondents(document, classifier, user=None):
+def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_correspondent(document.content) if classifier else None
if user is None and document.owner is not None:
)
-def match_document_types(document, classifier, user=None):
+def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_document_type(document.content) if classifier else None
if user is None and document.owner is not None:
)
-def match_tags(document, classifier, user=None):
+def match_tags(document: Document, classifier: DocumentClassifier, user=None):
predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
if user is None and document.owner is not None:
)
-def match_storage_paths(document, classifier, user=None):
+def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_storage_path(document.content) if classifier else None
if user is None and document.owner is not None:
)
-def matches(matching_model, document):
+def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {}
document_content = document.content
import logging
import os
import shutil
+from typing import Optional
from celery import states
from celery.signals import before_task_publish
from filelock import FileLock
from documents import matching
+from documents.classifier import DocumentClassifier
from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_unique_filename
logger = logging.getLogger("paperless.handlers")
-def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
+def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs):
if document.owner is not None:
tags = get_objects_for_user_owner_aware(
document.owner,
def set_correspondent(
sender,
- document=None,
+ document: Document,
logging_group=None,
- classifier=None,
+ classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
def set_document_type(
sender,
- document=None,
+ document: Document,
logging_group=None,
- classifier=None,
+ classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
def set_tags(
sender,
- document=None,
+ document: Document,
logging_group=None,
- classifier=None,
+ classifier: Optional[DocumentClassifier] = None,
replace=False,
suggest=False,
base_url=None,
def set_storage_path(
sender,
- document=None,
+ document: Document,
logging_group=None,
- classifier=None,
+ classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
)
-def set_log_entry(sender, document=None, logging_group=None, **kwargs):
+def set_log_entry(sender, document: Document, logging_group=None, **kwargs):
ct = ContentType.objects.get(model="document")
user = User.objects.get(username="consumer")