import shutil
import subprocess
import tempfile
+from typing import Iterator
from typing import Optional
from typing import Set
def parse_date(filename, text) -> Optional[datetime.datetime]:
+ """
+ Returns the first date yielded by parse_date_generator for the given
+ filename and text, or None if it yields nothing.
+ """
+ return next(parse_date_generator(filename, text), None)
+
+
+def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
- Returns the date of the document.
+ Yields every usable date found for the document, in match order: first
+ those in the filename (only when settings.FILENAME_DATE_ORDER is set),
+ then those in the text. Matches that fail to parse or that __filter
+ rejects are skipped.
"""
return date
return None
- date = None
-
- # if filename date parsing is enabled, search there first:
- if settings.FILENAME_DATE_ORDER:
- for m in re.finditer(DATE_REGEX, filename):
- date_string = m.group(0)
-
- try:
- date = __parser(date_string, settings.FILENAME_DATE_ORDER)
- except (TypeError, ValueError):
- # Skip all matches that do not parse to a proper date
- continue
-
- date = __filter(date)
- if date is not None:
- return date
-
- # Iterate through all regex matches in text and try to parse the date
- for m in re.finditer(DATE_REGEX, text):
- date_string = m.group(0)
+ # Parse a single regex match with the given date order; returns the
+ # filtered date, or whatever __filter returns for None when parsing fails.
+ # The match annotation is quoted so it is never evaluated at runtime:
+ # subscripting re.Match raises TypeError on Python versions before 3.9.
+ def __process_match(
+ match: "re.Match[str]",
+ date_order: str,
+ ) -> Optional[datetime.datetime]:
+ date_string = match.group(0)
try:
- date = __parser(date_string, settings.DATE_ORDER)
+ date = __parser(date_string, date_order)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
- continue
+ date = None
- date = __filter(date)
- if date is not None:
- return date
+ return __filter(date)
+
+ # Yield every non-None date produced from the DATE_REGEX matches in content.
+ def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
+ for m in re.finditer(DATE_REGEX, content):
+ date = __process_match(m, date_order)
+ if date is not None:
+ yield date
- return date
+ # if filename date parsing is enabled, search there first:
+ if settings.FILENAME_DATE_ORDER:
+ yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
+
+ # Iterate through all regex matches in text and try to parse the date
+ yield from __process_content(text, settings.DATE_ORDER)
class ParseError(Exception):
+import itertools
import json
import logging
import os
from django.http import Http404
from django.http import HttpResponse
from django.http import HttpResponseBadRequest
+from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from .models import StoragePath
from .models import Tag
from .parsers import get_parser_class_for_mime_type
+from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer
@action(methods=["get"], detail=True)
def suggestions(self, request, pk=None):
- try:
- doc = Document.objects.get(pk=pk)
- except Document.DoesNotExist:
- raise Http404()
+ """
+ Returns matching suggestions for the document with the given pk,
+ including up to five dates found in its filename and content.
+ Responds with 404 if the document does not exist.
+ """
+ doc = get_object_or_404(Document, pk=pk)
classifier = load_classifier()
+ gen = parse_date_generator(doc.filename, doc.content)
+ # Format the first five dates and drop repeats with dict.fromkeys, which
+ # keeps discovery order; a set would emit them in arbitrary order.
+ found = itertools.islice(gen, 5)
+ dates = list(dict.fromkeys(d.strftime("%Y-%m-%d") for d in found))
+
return Response(
{
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
dt.id for dt in match_document_types(doc, classifier)
],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
+ # Already formatted above; the generator only yields non-None dates.
+ "dates": dates,
},
)