git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Document index accent folding [feature-accent-folding]
author shamoon <4887959+shamoon@users.noreply.github.com>
Mon, 15 Apr 2024 04:16:52 +0000 (21:16 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Sun, 15 Jun 2025 17:11:21 +0000 (10:11 -0700)
src/documents/index.py
src/documents/tests/test_api_search.py

index 10de0424569b942f1564e29ad8b637b87effc5b2..f0f2b40473d14884b091393110de0b109bc8cea8 100644 (file)
@@ -17,6 +17,8 @@ from guardian.shortcuts import get_users_with_perms
 from whoosh import classify
 from whoosh import highlight
 from whoosh import query
+from whoosh.analysis import CharsetFilter
+from whoosh.analysis import StemmingAnalyzer
 from whoosh.fields import BOOLEAN
 from whoosh.fields import DATETIME
 from whoosh.fields import KEYWORD
@@ -36,6 +38,7 @@ from whoosh.qparser.dateparse import DateParserPlugin
 from whoosh.qparser.dateparse import English
 from whoosh.qparser.plugins import FieldsPlugin
 from whoosh.scoring import TF_IDF
+from whoosh.support.charset import accent_map
 from whoosh.util.times import timespan
 from whoosh.writing import AsyncWriter
 
@@ -54,10 +57,13 @@ logger = logging.getLogger("paperless.index")
 
 
 def get_schema() -> Schema:
+    # add accent-folding filter to a stemming analyzer:
+    af_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
+
     return Schema(
         id=NUMERIC(stored=True, unique=True),
-        title=TEXT(sortable=True),
-        content=TEXT(),
+        title=TEXT(sortable=True, analyzer=af_analyzer),
+        content=TEXT(analyzer=af_analyzer),
         asn=NUMERIC(sortable=True, signed=False),
         correspondent=TEXT(sortable=True),
         correspondent_id=NUMERIC(),
index 8f316c1451c623ce9fff78de32f691fb8b229365..65f497d6734738c78714c0114e923e28b1e31927 100644 (file)
@@ -557,7 +557,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
 
         response = self.client.get("/api/search/autocomplete/?term=app")
         self.assertEqual(response.status_code, status.HTTP_200_OK)
-        self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"])
+        self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
 
         d3.owner = u2
 
@@ -566,7 +566,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
 
         response = self.client.get("/api/search/autocomplete/?term=app")
         self.assertEqual(response.status_code, status.HTTP_200_OK)
-        self.assertEqual(response.data, [b"apples", b"applebaum"])
+        self.assertEqual(response.data, [b"appl", b"applebaum"])
 
         assign_perm("view_document", u1, d3)
 
@@ -575,7 +575,7 @@ class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
 
         response = self.client.get("/api/search/autocomplete/?term=app")
         self.assertEqual(response.status_code, status.HTTP_200_OK)
-        self.assertEqual(response.data, [b"apples", b"applebaum", b"appletini"])
+        self.assertEqual(response.data, [b"appl", b"applebaum", b"appletini"])
 
     def test_search_autocomplete_field_name_match(self):
         """