]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
doc: Update Solr schema
authorAki Tuomi <aki.tuomi@open-xchange.com>
Fri, 17 Dec 2021 09:36:37 +0000 (11:36 +0200)
committeraki.tuomi <aki.tuomi@open-xchange.com>
Tue, 18 Jan 2022 12:15:37 +0000 (12:15 +0000)
Use ICU tokenization and normalization.
Reorder filters to make more sense.

doc/solr-config-7.7.0.xml
doc/solr-schema-7.7.0.xml
doc/solr-schema.xml [changed from file to symlink]

index 3661874d6b90c58e6f1bdb4df4d12e6eaf2e4075..1f74c3fce91f0e28f88f62231a3085e245a8f7bc 100644 (file)
@@ -37,6 +37,9 @@
   <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
   <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
 
+  <lib dir="${solr.install.dir:../../../..}/contrib/analysis-extras/lib" regex="icu4j-.*\.jar"/>
+  <lib dir="${solr.install.dir:../../../..}/contrib/analysis-extras/lucene-libs" regex="lucene-analyzers-icu-.*\.jar"/>
+
   <!-- Data Directory
 
        Used to specify an alternate directory to hold all index data
index 601a290c4e3e22b212ba6a900740a79bb4e0d57e..ee7f01353fb3b8e010d2b6a6a806b323b86d54df 100644 (file)
@@ -1,29 +1,56 @@
 <?xml version="1.0" encoding="UTF-8"?>
 
-<schema name="dovecot" version="2.0">
+<schema name="dovecot" version="2.1">
+  <uniqueKey>id</uniqueKey>
+
   <fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>
   <fieldType name="long" class="solr.LongPointField" positionIncrementGap="0"/>
-  <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
-
   <fieldType name="text" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
-    <analyzer type="index">
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" splitOnNumerics="1" catenateAll="1" catenateWords="1"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
+
+  <analyzer type="index">
+      <!-- Use a Unicode-aware tokenizer, see
+           https://solr.apache.org/guide/7_7/tokenizers.html#icu-tokenizer
+      -->
+      <tokenizer class="solr.ICUTokenizerFactory"/>
+      <!-- Unicode aware case folding to normalize input, see
+           https://solr.apache.org/guide/7_7/filter-descriptions.html#icu-folding-filter
+      -->
+      <filter class="solr.ICUFoldingFilterFactory"/>
+      <!-- Protect protected words from being modified by stemmers. Edit protwords.txt
+           to customize -->
       <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+      <!-- Split tokens from word delimiters -->
+      <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1"
+              generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1"
+              splitOnNumerics="1" catenateAll="1" catenateWords="1"/>
+      <!-- Porter is a stemming filter optimized for English. For other languages
+           use SnowballPorterFilterFactory here instead, see https://solr.apache.org/guide/7_7/filter-descriptions.html#snowball-porter-stemmer-filter
+           <filter class="solr.SnowballPorterFilterFactory" language="French"/>
+      -->
       <filter class="solr.PorterStemFilterFactory"/>
+      <!-- Map synonyms, use synonyms.txt to customize -->
+      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true"
+              synonyms="synonyms.txt"/>
+      <!-- Remove stop words; customize with stopwords.txt. Solr usually ships
+           stop word lists for multiple languages, so you can choose from those. -->
+      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
+      <!-- Flatten the token graph so it can be consumed by the indexer -->
+      <filter class="solr.FlattenGraphFilterFactory"/>
     </analyzer>
     <analyzer type="query">
-      <tokenizer class="solr.StandardTokenizerFactory"/>
-      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
-      <filter class="solr.FlattenGraphFilterFactory"/>
-      <filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
-      <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" splitOnNumerics="1" catenateAll="1" catenateWords="1"/>
-      <filter class="solr.LowerCaseFilterFactory"/>
+      <tokenizer class="solr.ICUTokenizerFactory"/>
+      <filter class="solr.ICUFoldingFilterFactory"/>
       <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+      <filter class="solr.WordDelimiterGraphFilterFactory" catenateNumbers="1"
+              generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1"
+              splitOnNumerics="1" catenateAll="1" catenateWords="1"/>
+      <!-- See above for comment on Porter -->
       <filter class="solr.PorterStemFilterFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
+      <!-- Unlike the plain stop filter, this keeps the last token unless it is followed by a separator,
+           see https://solr.apache.org/guide/7_7/filter-descriptions.html#suggest-stop-filter
+      -->
+      <filter class="solr.SuggestStopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
     </analyzer>
   </fieldType>
 
deleted file mode 100644 (file)
index cea6a3bb401f8b906362fe00ba849ea670d9c2aa..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1,60 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-
-<!--
-For fts-solr:
-
-This is the Solr schema file, place it into solr/conf/schema.xml. You may
-want to modify the tokenizers and filters.
--->
-<schema name="dovecot" version="1.5">
-  <types>
-    <!-- IMAP has 32bit unsigned ints but java ints are signed, so use longs -->
-    <fieldType name="string" class="solr.StrField" />
-    <fieldType name="long" class="solr.TrieLongField" />
-    <fieldType name="boolean" class="solr.BoolField" />
-
-    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
-      <analyzer type="index">
-        <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPossessiveFilterFactory"/>
-        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
-        <filter class="solr.EnglishMinimalStemFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.EnglishPossessiveFilterFactory"/>
-        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
-        <filter class="solr.EnglishMinimalStemFilterFactory"/>
-      </analyzer>
-    </fieldType>
- </types>
-
-
- <fields>
-   <field name="id" type="string" indexed="true" stored="true" required="true" />
-   <field name="uid" type="long" indexed="true" stored="true" required="true" />
-   <field name="box" type="string" indexed="true" stored="true" required="true" />
-   <field name="user" type="string" indexed="true" stored="true" required="true" />
-
-   <field name="hdr" type="text" indexed="true" stored="false" />
-   <field name="body" type="text" indexed="true" stored="false" />
-
-   <field name="from" type="text" indexed="true" stored="false" />
-   <field name="to" type="text" indexed="true" stored="false" />
-   <field name="cc" type="text" indexed="true" stored="false" />
-   <field name="bcc" type="text" indexed="true" stored="false" />
-   <field name="subject" type="text" indexed="true" stored="false" />
-
-   <!-- Used by Solr internally: -->
-   <field name="_version_" type="long" indexed="true" stored="true"/>
- </fields>
-
- <uniqueKey>id</uniqueKey>
-</schema>
new file mode 120000 (symlink)
index 0000000000000000000000000000000000000000..aee10ffbde306cdeeb10ea7e97aa94cd1c98fdc1
--- /dev/null
@@ -0,0 +1 @@
+solr-schema-7.7.0.xml
\ No newline at end of file