From: Stephan Bosch Date: Fri, 21 Mar 2025 18:40:01 +0000 (+0100) Subject: lib-language: Drop old word break and word boundary lookup implementation X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=99f5e0a551ec4bc59bd2bfb0c3621caf09e0d04b;p=thirdparty%2Fdovecot%2Fcore.git lib-language: Drop old word break and word boundary lookup implementation --- diff --git a/.gitignore b/.gitignore index 942de1e7b4..caa6bedfd0 100644 --- a/.gitignore +++ b/.gitignore @@ -105,10 +105,6 @@ src/lib/unicode-data-tables.h src/lib/unicode-data-types.c src/lib/unicode-data-types.h src/lib-compression/bench-compression -src/lib-language/PropList.txt -src/lib-language/WordBreakProperty.txt -src/lib-language/word-boundary-data.c -src/lib-language/word-break-data.c src/lib-dict-backend/dict-drivers-register.c src/lib-json/json-format src/lib-sql/sql-drivers-register.c diff --git a/src/lib-language/Makefile.am b/src/lib-language/Makefile.am index 5c0e6c7397..49d3b6db70 100644 --- a/src/lib-language/Makefile.am +++ b/src/lib-language/Makefile.am @@ -1,14 +1,10 @@ noinst_LTLIBRARIES = liblanguage.la -# I$(top_srcdir)/src/lib-language needed to include -# word-break-data.c and word-boundary-data.c -# in lang-tokenizer-generic.c AM_CPPFLAGS = \ -I$(top_srcdir)/src/lib \ -I$(top_srcdir)/src/lib-settings \ -I$(top_srcdir)/src/lib-test \ -I$(top_srcdir)/src/lib-mail \ - -I$(top_srcdir)/src/lib-language \ -I$(top_srcdir)/src/lib-var-expand \ $(LIBEXTTEXTCAT_CFLAGS) \ $(LIBICU_CFLAGS) \ @@ -33,29 +29,10 @@ dist_stopwords_DATA = \ stopwords/stopwords_sv.txt \ stopwords/stopwords_tr.txt -BUILT_SOURCES = $(srcdir)/word-boundary-data.c \ - $(srcdir)/word-break-data.c - EXTRA_DIST = \ udhr_fra.txt \ - PropList.txt \ - word-properties.pl \ - WordBreakProperty.txt \ - word-boundary-data.c \ - word-break-data.c \ stopwords/stopwords_malformed.txt -$(srcdir)/WordBreakProperty.txt: - $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ https://dovecot.org/res/WordBreakProperty.txt -$(srcdir)/word-boundary-data.c: $(srcdir)/word-properties.pl $(srcdir)/WordBreakProperty.txt - $(AM_V_at)$(PERL) $(srcdir)/word-properties.pl boundaries $(srcdir)/WordBreakProperty.txt > $@ - -$(srcdir)/PropList.txt: - $(AM_V_at)test -f $@ || $(WGET) -nv -O $@ https://dovecot.org/res/PropList.txt -$(srcdir)/word-break-data.c: $(srcdir)/word-properties.pl $(srcdir)/PropList.txt - $(AM_V_at)$(PERL) $(srcdir)/word-properties.pl breaks $(srcdir)/PropList.txt > $@ - - if BUILD_LANG_STEMMER STEMMER_LIBS = -lstemmer endif diff --git a/src/lib-language/word-properties.pl b/src/lib-language/word-properties.pl deleted file mode 100644 index c600d14373..0000000000 --- a/src/lib-language/word-properties.pl +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env perl -use strict; -use warnings; - -my @categories; -my $which = shift(@ARGV); -if ($which eq 'boundaries') { - @categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter - Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet); -} elsif ($which eq 'breaks') { - @categories = qw(White_Space Dash Quotation_Mark Terminal_Punctuation STerm Pattern_White_Space); -} else { - die "specify 'boundaries' or 'breaks'"; -} - -my $catregexp=join('|', @categories); -my %catlists = map { $_ => []; } (@categories); - -while(<>) { - next if (m/^#/ or m/^\s*$/); - push(@{$catlists{$3}}, defined($2) ? (hex($1)..hex($2)) : hex($1)) - if (m/([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s+; ($catregexp) #/) -} - -print "/* This file is automatically generated by word-properties.pl from $ARGV */\n"; -foreach(@categories) { - my $arref=$catlists{$_}; - print "static const uint32_t ${_}[]= {\n"; - while(scalar(@$arref)) { - print("\t", join(", ", map { sprintf("0x%05X", $_); } splice(@$arref, 0, 8))); - print(scalar(@$arref) ? ", \n" : "\n"); - } - print("};\n"); -}