From: Bruno Haible Date: Tue, 16 Sep 2008 00:19:13 +0000 (+0000) Subject: Do fuzzy matching by calling message_fuzzy_index_search on a hashed index, X-Git-Tag: v0.18~338 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=101de35be7c8e73329d8801f9eca938690130cb1;p=thirdparty%2Fgettext.git Do fuzzy matching by calling message_fuzzy_index_search on a hashed index, rather than through message_list_search_fuzzy. --- diff --git a/gettext-tools/src/Makefile.am b/gettext-tools/src/Makefile.am index f986d314f..0789660aa 100644 --- a/gettext-tools/src/Makefile.am +++ b/gettext-tools/src/Makefile.am @@ -129,6 +129,7 @@ LIBGREP = ../libgrep/libgrep.a # Source dependencies. msgcmp_SOURCES = msgcmp.c +msgcmp_SOURCES += msgl-fsearch.c msgfmt_SOURCES = msgfmt.c msgfmt_SOURCES += \ write-mo.c write-java.c write-csharp.c write-resources.c write-tcl.c \ diff --git a/gettext-tools/src/msgcmp.c b/gettext-tools/src/msgcmp.c index c51dd51a4..257a21dc8 100644 --- a/gettext-tools/src/msgcmp.c +++ b/gettext-tools/src/msgcmp.c @@ -38,7 +38,10 @@ #include "read-po.h" #include "read-properties.h" #include "read-stringtable.h" +#include "xmalloca.h" +#include "po-charset.h" #include "msgl-iconv.h" +#include "msgl-fsearch.h" #include "c-strstr.h" #include "c-strcase.h" #include "propername.h" @@ -297,7 +300,9 @@ remove_obsoletes (msgdomain_list_ty *mdlp) static void match_domain (const char *fn1, const char *fn2, - message_list_ty *defmlp, message_list_ty *refmlp, + message_list_ty *defmlp, message_fuzzy_index_ty **defmlp_findex, + const char *def_canon_charset, + message_list_ty *refmlp, int *nerrors) { size_t j; @@ -334,11 +339,31 @@ this message needs to be reviewed by the translator")); similar message, it could be a typo, or the suggestion may help. */ (*nerrors)++; - defmsg = - (use_fuzzy_matching - ? message_list_search_fuzzy (defmlp, - refmsg->msgctxt, refmsg->msgid) - : NULL); + if (use_fuzzy_matching) + { + if (false) + { + /* Old, slow code. 
*/ + defmsg = + message_list_search_fuzzy (defmlp, + refmsg->msgctxt, refmsg->msgid); + } + else + { + /* Speedup through early abort in fstrcmp(), combined with + pre-sorting of the messages through a hashed index. */ + /* Create the fuzzy index lazily. */ + if (*defmlp_findex == NULL) + *defmlp_findex = + message_fuzzy_index_alloc (defmlp, def_canon_charset); + defmsg = + message_fuzzy_index_search (*defmlp_findex, + refmsg->msgctxt, refmsg->msgid, + FUZZY_THRESHOLD, false); + } + } + else + defmsg = NULL; if (defmsg) { po_gram_error_at_line (&refmsg->pos, _("\ @@ -363,6 +388,7 @@ compare (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax) msgdomain_list_ty *ref; int nerrors; size_t j, k; + const char *def_canon_charset; message_list_ty *empty_list; /* This is the master file, created by a human. */ @@ -406,6 +432,55 @@ compare (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax) def = iconv_msgdomain_list (def, "UTF-8", true, fn1); } + /* Determine canonicalized encoding name of the definitions now, after + conversion. Only used for fuzzy matching. */ + if (use_fuzzy_matching) + { + def_canon_charset = def->encoding; + if (def_canon_charset == NULL) + { + char *charset = NULL; + + /* Get the encoding of the definitions file. 
*/ + for (k = 0; k < def->nitems; k++) + { + message_list_ty *mlp = def->item[k]->messages; + + for (j = 0; j < mlp->nitems; j++) + if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete) + { + const char *header = mlp->item[j]->msgstr; + + if (header != NULL) + { + const char *charsetstr = c_strstr (header, "charset="); + + if (charsetstr != NULL) + { + size_t len; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + charset = (char *) xmalloca (len + 1); + memcpy (charset, charsetstr, len); + charset[len] = '\0'; + break; + } + } + } + if (charset != NULL) + break; + } + if (charset != NULL) + def_canon_charset = po_charset_canonicalize (charset); + if (def_canon_charset == NULL) + /* Unspecified encoding. Assume unibyte encoding. */ + def_canon_charset = po_charset_ascii; + } + } + else + def_canon_charset = NULL; + empty_list = message_list_alloc (false); /* Every entry in the xgettext generated file must be matched by a @@ -417,12 +492,19 @@ compare (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax) const char *domain = ref->item[k]->domain; message_list_ty *refmlp = ref->item[k]->messages; message_list_ty *defmlp; + message_fuzzy_index_ty *defmlp_findex; defmlp = msgdomain_list_sublist (def, domain, false); if (defmlp == NULL) defmlp = empty_list; - match_domain (fn1, fn2, defmlp, refmlp, &nerrors); + defmlp_findex = NULL; + + match_domain (fn1, fn2, defmlp, &defmlp_findex, def_canon_charset, + refmlp, &nerrors); + + if (defmlp_findex != NULL) + message_fuzzy_index_free (defmlp_findex); } else { @@ -436,7 +518,15 @@ compare (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax) /* Ignore the default message domain if it has no messages. 
*/ if (k > 0 || defmlp->nitems > 0) - match_domain (fn1, fn2, defmlp, refmlp, &nerrors); + { + message_fuzzy_index_ty *defmlp_findex = NULL; + + match_domain (fn1, fn2, defmlp, &defmlp_findex, def_canon_charset, + refmlp, &nerrors); + + if (defmlp_findex != NULL) + message_fuzzy_index_free (defmlp_findex); + } } } diff --git a/gettext-tools/src/msgl-fsearch.c b/gettext-tools/src/msgl-fsearch.c index e612f0172..dd2a1d367 100644 --- a/gettext-tools/src/msgl-fsearch.c +++ b/gettext-tools/src/msgl-fsearch.c @@ -486,11 +486,19 @@ mult_index_list_free (struct mult_index_list *accu) /* Find a good match for the given msgctxt and msgid in the given fuzzy index. The match does not need to be optimal. Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND. - LOWER_BOUND must be >= FUZZY_THRESHOLD. */ + LOWER_BOUND must be >= FUZZY_THRESHOLD. + If HEURISTIC is true, only the few best messages among the list - according + to a certain heuristic - are considered. If HEURISTIC is false, all + messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered, + like in message_list_search_fuzzy (except that in ambiguous cases where + several best matches exist, message_list_search_fuzzy chooses the one with + the smallest index whereas message_fuzzy_index_search makes a better + choice). */ message_ty * message_fuzzy_index_search (message_fuzzy_index_ty *findex, const char *msgctxt, const char *msgid, - double lower_bound) + double lower_bound, + bool heuristic) { const char *str = msgid; @@ -538,17 +546,28 @@ message_fuzzy_index_search (message_fuzzy_index_ty *findex, /* Sort in decreasing count order. */ mult_index_list_sort (&accu); - /* Take the first few messages from this sorted list, and - maximize the fstrcmp() result. */ + /* Iterate over this sorted list, and maximize the + fuzzy_search_goal_function() result. + If HEURISTIC is true, take only the first few messages. 
+ If HEURISTIC is false, consider all messages - to match + the behaviour of message_list_search_fuzzy -, but process + them in the order of the sorted list. This increases + the chances that the later calls to fstrcmp_bounded() (via + fuzzy_search_goal_function()) terminate quickly, thanks + to the best_weight which will be quite high already after + the first few messages. */ { size_t count; struct mult_index *ptr; message_ty *best_mp; double best_weight; - count = findex->firstfew; - if (count > accu.nitems) - count = accu.nitems; + count = accu.nitems; + if (heuristic) + { + if (count > findex->firstfew) + count = findex->firstfew; + } best_weight = lower_bound; best_mp = NULL; diff --git a/gettext-tools/src/msgl-fsearch.h b/gettext-tools/src/msgl-fsearch.h index d5f796251..c890f3717 100644 --- a/gettext-tools/src/msgl-fsearch.h +++ b/gettext-tools/src/msgl-fsearch.h @@ -20,6 +20,8 @@ #include "message.h" +#include <stdbool.h> + #ifdef __cplusplus extern "C" { @@ -41,11 +43,19 @@ extern message_fuzzy_index_ty * /* Find a good match for the given msgctxt and msgid in the given fuzzy index. The match does not need to be optimal. Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND. - LOWER_BOUND must be >= FUZZY_THRESHOLD. */ + LOWER_BOUND must be >= FUZZY_THRESHOLD. + If HEURISTIC is true, only the few best messages among the list - according + to a certain heuristic - are considered. If HEURISTIC is false, all + messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered, + like in message_list_search_fuzzy (except that in ambiguous cases where + several best matches exist, message_list_search_fuzzy chooses the one with + the smallest index whereas message_fuzzy_index_search makes a better + choice). */ extern message_ty * message_fuzzy_index_search (message_fuzzy_index_ty *findex, const char *msgctxt, const char *msgid, - double lower_bound); + double lower_bound, + bool heuristic); /* Free a fuzzy index. 
*/ extern void diff --git a/gettext-tools/src/msgmerge.c b/gettext-tools/src/msgmerge.c index be765e904..4c48be04d 100644 --- a/gettext-tools/src/msgmerge.c +++ b/gettext-tools/src/msgmerge.c @@ -680,6 +680,13 @@ struct definitions_ty for speed when doing the exact searches. */ message_list_list_ty *lists; + /* A fuzzy index of the current list of non-compendium messages, for speed + when doing fuzzy searches. Used only if use_fuzzy_matching is true. */ + message_fuzzy_index_ty *curr_findex; + /* A once-only execution guard for the initialization of the fuzzy index. + Needed for OpenMP. */ + gl_lock_define(, curr_findex_init_lock) + /* A fuzzy index of the compendiums, for speed when doing fuzzy searches. Used only if use_fuzzy_matching is true and compendiums != NULL. */ message_fuzzy_index_ty *comp_findex; @@ -687,7 +694,8 @@ struct definitions_ty Needed for OpenMP. */ gl_lock_define(, comp_findex_init_lock) - /* The canonical encoding of the compendiums. */ + /* The canonical encoding of the definitions and the compendiums. + Only used for fuzzy matching. */ const char *canon_charset; }; @@ -698,6 +706,8 @@ definitions_init (definitions_ty *definitions, const char *canon_charset) message_list_list_append (definitions->lists, NULL); if (compendiums != NULL) message_list_list_append_list (definitions->lists, compendiums); + definitions->curr_findex = NULL; + gl_lock_init (definitions->curr_findex_init_lock); definitions->comp_findex = NULL; gl_lock_init (definitions->comp_findex_init_lock); definitions->canon_charset = canon_charset; @@ -715,9 +725,28 @@ static inline void definitions_set_current_list (definitions_ty *definitions, message_list_ty *mlp) { definitions->lists->item[0] = mlp; + if (definitions->curr_findex != NULL) + { + message_fuzzy_index_free (definitions->curr_findex); + definitions->curr_findex = NULL; + } } -/* Create the fuzzy index. +/* Create the fuzzy index for the current list of non-compendium messages. 
+ Used only if use_fuzzy_matching is true. */ +static inline void +definitions_init_curr_findex (definitions_ty *definitions) +{ + /* Protect against concurrent execution. */ + gl_lock_lock (definitions->curr_findex_init_lock); + if (definitions->curr_findex == NULL) + definitions->curr_findex = + message_fuzzy_index_alloc (definitions_current_list (definitions), + definitions->canon_charset); + gl_lock_unlock (definitions->curr_findex_init_lock); +} + +/* Create the fuzzy index for the compendium messages. Used only if use_fuzzy_matching is true and compendiums != NULL. */ static inline void definitions_init_comp_findex (definitions_ty *definitions) @@ -762,9 +791,27 @@ static inline message_ty * definitions_search_fuzzy (definitions_ty *definitions, const char *msgctxt, const char *msgid) { - message_ty *mp1 = - message_list_search_fuzzy (definitions_current_list (definitions), - msgctxt, msgid); + message_ty *mp1; + + if (false) + { + /* Old, slow code. */ + mp1 = + message_list_search_fuzzy (definitions_current_list (definitions), + msgctxt, msgid); + } + else + { + /* Speedup through early abort in fstrcmp(), combined with pre-sorting + of the messages through a hashed index. */ + /* Create the fuzzy index lazily. */ + if (definitions->curr_findex == NULL) + definitions_init_curr_findex (definitions); + mp1 = message_fuzzy_index_search (definitions->curr_findex, + msgctxt, msgid, + FUZZY_THRESHOLD, false); + } + if (compendiums != NULL) { double lower_bound_for_mp2; @@ -784,7 +831,7 @@ definitions_search_fuzzy (definitions_ty *definitions, mp2 = message_fuzzy_index_search (definitions->comp_findex, msgctxt, msgid, - lower_bound_for_mp2); + lower_bound_for_mp2, true); /* Choose the best among mp1, mp2. 
*/ if (mp1 == NULL @@ -802,6 +849,8 @@ static inline void definitions_destroy (definitions_ty *definitions) { message_list_list_free (definitions->lists, 2); + if (definitions->curr_findex != NULL) + message_fuzzy_index_free (definitions->curr_findex); if (definitions->comp_findex != NULL) message_fuzzy_index_free (definitions->comp_findex); } @@ -1586,6 +1635,7 @@ merge (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax, unsigned int processed; struct statistics stats; msgdomain_list_ty *result; + const char *def_canon_charset; definitions_ty definitions; message_list_ty *empty_list; @@ -1795,8 +1845,57 @@ merge (const char *fn1, const char *fn2, catalog_input_format_ty input_syntax, } } + /* Determine canonicalized encoding name of the definitions now, after + conversion. Only used for fuzzy matching. */ + if (use_fuzzy_matching) + { + def_canon_charset = def->encoding; + if (def_canon_charset == NULL) + { + char *charset = NULL; + + /* Get the encoding of the definitions file. */ + for (k = 0; k < def->nitems; k++) + { + message_list_ty *mlp = def->item[k]->messages; + + for (j = 0; j < mlp->nitems; j++) + if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete) + { + const char *header = mlp->item[j]->msgstr; + + if (header != NULL) + { + const char *charsetstr = c_strstr (header, "charset="); + + if (charsetstr != NULL) + { + size_t len; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + charset = (char *) xmalloca (len + 1); + memcpy (charset, charsetstr, len); + charset[len] = '\0'; + break; + } + } + } + if (charset != NULL) + break; + } + if (charset != NULL) + def_canon_charset = po_charset_canonicalize (charset); + if (def_canon_charset == NULL) + /* Unspecified encoding. Assume unibyte encoding. */ + def_canon_charset = po_charset_ascii; + } + } + else + def_canon_charset = NULL; + /* Initialize and preprocess the total set of message definitions. 
*/ - definitions_init (&definitions, po_charset_utf8); + definitions_init (&definitions, def_canon_charset); empty_list = message_list_alloc (false); result = msgdomain_list_alloc (false);