rather than through message_list_search_fuzzy.
# Source dependencies.
msgcmp_SOURCES = msgcmp.c
+msgcmp_SOURCES += msgl-fsearch.c
msgfmt_SOURCES = msgfmt.c
msgfmt_SOURCES += \
write-mo.c write-java.c write-csharp.c write-resources.c write-tcl.c \
#include "read-po.h"
#include "read-properties.h"
#include "read-stringtable.h"
+#include "xmalloca.h"
+#include "po-charset.h"
#include "msgl-iconv.h"
+#include "msgl-fsearch.h"
#include "c-strstr.h"
#include "c-strcase.h"
#include "propername.h"
static void
match_domain (const char *fn1, const char *fn2,
- message_list_ty *defmlp, message_list_ty *refmlp,
+ message_list_ty *defmlp, message_fuzzy_index_ty **defmlp_findex,
+ const char *def_canon_charset,
+ message_list_ty *refmlp,
int *nerrors)
{
size_t j;
similar message, it could be a typo, or the suggestion may
help. */
(*nerrors)++;
- defmsg =
- (use_fuzzy_matching
- ? message_list_search_fuzzy (defmlp,
- refmsg->msgctxt, refmsg->msgid)
- : NULL);
+ if (use_fuzzy_matching)
+ {
+ if (false)
+ {
+ /* Old, slow code. */
+ defmsg =
+ message_list_search_fuzzy (defmlp,
+ refmsg->msgctxt, refmsg->msgid);
+ }
+ else
+ {
+ /* Speedup through early abort in fstrcmp(), combined with
+ pre-sorting of the messages through a hashed index. */
+ /* Create the fuzzy index lazily. */
+ if (*defmlp_findex == NULL)
+ *defmlp_findex =
+ message_fuzzy_index_alloc (defmlp, def_canon_charset);
+ defmsg =
+ message_fuzzy_index_search (*defmlp_findex,
+ refmsg->msgctxt, refmsg->msgid,
+ FUZZY_THRESHOLD, false);
+ }
+ }
+ else
+ defmsg = NULL;
if (defmsg)
{
po_gram_error_at_line (&refmsg->pos, _("\
msgdomain_list_ty *ref;
int nerrors;
size_t j, k;
+ const char *def_canon_charset;
message_list_ty *empty_list;
/* This is the master file, created by a human. */
def = iconv_msgdomain_list (def, "UTF-8", true, fn1);
}
+  /* Determine the canonicalized encoding name of the definitions now, after
+     conversion.  Only used for fuzzy matching; when fuzzy matching is
+     disabled, def_canon_charset is left as NULL.  */
+  if (use_fuzzy_matching)
+    {
+      /* Prefer the encoding recorded on the message domain list itself.  */
+      def_canon_charset = def->encoding;
+      if (def_canon_charset == NULL)
+        {
+          /* Temporary copy of the "charset=" value found in a header entry;
+             allocated with xmalloca(), so it must be released with freea().  */
+          char *charset = NULL;
+
+          /* Get the encoding of the definitions file: scan all domains for
+             the first non-obsolete header entry that carries "charset=".  */
+          for (k = 0; k < def->nitems; k++)
+            {
+              message_list_ty *mlp = def->item[k]->messages;
+
+              for (j = 0; j < mlp->nitems; j++)
+                if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
+                  {
+                    const char *header = mlp->item[j]->msgstr;
+
+                    if (header != NULL)
+                      {
+                        const char *charsetstr = c_strstr (header, "charset=");
+
+                        if (charsetstr != NULL)
+                          {
+                            size_t len;
+
+                            /* The value extends up to the next blank, tab or
+                               newline.  */
+                            charsetstr += strlen ("charset=");
+                            len = strcspn (charsetstr, " \t\n");
+                            charset = (char *) xmalloca (len + 1);
+                            memcpy (charset, charsetstr, len);
+                            charset[len] = '\0';
+                            break;
+                          }
+                      }
+                  }
+              /* Stop at the first domain that yielded a charset.  */
+              if (charset != NULL)
+                break;
+            }
+          if (charset != NULL)
+            {
+              def_canon_charset = po_charset_canonicalize (charset);
+              /* Release the temporary buffer; without this the xmalloca()
+                 allocation is never given back.
+                 NOTE(review): relies on po_charset_canonicalize returning a
+                 pointer that does not alias its argument - confirm.  */
+              freea (charset);
+            }
+          if (def_canon_charset == NULL)
+            /* Unspecified encoding.  Assume unibyte encoding.  */
+            def_canon_charset = po_charset_ascii;
+        }
+    }
+  else
+    def_canon_charset = NULL;
+
empty_list = message_list_alloc (false);
/* Every entry in the xgettext generated file must be matched by a
const char *domain = ref->item[k]->domain;
message_list_ty *refmlp = ref->item[k]->messages;
message_list_ty *defmlp;
+ message_fuzzy_index_ty *defmlp_findex;
defmlp = msgdomain_list_sublist (def, domain, false);
if (defmlp == NULL)
defmlp = empty_list;
- match_domain (fn1, fn2, defmlp, refmlp, &nerrors);
+ defmlp_findex = NULL;
+
+ match_domain (fn1, fn2, defmlp, &defmlp_findex, def_canon_charset,
+ refmlp, &nerrors);
+
+ if (defmlp_findex != NULL)
+ message_fuzzy_index_free (defmlp_findex);
}
else
{
/* Ignore the default message domain if it has no messages. */
if (k > 0 || defmlp->nitems > 0)
- match_domain (fn1, fn2, defmlp, refmlp, &nerrors);
+ {
+ message_fuzzy_index_ty *defmlp_findex = NULL;
+
+ match_domain (fn1, fn2, defmlp, &defmlp_findex, def_canon_charset,
+ refmlp, &nerrors);
+
+ if (defmlp_findex != NULL)
+ message_fuzzy_index_free (defmlp_findex);
+ }
}
}
/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
The match does not need to be optimal.
Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
- LOWER_BOUND must be >= FUZZY_THRESHOLD. */
+ LOWER_BOUND must be >= FUZZY_THRESHOLD.
+ If HEURISTIC is true, only the few best messages among the list - according
+ to a certain heuristic - are considered. If HEURISTIC is false, all
+ messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
+ like in message_list_search_fuzzy (except that in ambiguous cases where
+ several best matches exist, message_list_search_fuzzy chooses the one with
+ the smallest index whereas message_fuzzy_index_search makes a better
+ choice). */
message_ty *
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
const char *msgctxt, const char *msgid,
- double lower_bound)
+ double lower_bound,
+ bool heuristic)
{
const char *str = msgid;
/* Sort in decreasing count order. */
mult_index_list_sort (&accu);
- /* Take the first few messages from this sorted list, and
- maximize the fstrcmp() result. */
+ /* Iterate over this sorted list, and maximize the
+ fuzzy_search_goal_function() result.
+ If HEURISTIC is true, take only the first few messages.
+ If HEURISTIC is false, consider all messages - to match
+ the behaviour of message_list_search_fuzzy -, but process
+ them in the order of the sorted list. This increases
+ the chances that the later calls to fstrcmp_bounded() (via
+ fuzzy_search_goal_function()) terminate quickly, thanks
+ to the best_weight which will be quite high already after
+ the first few messages. */
{
size_t count;
struct mult_index *ptr;
message_ty *best_mp;
double best_weight;
- count = findex->firstfew;
- if (count > accu.nitems)
- count = accu.nitems;
+ count = accu.nitems;
+ if (heuristic)
+ {
+ if (count > findex->firstfew)
+ count = findex->firstfew;
+ }
best_weight = lower_bound;
best_mp = NULL;
#include "message.h"
+#include <stdbool.h>
+
#ifdef __cplusplus
extern "C" {
/* Find a good match for the given msgctxt and msgid in the given fuzzy index.
The match does not need to be optimal.
Ignore matches for which the fuzzy_search_goal_function is < LOWER_BOUND.
- LOWER_BOUND must be >= FUZZY_THRESHOLD. */
+ LOWER_BOUND must be >= FUZZY_THRESHOLD.
+ If HEURISTIC is true, only the few best messages among the list - according
+ to a certain heuristic - are considered. If HEURISTIC is false, all
+ messages with a fuzzy_search_goal_function > FUZZY_THRESHOLD are considered,
+ like in message_list_search_fuzzy (except that in ambiguous cases where
+ several best matches exist, message_list_search_fuzzy chooses the one with
+ the smallest index whereas message_fuzzy_index_search makes a better
+ choice). */
extern message_ty *
message_fuzzy_index_search (message_fuzzy_index_ty *findex,
const char *msgctxt, const char *msgid,
- double lower_bound);
+ double lower_bound,
+ bool heuristic);
/* Free a fuzzy index. */
extern void
for speed when doing the exact searches. */
message_list_list_ty *lists;
+ /* A fuzzy index of the current list of non-compendium messages, for speed
+ when doing fuzzy searches. Used only if use_fuzzy_matching is true. */
+ message_fuzzy_index_ty *curr_findex;
+ /* A once-only execution guard for the initialization of the fuzzy index.
+ Needed for OpenMP. */
+ gl_lock_define(, curr_findex_init_lock)
+
/* A fuzzy index of the compendiums, for speed when doing fuzzy searches.
Used only if use_fuzzy_matching is true and compendiums != NULL. */
message_fuzzy_index_ty *comp_findex;
Needed for OpenMP. */
gl_lock_define(, comp_findex_init_lock)
- /* The canonical encoding of the compendiums. */
+ /* The canonical encoding of the definitions and the compendiums.
+ Only used for fuzzy matching. */
const char *canon_charset;
};
message_list_list_append (definitions->lists, NULL);
if (compendiums != NULL)
message_list_list_append_list (definitions->lists, compendiums);
+ definitions->curr_findex = NULL;
+ gl_lock_init (definitions->curr_findex_init_lock);
definitions->comp_findex = NULL;
gl_lock_init (definitions->comp_findex_init_lock);
definitions->canon_charset = canon_charset;
definitions_set_current_list (definitions_ty *definitions, message_list_ty *mlp)
{
definitions->lists->item[0] = mlp;
+ if (definitions->curr_findex != NULL)
+ {
+ message_fuzzy_index_free (definitions->curr_findex);
+ definitions->curr_findex = NULL;
+ }
}
-/* Create the fuzzy index.
+/* Build the fuzzy index over the current list of non-compendium messages,
+   unless one already exists.
+   Used only if use_fuzzy_matching is true.  */
+static inline void
+definitions_init_curr_findex (definitions_ty *definitions)
+{
+  /* Hold the init lock across both the check and the allocation, to protect
+     against concurrent execution.  */
+  gl_lock_lock (definitions->curr_findex_init_lock);
+  if (definitions->curr_findex == NULL)
+    {
+      message_list_ty *current_mlp = definitions_current_list (definitions);
+
+      definitions->curr_findex =
+        message_fuzzy_index_alloc (current_mlp, definitions->canon_charset);
+    }
+  gl_lock_unlock (definitions->curr_findex_init_lock);
+}
+
+/* Create the fuzzy index for the compendium messages.
Used only if use_fuzzy_matching is true and compendiums != NULL. */
static inline void
definitions_init_comp_findex (definitions_ty *definitions)
definitions_search_fuzzy (definitions_ty *definitions,
const char *msgctxt, const char *msgid)
{
- message_ty *mp1 =
- message_list_search_fuzzy (definitions_current_list (definitions),
- msgctxt, msgid);
+ message_ty *mp1;
+
+ if (false)
+ {
+ /* Old, slow code. */
+ mp1 =
+ message_list_search_fuzzy (definitions_current_list (definitions),
+ msgctxt, msgid);
+ }
+ else
+ {
+ /* Speedup through early abort in fstrcmp(), combined with pre-sorting
+ of the messages through a hashed index. */
+ /* Create the fuzzy index lazily. */
+ if (definitions->curr_findex == NULL)
+ definitions_init_curr_findex (definitions);
+ mp1 = message_fuzzy_index_search (definitions->curr_findex,
+ msgctxt, msgid,
+ FUZZY_THRESHOLD, false);
+ }
+
if (compendiums != NULL)
{
double lower_bound_for_mp2;
mp2 = message_fuzzy_index_search (definitions->comp_findex,
msgctxt, msgid,
- lower_bound_for_mp2);
+ lower_bound_for_mp2, true);
/* Choose the best among mp1, mp2. */
if (mp1 == NULL
definitions_destroy (definitions_ty *definitions)
{
message_list_list_free (definitions->lists, 2);
+ if (definitions->curr_findex != NULL)
+ message_fuzzy_index_free (definitions->curr_findex);
if (definitions->comp_findex != NULL)
message_fuzzy_index_free (definitions->comp_findex);
}
unsigned int processed;
struct statistics stats;
msgdomain_list_ty *result;
+ const char *def_canon_charset;
definitions_ty definitions;
message_list_ty *empty_list;
}
}
+  /* Determine the canonicalized encoding name of the definitions now, after
+     conversion.  Only used for fuzzy matching; when fuzzy matching is
+     disabled, def_canon_charset is left as NULL.  */
+  if (use_fuzzy_matching)
+    {
+      /* Prefer the encoding recorded on the message domain list itself.  */
+      def_canon_charset = def->encoding;
+      if (def_canon_charset == NULL)
+        {
+          /* Temporary copy of the "charset=" value found in a header entry;
+             allocated with xmalloca(), so it must be released with freea().  */
+          char *charset = NULL;
+
+          /* Get the encoding of the definitions file: scan all domains for
+             the first non-obsolete header entry that carries "charset=".  */
+          for (k = 0; k < def->nitems; k++)
+            {
+              message_list_ty *mlp = def->item[k]->messages;
+
+              for (j = 0; j < mlp->nitems; j++)
+                if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
+                  {
+                    const char *header = mlp->item[j]->msgstr;
+
+                    if (header != NULL)
+                      {
+                        const char *charsetstr = c_strstr (header, "charset=");
+
+                        if (charsetstr != NULL)
+                          {
+                            size_t len;
+
+                            /* The value extends up to the next blank, tab or
+                               newline.  */
+                            charsetstr += strlen ("charset=");
+                            len = strcspn (charsetstr, " \t\n");
+                            charset = (char *) xmalloca (len + 1);
+                            memcpy (charset, charsetstr, len);
+                            charset[len] = '\0';
+                            break;
+                          }
+                      }
+                  }
+              /* Stop at the first domain that yielded a charset.  */
+              if (charset != NULL)
+                break;
+            }
+          if (charset != NULL)
+            {
+              def_canon_charset = po_charset_canonicalize (charset);
+              /* Release the temporary buffer; without this the xmalloca()
+                 allocation is never given back.
+                 NOTE(review): relies on po_charset_canonicalize returning a
+                 pointer that does not alias its argument - confirm.  */
+              freea (charset);
+            }
+          if (def_canon_charset == NULL)
+            /* Unspecified encoding.  Assume unibyte encoding.  */
+            def_canon_charset = po_charset_ascii;
+        }
+    }
+  else
+    def_canon_charset = NULL;
+
/* Initialize and preprocess the total set of message definitions. */
- definitions_init (&definitions, po_charset_utf8);
+ definitions_init (&definitions, def_canon_charset);
empty_list = message_list_alloc (false);
result = msgdomain_list_alloc (false);