Improve strcoll with strdiff.

author Leonhard Holz <leonhard.holz@web.de>

Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)

committer Ondřej Bílka <neleai@seznam.cz>

Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)
author Leonhard Holz <leonhard.holz@web.de>
Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)
committer Ondřej Bílka <neleai@seznam.cz>
Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)
diff --git a/ChangeLog b/ChangeLog

index c998dd59b87fbdfa27b765f052592c0fe89160f4..d78c626b257f5535bb6ae435446d6c21fbee774f 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2015-05-12  Leonhard Holz <leonhard.holz@web.de>
+
+       * locale/categories.def: Define _NL_COLLATE_ENCODING_TYPE.
+       * locale/langinfo.h: Add _NL_COLLATE_ENCODING_TYPE to attribute list.
+       * locale/localeinfo.h: Add enum collation_encoding_type.
+       * locale/C-collate.c: Set _NL_COLLATE_ENCODING_TYPE to 8bit.
+       * locale/programs/ld-collate.c (collate_output): Add encoding type info.
+       * string/strcoll_l.c (STRDIFF): New function.
+       (STRCOLL): Use STRDIFF to skip over equal prefix.
+       * wcsmbs/wcscoll_l.c: Define STRDIFF.
+
  2015-05-11  Joseph Myers  <joseph@codesourcery.com>
  
         [BZ #18397]
diff --git a/locale/C-collate.c b/locale/C-collate.c

index 06dfdfaad52f537be07b1f708bc18ace5b5ae052..d7f3c550a580d469f44c95124d3c955fb6f22c12 100644 (file)
--- a/locale/C-collate.c
+++ b/locale/C-collate.c
@@ -144,6 +144,8 @@ const struct __locale_data _nl_C_LC_COLLATE attribute_hidden =
      /* _NL_COLLATE_COLLSEQWC */
      { .string = (const char *) collseqwc },
      /* _NL_COLLATE_CODESET */
-    { .string = _nl_C_codeset }
+    { .string = _nl_C_codeset },
+    /* _NL_COLLATE_ENCODING_TYPE */
+    { .word = __cet_8bit }
    }
  };
diff --git a/locale/categories.def b/locale/categories.def

index a8dda53007836f88f71353bcba7abaadb90d6469..045489d7412ea93ed31fe8f2485e5ffdde590cbd 100644 (file)
--- a/locale/categories.def
+++ b/locale/categories.def
@@ -58,6 +58,7 @@ DEFINE_CATEGORY
    DEFINE_ELEMENT (_NL_COLLATE_COLLSEQMB,        "collate-collseqmb",        std, wstring)
    DEFINE_ELEMENT (_NL_COLLATE_COLLSEQWC,        "collate-collseqwc",        std, wstring)
    DEFINE_ELEMENT (_NL_COLLATE_CODESET,         "collate-codeset",          std, string)
+  DEFINE_ELEMENT (_NL_COLLATE_ENCODING_TYPE,   "collate-encoding-type",    std, word)
    ), NO_POSTLOAD)
  
  
diff --git a/locale/langinfo.h b/locale/langinfo.h

index a565d9d120e7f871458915f772cbd1d408c489ba..ffc5c7f471ca64b6a85fb925d0f0999e0e43e4e5 100644 (file)
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@@ -255,6 +255,7 @@ enum
    _NL_COLLATE_COLLSEQMB,
    _NL_COLLATE_COLLSEQWC,
    _NL_COLLATE_CODESET,
+  _NL_COLLATE_ENCODING_TYPE,
    _NL_NUM_LC_COLLATE,
  
    /* LC_CTYPE category: character classification.
diff --git a/locale/localeinfo.h b/locale/localeinfo.h

index 1d2ee00876964396a046e4d3b942b5726a883aad..bdab9fe74551c54a50b3d9d93517b2617e62ddff 100644 (file)
--- a/locale/localeinfo.h
+++ b/locale/localeinfo.h
@@ -110,6 +110,14 @@ enum coll_sort_rule
    sort_mask
  };
  
+/* Collation encoding type, stored as _NL_COLLATE_ENCODING_TYPE.  */
+enum collation_encoding_type
+{
+  __cet_other,  /* Neither of the following.  */
+  __cet_8bit,   /* Charmap with mb_cur_max == 1; also the C locale.  */
+  __cet_utf8    /* Charmap whose code set name is "UTF-8".  */
+};
+
  /* We can map the types of the entries into a few categories.  */
  enum value_type
  {
diff --git a/locale/programs/ld-collate.c b/locale/programs/ld-collate.c

index dc0fe30a1303eb9fa47b109068827f975f31aca2..a39a94f2cc3508b42308c3b11eb116ef582e4a5b 100644 (file)
--- a/locale/programs/ld-collate.c
+++ b/locale/programs/ld-collate.c
@@ -32,6 +32,7 @@
  #include "linereader.h"
  #include "locfile.h"
  #include "elem-hash.h"
+#include "../localeinfo.h"
  
  /* Uncomment the following line in the production version.  */
  /* #define NDEBUG 1 */
@@ -2130,6 +2131,8 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
           /* The words have to be handled specially.  */
           if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
             add_locale_uint32 (&file, 0);
+         else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE))
+           add_locale_uint32 (&file, __cet_other);
           else
             add_locale_empty (&file);
         }
@@ -2493,6 +2496,12 @@ collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
    add_locale_raw_data (&file, collate->mbseqorder, 256);
    add_locale_collseq_table (&file, &collate->wcseqorder);
    add_locale_string (&file, charmap->code_set_name);
+  if (strcmp (charmap->code_set_name, "UTF-8") == 0)
+    add_locale_uint32 (&file, __cet_utf8);
+  else if (charmap->mb_cur_max == 1)
+    add_locale_uint32 (&file, __cet_8bit);
+  else
+    add_locale_uint32 (&file, __cet_other);
    write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
  
    obstack_free (&weightpool, NULL);
diff --git a/string/strcoll_l.c b/string/strcoll_l.c

index 658d5b9b906a23edcd775f85998b06b5516bf1c0..0fa005f0f9bc742713b91930daa60db9315c3576 100644 (file)
--- a/string/strcoll_l.c
+++ b/string/strcoll_l.c
@@ -29,6 +29,7 @@
  # define STRING_TYPE char
  # define USTRING_TYPE unsigned char
  # define STRCOLL __strcoll_l
+# define STRDIFF __strdiff
  # define STRCMP strcmp
  # define WEIGHT_H "../locale/weight.h"
  # define SUFFIX        MB
@@ -41,6 +42,20 @@
  #include "../locale/localeinfo.h"
  #include WEIGHT_H
  
+#define MASK_UTF8_7BIT  (1 << 7)  /* High bit: set in every non-ASCII byte.  */
+#define MASK_UTF8_START (3 << 6)  /* Top two bits: both set in a lead byte.  */
+
+size_t
+STRDIFF (const STRING_TYPE *s, const STRING_TYPE *t)
+{
+  /* Return how many leading elements S and T have in common; the scan
+     ends at S's terminator or at the first element that differs.  */
+  size_t n = 0;
+  while (s[n] != '\0' && s[n] == t[n])
+    ++n;
+  return n;
+}
+
  /* Track status while looking for sequences in a string.  */
  typedef struct
  {
@@ -255,9 +270,29 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
    const USTRING_TYPE *extra;
    const int32_t *indirect;
  
+  /* In case there is no locale specific sort order (C / POSIX).  */
    if (nrules == 0)
      return STRCMP (s1, s2);
  
+  /* Fast forward to the position of the first difference.  Needs to be
+     encoding aware as the byte-by-byte comparison can stop in the middle
+     of a char sequence for multibyte encodings like UTF-8.  */
+  uint_fast32_t encoding =
+    current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
+  if (encoding != __cet_other)
+    {
+      size_t diff = STRDIFF (s1, s2);
+      if (diff > 0)
+       {
+         if (encoding == __cet_utf8 && (*(s1 + diff) & MASK_UTF8_7BIT) != 0)
+           do
+             diff--;
+           while (diff > 0 && (*(s1 + diff) & MASK_UTF8_START) != MASK_UTF8_START);
+         s1 += diff;
+         s2 += diff;
+       }
+    }
+
    /* Catch empty strings.  */
    if (__glibc_unlikely (*s1 == '\0') || __glibc_unlikely (*s2 == '\0'))
      return (*s1 != '\0') - (*s2 != '\0');
@@ -321,7 +356,8 @@ STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
                      byte-level comparison to ensure that we don't waste time
                      going through multiple passes for totally equal strings
                      before proceeding to subsequent passes.  */
-                 if (pass == 0 && STRCMP (s1, s2) == 0)
+                 if (pass == 0 && encoding == __cet_other &&
+                     STRCMP (s1, s2) == 0)
                     return result;
                   else
                     break;
diff --git a/wcsmbs/wcscoll_l.c b/wcsmbs/wcscoll_l.c

index 106ec93511b54d70e3eb1f8ce57ec861f0a744da..9f60cee9ea0e9333a3cf6d6f44fcd05dadfc3a31 100644 (file)
--- a/wcsmbs/wcscoll_l.c
+++ b/wcsmbs/wcscoll_l.c
@@ -23,6 +23,7 @@
  #define STRING_TYPE wchar_t
  #define USTRING_TYPE wint_t
  #define STRCOLL __wcscoll_l
+#define STRDIFF __wcsdiff
  #define STRCMP wcscmp
  #define WEIGHT_H "../locale/weightwc.h"
  #define SUFFIX WC
author	Leonhard Holz <leonhard.holz@web.de>
	Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)
committer	Ondřej Bílka <neleai@seznam.cz>
	Tue, 12 May 2015 09:37:52 +0000 (11:37 +0200)
ChangeLog		patch \| blob \| blame \| history
locale/C-collate.c		patch \| blob \| blame \| history
locale/categories.def		patch \| blob \| blame \| history
locale/langinfo.h		patch \| blob \| blame \| history
locale/localeinfo.h		patch \| blob \| blame \| history
locale/programs/ld-collate.c		patch \| blob \| blame \| history
string/strcoll_l.c		patch \| blob \| blame \| history
wcsmbs/wcscoll_l.c		patch \| blob \| blame \| history