uniq: avoid strcoll() to improve performance and consistency

author Pádraig Brady <P@draigBrady.com>

Sun, 23 Feb 2020 13:20:08 +0000 (13:20 +0000)

committer Pádraig Brady <P@draigBrady.com>

Mon, 24 Feb 2020 10:12:42 +0000 (10:12 +0000)
author Pádraig Brady <P@draigBrady.com>
Sun, 23 Feb 2020 13:20:08 +0000 (13:20 +0000)
committer Pádraig Brady <P@draigBrady.com>
Mon, 24 Feb 2020 10:12:42 +0000 (10:12 +0000)
diff --git a/NEWS b/NEWS

index 8a349634e7189087cffdf06aa34099c2ca152888..6afb9cb6d1ccefb9d8f45f045df85c94f47aa111 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -65,6 +65,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    [The old behavior was introduced in sh-utils 2.0.15 ca. 1999, predating
    coreutils package.]
  
+  uniq no longer uses strcoll() to determine string equivalence,
+  and so will operate more efficiently and consistently.
+
  ** New Features
  
    ls now supports the --time=birth option to display and sort by
diff --git a/src/uniq.c b/src/uniq.c

index 0fcf50a16bb862a8a3c9ff4b5f633308967acc2a..e0247579b69c55803f6470644b31e700743854b0 100644 (file)
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -27,10 +27,8 @@
  #include "die.h"
  #include "error.h"
  #include "fadvise.h"
-#include "hard-locale.h"
  #include "posixver.h"
  #include "stdio--.h"
-#include "xmemcoll.h"
  #include "xstrtol.h"
  #include "memcasecmp.h"
  #include "quote.h"
@@ -52,9 +50,6 @@
      }                                          \
    while (0)
  
-/* True if the LC_COLLATE locale is hard.  */
-static bool hard_LC_COLLATE;
-
  /* Number of fields to skip on each line when doing comparisons. */
  static size_t skip_fields;
  
@@ -220,7 +215,6 @@ characters.  Fields are skipped before chars.\n\
  \n\
  Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
  You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
-Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
  "), stdout);
        emit_ancillary_info (PROGRAM_NAME);
      }
@@ -293,12 +287,7 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
      newlen = check_chars;
  
    if (ignore_case)
-    {
-      /* FIXME: This should invoke strcoll somehow.  */
-      return oldlen != newlen || memcasecmp (old, new, oldlen);
-    }
-  else if (hard_LC_COLLATE)
-    return xmemcoll (old, oldlen, new, newlen) != 0;
+    return oldlen != newlen || memcasecmp (old, new, oldlen);
    else
      return oldlen != newlen || memcmp (old, new, oldlen);
  }
@@ -501,7 +490,6 @@ main (int argc, char **argv)
    setlocale (LC_ALL, "");
    bindtextdomain (PACKAGE, LOCALEDIR);
    textdomain (PACKAGE);
-  hard_LC_COLLATE = hard_locale (LC_COLLATE);
  
    atexit (close_stdout);
  
diff --git a/tests/local.mk b/tests/local.mk

index bbcb9d4133dfb36e939371e6185da3ded6c86ee4..0aabdaacc6148ac3af1a2a3ee082a852e343b74c 100644 (file)
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -438,6 +438,7 @@ all_tests =                                 \
    tests/misc/unexpand.pl                       \
    tests/misc/uniq.pl                           \
    tests/misc/uniq-perf.sh                      \
+  tests/misc/uniq-collate.sh                   \
    tests/misc/xattr.sh                          \
    tests/misc/yes.sh                            \
    tests/tail-2/wait.sh                         \
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh

new file mode 100755 (executable)

index 0000000..974befc
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# Before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll().
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+gen_input()  # Run printf on the given arguments with LC_ALL=$LOCALE_FR_UTF8.
+{  # Output goes to the file 'in'; framework_failure_ is called if that fails.
+  env LC_ALL=$LOCALE_FR_UTF8 printf "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 comparing the following strings
+# which was fixed somewhere between glibc-2.22 and glibc-2.30
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# normalization in strcoll is inconsistent across platforms.
+# glibc based systems at least do _not_ normalize in strcoll,
+# while cygwin systems for example may do so.
+# á composed and decomposed, are generally not compared equal
+gen_input '\u00E1\na\u0301\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with the following equivalent hangul characters
+gen_input '\uAC01\n\u1100\u1161\u11A8\n' > in || framework_failure_
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note if running in the wrong locale,
+# strcoll may indicate the strings match when they don't.
+# I.e., cjk and hangul will now work even if
+# uniq is running in the wrong locale
+# hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n' > in || framework_failure_
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following on glibc-2.30 at least,
+# as expected, does not print a single item,
+# but testing here for illustration
+gen_input ',a\n.a\n' > in || framework_failure_
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail
author	Pádraig Brady <P@draigBrady.com>
	Sun, 23 Feb 2020 13:20:08 +0000 (13:20 +0000)
committer	Pádraig Brady <P@draigBrady.com>
	Mon, 24 Feb 2020 10:12:42 +0000 (10:12 +0000)
NEWS		patch \| blob \| blame \| history
src/uniq.c		patch \| blob \| blame \| history
tests/local.mk		patch \| blob \| blame \| history
tests/misc/uniq-collate.sh	[new file with mode: 0755]	patch \| blob