From 8e81d44b528b0abf6b9f02a70baf47aee52e2930 Mon Sep 17 00:00:00 2001
From: =?utf8?q?P=C3=A1draig=20Brady?=
Date: Sun, 23 Feb 2020 13:20:08 +0000
Subject: [PATCH] uniq: avoid strcoll() to improve performance and consistency
strcoll() is only significant to uniq(1) if it returns 0,
and it generally only does so with buggy locales or mismatched
locales and data. Some systems may have strcoll()
return 0 for equivalent normalized Unicode forms,
but for consistency across platforms strcoll() is avoided.
The various cases are defined in the new test.
This is consistent with newer POSIX standards as discussed at:
https://www.austingroupbugs.net/view.php?id=963
* src/uniq.c: s/xstrcoll/memcmp/.
* tests/local.mk: Reference the new test.
* tests/misc/uniq-collate.sh: Add a new test.
* NEWS: Mention the change in behavior.
Fixes https://bugs.gnu.org/38627
---
NEWS | 3 ++
src/uniq.c | 14 +--------
tests/local.mk | 1 +
tests/misc/uniq-collate.sh | 63 ++++++++++++++++++++++++++++++++++++++
4 files changed, 68 insertions(+), 13 deletions(-)
create mode 100755 tests/misc/uniq-collate.sh
diff --git a/NEWS b/NEWS
index 8a349634e7..6afb9cb6d1 100644
--- a/NEWS
+++ b/NEWS
@@ -65,6 +65,9 @@ GNU coreutils NEWS -*- outline -*-
[The old behavior was introduced in sh-utils 2.0.15 ca. 1999, predating
coreutils package.]
+ uniq no longer uses strcoll() to determine string equivalence,
+ and so will operate more efficiently and consistently.
+
** New Features
ls now supports the --time=birth option to display and sort by
diff --git a/src/uniq.c b/src/uniq.c
index 0fcf50a16b..e0247579b6 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -27,10 +27,8 @@
#include "die.h"
#include "error.h"
#include "fadvise.h"
-#include "hard-locale.h"
#include "posixver.h"
#include "stdio--.h"
-#include "xmemcoll.h"
#include "xstrtol.h"
#include "memcasecmp.h"
#include "quote.h"
@@ -52,9 +50,6 @@
} \
while (0)
-/* True if the LC_COLLATE locale is hard. */
-static bool hard_LC_COLLATE;
-
/* Number of fields to skip on each line when doing comparisons. */
static size_t skip_fields;
@@ -220,7 +215,6 @@ characters. Fields are skipped before chars.\n\
\n\
Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
-Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
"), stdout);
emit_ancillary_info (PROGRAM_NAME);
}
@@ -293,12 +287,7 @@ different (char *old, char *new, size_t oldlen, size_t newlen)
newlen = check_chars;
if (ignore_case)
- {
- /* FIXME: This should invoke strcoll somehow. */
- return oldlen != newlen || memcasecmp (old, new, oldlen);
- }
- else if (hard_LC_COLLATE)
- return xmemcoll (old, oldlen, new, newlen) != 0;
+ return oldlen != newlen || memcasecmp (old, new, oldlen);
else
return oldlen != newlen || memcmp (old, new, oldlen);
}
@@ -501,7 +490,6 @@ main (int argc, char **argv)
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
- hard_LC_COLLATE = hard_locale (LC_COLLATE);
atexit (close_stdout);
diff --git a/tests/local.mk b/tests/local.mk
index bbcb9d4133..0aabdaacc6 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -438,6 +438,7 @@ all_tests = \
tests/misc/unexpand.pl \
tests/misc/uniq.pl \
tests/misc/uniq-perf.sh \
+ tests/misc/uniq-collate.sh \
tests/misc/xattr.sh \
tests/misc/yes.sh \
tests/tail-2/wait.sh \
diff --git a/tests/misc/uniq-collate.sh b/tests/misc/uniq-collate.sh
new file mode 100755
index 0000000000..974befce22
--- /dev/null
+++ b/tests/misc/uniq-collate.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# Before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll().
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.