#include "die.h"
#include "error.h"
#include "fadvise.h"
-#include "hard-locale.h"
#include "posixver.h"
#include "stdio--.h"
-#include "xmemcoll.h"
#include "xstrtol.h"
#include "memcasecmp.h"
#include "quote.h"
} \
while (0)
-/* True if the LC_COLLATE locale is hard. */
-static bool hard_LC_COLLATE;
-
/* Number of fields to skip on each line when doing comparisons. */
static size_t skip_fields;
\n\
Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
-Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
"), stdout);
emit_ancillary_info (PROGRAM_NAME);
}
newlen = check_chars;
if (ignore_case)
- {
- /* FIXME: This should invoke strcoll somehow. */
- return oldlen != newlen || memcasecmp (old, new, oldlen);
- }
- else if (hard_LC_COLLATE)
- return xmemcoll (old, oldlen, new, newlen) != 0;
+ return oldlen != newlen || memcasecmp (old, new, oldlen);
else
return oldlen != newlen || memcmp (old, new, oldlen);
}
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
- hard_LC_COLLATE = hard_locale (LC_COLLATE);
atexit (close_stdout);
--- /dev/null
+#!/bin/sh
+# Before coreutils-8.32, uniq would not distinguish
+# items which compared equal with strcoll().
+# So ensure we avoid strcoll() for the following cases.
+
+# Copyright (C) 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+# Write printf-formatted test data to the file 'in'; the arguments are
+# passed straight to printf, which is run with LC_ALL=$LOCALE_FR_UTF8.
+# Calls framework_failure_ if printf (or the redirection) fails.
+gen_input()
+{
+ env LC_ALL=$LOCALE_FR_UTF8 printf "$@" > in || framework_failure_
+}
+
+# strcoll() used to return 0 comparing the following strings,
+# which was fixed somewhere between glibc-2.22 and glibc-2.30.
+gen_input '%s\n' 'ⁿᵘˡˡ' 'ܥܝܪܐܩ'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+# Normalization in strcoll is inconsistent across platforms.
+# glibc based systems at least do _not_ normalize in strcoll,
+# while cygwin systems for example may do so.
+# Composed and decomposed á are generally not compared equal.
+gen_input '\u00E1\na\u0301\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+# Similarly with the following equivalent hangul characters.
+gen_input '\uAC01\n\u1100\u1161\u11A8\n'
+test $(LC_ALL=ko_KR.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note if running in the wrong locale,
+# strcoll may indicate the strings match when they don't.
+# I.e., CJK and hangul will now work even if
+# uniq is running in the wrong locale.
+# hangul (ko_KR.utf8)
+gen_input '\uAC00\n\uAC01\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+# CJK (zh_CN.utf8)
+gen_input '\u3400\n\u3401\n'
+test $(LC_ALL=en_US.utf8 uniq < in | wc -l) = 2 || fail=1
+
+# Note strcoll() ignores certain characters,
+# but not if the strings are otherwise equal.
+# I.e., the following on glibc-2.30 at least,
+# as expected, does not print a single item,
+# but testing here for illustration.
+gen_input ',a\n.a\n'
+test $(LC_ALL=$LOCALE_FR_UTF8 uniq < in | wc -l) = 2 || fail=1
+
+Exit $fail