join,uniq: support multi-byte separators

author Paul Eggert <eggert@cs.ucla.edu>

Mon, 30 Oct 2023 07:32:51 +0000 (00:32 -0700)

committer Paul Eggert <eggert@cs.ucla.edu>

Mon, 30 Oct 2023 07:58:04 +0000 (00:58 -0700)
author Paul Eggert <eggert@cs.ucla.edu>
Mon, 30 Oct 2023 07:32:51 +0000 (00:32 -0700)
committer Paul Eggert <eggert@cs.ucla.edu>
Mon, 30 Oct 2023 07:58:04 +0000 (00:58 -0700)
diff --git a/NEWS b/NEWS

index 3021211dcfb3896afa9933d802655d747611ca25..b1088f68335bd36ffc5b6b0bd6ddab813cb65080 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,11 @@ GNU coreutils NEWS                                    -*- outline -*-
    to preserve ownership" when copying to GNU/Linux CIFS file systems.
    They do this by working around some Linux CIFS bugs.
  
+  join and uniq now support multi-byte characters better.
+  For example, 'join -tX' now works even if X is a multi-byte character,
+  and both programs now treat multi-byte characters like U+3000
+  IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
+
    numfmt options like --suffix no longer have an arbitrary 127-byte limit.
    [bug introduced with numfmt in coreutils-8.21]
  
diff --git a/bootstrap.conf b/bootstrap.conf

index 4724544d76186faff6cb057d270a2ee06b195db0..97645d6f0a8c5a3e534ac92b1485cb60dbe0822c 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -70,7 +70,6 @@ gnulib_modules="
    crypto/sha256
    crypto/sha512
    crypto/sm3
-  cu-ctype
    cycle-check
    d-ino
    d-type
@@ -241,6 +240,7 @@ gnulib_modules="
    settime
    sig2str
    sigaction
+  skipchars
    smack
    ssize_t
    stat-macros
diff --git a/gl/lib/cu-ctype.c b/gl/lib/cu-ctype.c

deleted file mode 100644 (file)

index 9f753de..0000000
--- a/gl/lib/cu-ctype.c
+++ /dev/null
@@ -1,3 +0,0 @@
-#include <config.h>
-#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
-#include <cu-ctype.h>
diff --git a/gl/lib/cu-ctype.h b/gl/lib/cu-ctype.h

deleted file mode 100644 (file)

index 82f1d73..0000000
--- a/gl/lib/cu-ctype.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* Character type definitions for coreutils
-
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-#include <ctype.h>
-
-#ifndef _GL_INLINE_HEADER_BEGIN
-# error "Please include config.h first."
-#endif
-_GL_INLINE_HEADER_BEGIN
-#ifndef CU_CTYPE_INLINE
-# define CU_CTYPE_INLINE _GL_INLINE
-#endif
-
-/* '\n' is considered a field separator with  --zero-terminated.  */
-CU_CTYPE_INLINE bool
-field_sep (unsigned char ch)
-{
-  return isblank (ch) || ch == '\n';
-}
-
-_GL_INLINE_HEADER_END
diff --git a/gl/lib/skipchars.c b/gl/lib/skipchars.c

new file mode 100644 (file)

index 0000000..827c89d
--- /dev/null
+++ b/gl/lib/skipchars.c
@@ -0,0 +1,3 @@
+#include <config.h>
+#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
+#include <skipchars.h>
diff --git a/gl/lib/skipchars.h b/gl/lib/skipchars.h

new file mode 100644 (file)

index 0000000..baa9eab
--- /dev/null
+++ b/gl/lib/skipchars.h
@@ -0,0 +1,56 @@
+/* Skipping sequences of characters satisfying a predicate
+
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include "mcel.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef SKIPCHARS_INLINE
+# define SKIPCHARS_INLINE _GL_INLINE
+#endif
+
+/* Scan the NUL-terminated string STR, one possibly multi-byte character
+   or encoding error G at a time, for as long as PREDICATE (G) == OK.
+   Return the address of the first G for which PREDICATE (G) != OK, or
+   of the terminating NUL if there is no such G.  */
+
+SKIPCHARS_INLINE char *
+skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = str;
+  for (mcel_t g; *s && predicate (g = mcel_scanz (s)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+/* Scan the buffer that starts at BUF and ends just before LIM, one
+   possibly multi-byte character or encoding error G at a time, for as
+   long as PREDICATE (G) == OK.  Return the address of the first G for
+   which PREDICATE (G) != OK.  The loop is bounded by LIM, not by a NUL.  */
+
+SKIPCHARS_INLINE char *
+skip_buf_matching (char const *buf, char const *lim,
+                   bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = buf;
+  for (mcel_t g; s < lim && predicate (g = mcel_scan (s, lim)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+_GL_INLINE_HEADER_END
diff --git a/gl/modules/cu-ctype b/gl/modules/cu-ctype

deleted file mode 100644 (file)

index bd328b3..0000000
--- a/gl/modules/cu-ctype
+++ /dev/null
@@ -1,24 +0,0 @@
-Description:
-ctype.h-like definitions for coreutils
-
-Files:
-lib/cu-ctype.c
-lib/cu-ctype.h
-
-Depends-on:
-ctype
-extern-inline
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += cu-ctype.c
-
-Include:
-"cu-ctype.h"
-
-License:
-GPL
-
-Maintainer:
-all
diff --git a/gl/modules/skipchars b/gl/modules/skipchars

new file mode 100644 (file)

index 0000000..3b25fd6
--- /dev/null
+++ b/gl/modules/skipchars
@@ -0,0 +1,24 @@
+Description:
+Skip sequences of multi-byte characters or encoding errors
+
+Files:
+lib/skipchars.c
+lib/skipchars.h
+
+Depends-on:
+extern-inline
+mcel
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += skipchars.c
+
+Include:
+"skipchars.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/join.c b/src/join.c

index b95cf2b9be9198766abf0c86a0fd21a203ad95e0..b3ad27465992ec5ccdde2d2d2b96dc74b77c14ab 100644 (file)
--- a/src/join.c
+++ b/src/join.c
@@ -23,12 +23,13 @@
  
  #include "system.h"
  #include "assure.h"
-#include "cu-ctype.h"
  #include "fadvise.h"
  #include "hard-locale.h"
  #include "linebuffer.h"
+#include "mcel.h"
  #include "memcasecmp.h"
  #include "quote.h"
+#include "skipchars.h"
  #include "stdio--.h"
  #include "xmemcoll.h"
  #include "xstrtol.h"
@@ -135,10 +136,14 @@ static struct outlist outlist_head;
  /* Last element in 'outlist', where a new element can be added.  */
  static struct outlist *outlist_end = &outlist_head;
  
-/* Tab character separating fields.  If negative, fields are separated
-   by any nonempty string of blanks, otherwise by exactly one
-   tab character whose value (when cast to unsigned char) equals TAB.  */
-static int tab = -1;
+/* Tab character (or encoding error) separating fields.  If TAB.len == 0,
+   fields are separated by any nonempty string of blanks, otherwise by
+   exactly one tab character (or encoding error) equal to TAB.  */
+static mcel_t tab;
+
+/* The output separator to use, and its length in bytes.  */
+static char const *output_separator = " ";
+static idx_t output_seplen = 1;
  
  /* If nonzero, check that the input is correctly ordered. */
  static enum
@@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
    ++(line->nfields);
  }
  
+static bool
+eq_tab (mcel_t g)
+{
+  return mcel_cmp (g, tab) == 0;
+}
+
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
  /* Fill in the 'fields' structure in LINE.  */
  
  static void
@@ -278,34 +295,29 @@ xfields (struct line *line)
    if (ptr == lim)
      return;
  
-  if (0 <= tab && tab != '\n')
-    {
-      char *sep;
-      for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
-        extract_field (line, ptr, sep - ptr);
-    }
-  else if (tab < 0)
+  if (!tab.len)
      {
-      /* Skip leading blanks before the first field.  */
-      while (field_sep (*ptr))
-        if (++ptr == lim)
-          return;
-
-      do
+      while (ptr < lim)
          {
-          char *sep;
-          for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
-            continue;
+          ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
+          if (ptr == lim)  /* LINE is bounded by LIM, not NUL-terminated.  */
+            break;
+          char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
            extract_field (line, ptr, sep - ptr);
-          if (sep == lim)
-            return;
-          for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
-            continue;
+          ptr = sep;
          }
-      while (ptr != lim);
      }
+  else
+    {
+      if (tab.ch != '\n')
+        for (char *sep;
+             ((sep = skip_buf_matching (ptr, lim, eq_tab, false))
+              < lim);
+             ptr = sep + mcel_scan (sep, lim).len)
+          extract_field (line, ptr, sep - ptr);
  
-  extract_field (line, ptr, lim - ptr);
+      extract_field (line, ptr, lim - ptr);
+    }
  }
  
  static void
@@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
  {
    idx_t i;
    idx_t nfields = autoformat ? autocount : line->nfields;
-  char output_separator = tab < 0 ? ' ' : tab;
  
    for (i = 0; i < join_field && i < nfields; ++i)
      {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
        prfield (i, line);
      }
    for (i = join_field + 1; i < nfields; ++i)
      {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
        prfield (i, line);
      }
  }
@@ -588,7 +599,6 @@ static void
  prjoin (struct line const *line1, struct line const *line2)
  {
    const struct outlist *outlist;
-  char output_separator = tab < 0 ? ' ' : tab;
    idx_t field;
    struct line const *line;
  
@@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
            o = o->next;
            if (o == nullptr)
              break;
-          putchar (output_separator);
+          fwrite (output_separator, 1, output_seplen, stdout);
          }
        putchar (eolchar);
      }
@@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
      }
  }
  
+static bool
+comma_or_blank (mcel_t g)
+{
+  return g.ch == ',' || c32isblank (g.ch);
+}
+
  /* Add the comma or blank separated field spec(s) in STR to 'outlist'.  */
  
  static void
@@ -898,14 +914,17 @@ add_field_list (char *str)
        int file_index;
        idx_t field_index;
        char const *spec_item = p;
-
-      p = strpbrk (p, ", \t");
-      if (p)
-        *p++ = '\0';
+      p = skip_str_matching (spec_item, comma_or_blank, false);
+      if (*p)
+        {
+          mcel_t g = mcel_scanz (p);
+          *p = '\0';
+          p += g.len;
+        }
        decode_field_spec (spec_item, &file_index, &field_index);
        add_field (file_index, field_index);
      }
-  while (p);
+  while (*p);
  }
  
  /* Set the join field *VAR to VAL, but report an error if *VAR is set
@@ -1087,20 +1106,30 @@ main (int argc, char **argv)
  
          case 't':
            {
-            unsigned char newtab = optarg[0];
-            if (! newtab)
-              newtab = '\n'; /* '' => process the whole line.  */
-            else if (optarg[1])
+            mcel_t newtab;
+            if (!*optarg)
+              {
+                /* '' => process the whole line.  */
+                newtab = mcel_ch ('\n', 1);
+                /* output_separator does not matter.  */
+              }
+            else if (STREQ (optarg, "\\0"))
+              {
+                newtab = mcel_ch ('\0', 1);
+                output_separator = "";
+              }
+            else
                {
-                if (STREQ (optarg, "\\0"))
-                  newtab = '\0';
-                else
+                newtab = mcel_scanz (optarg);
+                if (optarg[newtab.len])
                    error (EXIT_FAILURE, 0, _("multi-character tab %s"),
                           quote (optarg));
+                output_separator = optarg;
                }
-            if (0 <= tab && tab != newtab)
+            if (tab.len && mcel_cmp (tab, newtab) != 0)
                error (EXIT_FAILURE, 0, _("incompatible tabs"));
              tab = newtab;
+            output_seplen = newtab.len;
            }
            break;
  
diff --git a/src/numfmt.c b/src/numfmt.c

index 2ce70226cffdafe71cbd541b5b28348b190cca0c..7b53c87e4c7203698344e3bcd23fb2933e74b958 100644 (file)
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -15,6 +15,7 @@
     along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  
  #include <config.h>
+#include <ctype.h>
  #include <float.h>
  #include <getopt.h>
  #include <stdckdint.h>
@@ -24,9 +25,9 @@
  
  #include "argmatch.h"
  #include "c-ctype.h"
-#include "cu-ctype.h"
  #include "mbswidth.h"
  #include "quote.h"
+#include "skipchars.h"
  #include "system.h"
  #include "xstrtol.h"
  
@@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result,
    return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
  }
  
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
  /* Return a pointer to the beginning of the next field in line.
     The line pointer is moved to the end of the next field. */
  static char*
@@ -1334,11 +1341,8 @@ next_field (char **line)
    else
      {
        /* keep any space prefix in the returned field */
-      while (*field_end && field_sep (*field_end))
-        ++field_end;
-
-      while (*field_end && ! field_sep (*field_end))
-        ++field_end;
+      field_end = skip_str_matching (field_end, newline_or_blank, true);
+      field_end = skip_str_matching (field_end, newline_or_blank, false);
      }
  
    *line = field_end;
diff --git a/src/sort.c b/src/sort.c

index 6856e6151443fa5d75999f012492416640da494b..829b17f4236a8e091ffbc7d40c2ffcf969911340 100644 (file)
--- a/src/sort.c
+++ b/src/sort.c
@@ -22,6 +22,7 @@
  
  #include <config.h>
  
+#include <ctype.h>
  #include <getopt.h>
  #include <pthread.h>
  #include <sys/resource.h>
@@ -31,7 +32,6 @@
  #include "system.h"
  #include "argmatch.h"
  #include "assure.h"
-#include "cu-ctype.h"
  #include "fadvise.h"
  #include "filevercmp.h"
  #include "flexmember.h"
@@ -1293,9 +1293,9 @@ inittables (void)
  
    for (i = 0; i < UCHAR_LIM; ++i)
      {
-      blanks[i] = field_sep (i);
+      blanks[i] = i == '\n' || isblank (i);
+      nondictionary[i] = ! blanks[i] && ! isalnum (i);
        nonprinting[i] = ! isprint (i);
-      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
        fold_toupper[i] = toupper (i);
      }
  
diff --git a/src/uniq.c b/src/uniq.c

index 7e177ac5a3639f7eb3fe4fc364df0aabe6cf3278..7dc0c999a5847a4d8f2374f595fdc2ff25228229 100644 (file)
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -23,10 +23,11 @@
  
  #include "system.h"
  #include "argmatch.h"
-#include "cu-ctype.h"
  #include "linebuffer.h"
  #include "fadvise.h"
+#include "mcel.h"
  #include "posixver.h"
+#include "skipchars.h"
  #include "stdio--.h"
  #include "xstrtol.h"
  #include "memcasecmp.h"
@@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid)
    return MIN (size, SIZE_MAX);
  }
  
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
  /* Given a linebuffer LINE,
     return a pointer to the beginning of the line's field to be compared. */
  
@@ -256,21 +263,19 @@ static char *
  find_field (struct linebuffer const *line)
  {
    size_t count;
-  char const *lp = line->buffer;
-  size_t size = line->length - 1;
-  size_t i = 0;
+  char *lp = line->buffer;
+  char const *lim = lp + line->length - 1;
  
-  for (count = 0; count < skip_fields && i < size; count++)
+  for (count = 0; count < skip_fields && lp < lim; count++)
      {
-      while (i < size && field_sep (lp[i]))
-        i++;
-      while (i < size && !field_sep (lp[i]))
-        i++;
+      lp = skip_buf_matching (lp, lim, newline_or_blank, true);
+      lp = skip_buf_matching (lp, lim, newline_or_blank, false);
      }
  
-  i += MIN (skip_chars, size - i);
+  for (size_t s = skip_chars; lp < lim && s; s--)
+    lp += mcel_scan (lp, lim).len;
  
-  return line->buffer + i;
+  return lp;
  }
  
  /* Return false if two strings OLD and NEW match, true if not.
diff --git a/tests/local.mk b/tests/local.mk

index 79fea1f6e813ec251d0de36da38a18dfb77f05f2..a5fb62d969ccd3ce5894e3e94d129843ea9ee94e 100644 (file)
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -271,6 +271,7 @@ all_tests =                                 \
    tests/misc/mktemp.pl                         \
    tests/misc/arch.sh                           \
    tests/misc/join.pl                           \
+  tests/misc/join-utf8.sh                      \
    tests/pr/pr-tests.pl                         \
    tests/pwd/pwd-option.sh                      \
    tests/chcon/chcon-fail.sh                    \
diff --git a/tests/misc/join-utf8.sh b/tests/misc/join-utf8.sh

new file mode 100755 (executable)

index 0000000..b70bff7
--- /dev/null
+++ b/tests/misc/join-utf8.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+# Test join in a UTF-8 locale.
+
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ join
+
+test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+fail=0
+
+vertical_line='|'
+multiplication_sign='×'
+en_dash='–'
+old_Persian_word_divider='𐏐'
+
+for s in \
+    "$vertical_line" \
+    "$multiplication_sign" \
+    "$en_dash" \
+    "$old_Persian_word_divider"
+do
+  printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
+    framework_failure_
+  printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
+    framework_failure_
+  join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
+  printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
+         "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
+    framework_failure_
+  compare exp out || fail=1
+done
+
+Exit $fail
author	Paul Eggert <eggert@cs.ucla.edu>
	Mon, 30 Oct 2023 07:32:51 +0000 (00:32 -0700)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Mon, 30 Oct 2023 07:58:04 +0000 (00:58 -0700)
NEWS		patch \| blob \| blame \| history
bootstrap.conf		patch \| blob \| blame \| history
gl/lib/cu-ctype.c	[deleted file]	patch \| blob \| blame \| history
gl/lib/cu-ctype.h	[deleted file]	patch \| blob \| blame \| history
gl/lib/skipchars.c	[new file with mode: 0644]	patch \| blob
gl/lib/skipchars.h	[new file with mode: 0644]	patch \| blob
gl/modules/cu-ctype	[deleted file]	patch \| blob \| blame \| history
gl/modules/skipchars	[new file with mode: 0644]	patch \| blob
src/join.c		patch \| blob \| blame \| history
src/numfmt.c		patch \| blob \| blame \| history
src/sort.c		patch \| blob \| blame \| history
src/uniq.c		patch \| blob \| blame \| history
tests/local.mk		patch \| blob \| blame \| history
tests/misc/join-utf8.sh	[new file with mode: 0755]	patch \| blob