* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth. All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int. All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code
will need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh.
to preserve ownership" when copying to GNU/Linux CIFS file systems.
They do this by working around some Linux CIFS bugs.
+ join and uniq now support multi-byte characters better.
+ For example, 'join -tX' now works even if X is a multi-byte character,
+ and both programs now treat multi-byte characters like U+3000
+ IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
+
numfmt options like --suffix no longer have an arbitrary 127-byte limit.
[bug introduced with numfmt in coreutils-8.21]
crypto/sha256
crypto/sha512
crypto/sm3
- cu-ctype
cycle-check
d-ino
d-type
settime
sig2str
sigaction
+ skipchars
smack
ssize_t
stat-macros
+++ /dev/null
-#include <config.h>
-#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
-#include <cu-ctype.h>
+++ /dev/null
-/* Character type definitions for coreutils
-
- Copyright 2023 Free Software Foundation, Inc.
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>. */
-
-#include <ctype.h>
-
-#ifndef _GL_INLINE_HEADER_BEGIN
-# error "Please include config.h first."
-#endif
-_GL_INLINE_HEADER_BEGIN
-#ifndef CU_CTYPE_INLINE
-# define CU_CTYPE_INLINE _GL_INLINE
-#endif
-
-/* '\n' is considered a field separator with --zero-terminated. */
-CU_CTYPE_INLINE bool
-field_sep (unsigned char ch)
-{
- return isblank (ch) || ch == '\n';
-}
-
-_GL_INLINE_HEADER_END
--- /dev/null
+#include <config.h>
+#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
+#include <skipchars.h>
--- /dev/null
+/* Skipping sequences of characters satisfying a predicate
+
+ Copyright 2023 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include "mcel.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef SKIPCHARS_INLINE
+# define SKIPCHARS_INLINE _GL_INLINE
+#endif
+
+/* Scan the NUL-terminated string STR one possibly multi-byte character
+   or encoding error G at a time, for as long as PREDICATE (G) equals OK.
+   Return the address of the first G for which PREDICATE (G) differs
+   from OK, or the address of the terminating NUL if there is none.  */
+
+SKIPCHARS_INLINE char *
+skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
+{
+  char const *p = str;
+  while (*p)
+    {
+      mcel_t g = mcel_scanz (p);
+      if (predicate (g) != ok)
+        break;
+      /* Step over the whole character, not just one byte.  */
+      p += g.len;
+    }
+  return (char *) p;
+}
+
+/* Scan the buffer that starts at BUF and ends just before LIM, one
+   possibly multi-byte character or encoding error G at a time, for as
+   long as PREDICATE (G) equals OK.  Return the address of the first G
+   for which PREDICATE (G) differs from OK, or LIM if there is none.  */
+
+SKIPCHARS_INLINE char *
+skip_buf_matching (char const *buf, char const *lim,
+                   bool (*predicate) (mcel_t), bool ok)
+{
+  char const *p = buf;
+  while (p < lim)
+    {
+      mcel_t g = mcel_scan (p, lim);
+      if (predicate (g) != ok)
+        break;
+      /* Step over the whole character, not just one byte.  */
+      p += g.len;
+    }
+  return (char *) p;
+}
+
+_GL_INLINE_HEADER_END
+++ /dev/null
-Description:
-ctype.h-like definitions for coreutils
-
-Files:
-lib/cu-ctype.c
-lib/cu-ctype.h
-
-Depends-on:
-ctype
-extern-inline
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += cu-ctype.c
-
-Include:
-"cu-ctype.h"
-
-License:
-GPL
-
-Maintainer:
-all
--- /dev/null
+Description:
+Skip sequences of multi-byte characters or encoding errors
+
+Files:
+lib/skipchars.c
+lib/skipchars.h
+
+Depends-on:
+extern-inline
+mcel
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += skipchars.c
+
+Include:
+"skipchars.h"
+
+License:
+GPL
+
+Maintainer:
+all
#include "system.h"
#include "assure.h"
-#include "cu-ctype.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "linebuffer.h"
+#include "mcel.h"
#include "memcasecmp.h"
#include "quote.h"
+#include "skipchars.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
/* Last element in 'outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;
-/* Tab character separating fields. If negative, fields are separated
- by any nonempty string of blanks, otherwise by exactly one
- tab character whose value (when cast to unsigned char) equals TAB. */
-static int tab = -1;
+/* Tab character (or encoding error) separating fields. If TAB.len == 0,
+ fields are separated by any nonempty string of blanks, otherwise by
+ exactly one tab character (or encoding error) equal to TAB. */
+static mcel_t tab;
+
+/* The output separator to use, and its length in bytes. */
+static char const *output_separator = " ";
+static idx_t output_seplen = 1;
/* If nonzero, check that the input is correctly ordered. */
static enum
++(line->nfields);
}
+/* Return true if G compares equal to the field separator TAB
+   according to mcel_cmp.  */
+static bool
+eq_tab (mcel_t g)
+{
+  return !mcel_cmp (g, tab);
+}
+
+/* Return true if G is a newline, or a character that c32isblank
+   classifies as blank.  */
+static bool
+newline_or_blank (mcel_t g)
+{
+  if (g.ch == '\n')
+    return true;
+  return c32isblank (g.ch) != 0;
+}
+
/* Fill in the 'fields' structure in LINE. */
static void
if (ptr == lim)
return;
- if (0 <= tab && tab != '\n')
- {
- char *sep;
- for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
- extract_field (line, ptr, sep - ptr);
- }
- else if (tab < 0)
+ if (!tab.len)
{
- /* Skip leading blanks before the first field. */
- while (field_sep (*ptr))
- if (++ptr == lim)
- return;
-
- do
+ while (ptr < lim)
{
- char *sep;
- for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
- continue;
+ /* Skip the run of separators that precedes the next field. */
+ ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
+ /* Stop when only separators remained. Compare against LIM rather
+ than testing for a NUL byte: the byte at LIM is outside the
+ scanned range, and testing it would append a spurious empty
+ field after trailing blanks. */
+ if (ptr == lim)
+ break;
+ char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
extract_field (line, ptr, sep - ptr);
- if (sep == lim)
- return;
- for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
- continue;
+ ptr = sep;
}
- while (ptr != lim);
}
+ else
+ {
+ if (tab.ch != '\n')
+ for (char *sep;
+ ((sep = skip_buf_matching (ptr, lim, eq_tab, false))
+ < lim);
+ ptr = sep + mcel_scan (sep, lim).len)
+ extract_field (line, ptr, sep - ptr);
- extract_field (line, ptr, lim - ptr);
+ extract_field (line, ptr, lim - ptr);
+ }
}
static void
{
idx_t i;
idx_t nfields = autoformat ? autocount : line->nfields;
- char output_separator = tab < 0 ? ' ' : tab;
for (i = 0; i < join_field && i < nfields; ++i)
{
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
for (i = join_field + 1; i < nfields; ++i)
{
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
}
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
- char output_separator = tab < 0 ? ' ' : tab;
idx_t field;
struct line const *line;
o = o->next;
if (o == nullptr)
break;
- putchar (output_separator);
+ fwrite (output_separator, 1, output_seplen, stdout);
}
putchar (eolchar);
}
}
}
+/* Return true if G is a comma, or a character that c32isblank
+   classifies as blank.  */
+static bool
+comma_or_blank (mcel_t g)
+{
+  if (g.ch == ',')
+    return true;
+  return c32isblank (g.ch) != 0;
+}
+
/* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
static void
int file_index;
idx_t field_index;
char const *spec_item = p;
-
- p = strpbrk (p, ", \t");
- if (p)
- *p++ = '\0';
+ p = skip_str_matching (spec_item, comma_or_blank, false);
+ if (*p)
+ {
+ mcel_t g = mcel_scanz (p);
+ *p = '\0';
+ p += g.len;
+ }
decode_field_spec (spec_item, &file_index, &field_index);
add_field (file_index, field_index);
}
- while (p);
+ while (*p);
}
/* Set the join field *VAR to VAL, but report an error if *VAR is set
case 't':
{
- unsigned char newtab = optarg[0];
- if (! newtab)
- newtab = '\n'; /* '' => process the whole line. */
- else if (optarg[1])
+ mcel_t newtab;
+ if (!*optarg)
+ {
+ /* '' => process the whole line. */
+ newtab = mcel_ch ('\n', 1);
+ /* output_separator does not matter. */
+ }
+ else if (STREQ (optarg, "\\0"))
+ {
+ newtab = mcel_ch ('\0', 1);
+ output_separator = "";
+ }
+ else
{
- if (STREQ (optarg, "\\0"))
- newtab = '\0';
- else
+ newtab = mcel_scanz (optarg);
+ if (optarg[newtab.len])
error (EXIT_FAILURE, 0, _("multi-character tab %s"),
quote (optarg));
+ output_separator = optarg;
}
- if (0 <= tab && tab != newtab)
+ if (tab.len && mcel_cmp (tab, newtab) != 0)
error (EXIT_FAILURE, 0, _("incompatible tabs"));
tab = newtab;
+ output_seplen = newtab.len;
}
break;
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
+#include <ctype.h>
#include <float.h>
#include <getopt.h>
#include <stdckdint.h>
#include "argmatch.h"
#include "c-ctype.h"
-#include "cu-ctype.h"
#include "mbswidth.h"
#include "quote.h"
+#include "skipchars.h"
#include "system.h"
#include "xstrtol.h"
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
+/* Return true if G is a newline, or a character that c32isblank
+   classifies as blank.  */
+static bool
+newline_or_blank (mcel_t g)
+{
+  if (g.ch == '\n')
+    return true;
+  return c32isblank (g.ch) != 0;
+}
+
/* Return a pointer to the beginning of the next field in line.
The line pointer is moved to the end of the next field. */
static char*
else
{
/* keep any space prefix in the returned field */
- while (*field_end && field_sep (*field_end))
- ++field_end;
-
- while (*field_end && ! field_sep (*field_end))
- ++field_end;
+ field_end = skip_str_matching (field_end, newline_or_blank, true);
+ field_end = skip_str_matching (field_end, newline_or_blank, false);
}
*line = field_end;
#include <config.h>
+#include <ctype.h>
#include <getopt.h>
#include <pthread.h>
#include <sys/resource.h>
#include "system.h"
#include "argmatch.h"
#include "assure.h"
-#include "cu-ctype.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "flexmember.h"
for (i = 0; i < UCHAR_LIM; ++i)
{
- blanks[i] = field_sep (i);
+ blanks[i] = i == '\n' || isblank (i);
+ nondictionary[i] = ! blanks[i] && ! isalnum (i);
nonprinting[i] = ! isprint (i);
- nondictionary[i] = ! isalnum (i) && ! field_sep (i);
fold_toupper[i] = toupper (i);
}
#include "system.h"
#include "argmatch.h"
-#include "cu-ctype.h"
#include "linebuffer.h"
#include "fadvise.h"
+#include "mcel.h"
#include "posixver.h"
+#include "skipchars.h"
#include "stdio--.h"
#include "xstrtol.h"
#include "memcasecmp.h"
return MIN (size, SIZE_MAX);
}
+/* Return true if G is a newline, or a character that c32isblank
+   classifies as blank.  */
+static bool
+newline_or_blank (mcel_t g)
+{
+  if (g.ch == '\n')
+    return true;
+  return c32isblank (g.ch) != 0;
+}
+
/* Given a linebuffer LINE,
return a pointer to the beginning of the line's field to be compared. */
find_field (struct linebuffer const *line)
{
size_t count;
- char const *lp = line->buffer;
- size_t size = line->length - 1;
- size_t i = 0;
+ char *lp = line->buffer;
+ char const *lim = lp + line->length - 1;
- for (count = 0; count < skip_fields && i < size; count++)
+ for (count = 0; count < skip_fields && lp < lim; count++)
{
- while (i < size && field_sep (lp[i]))
- i++;
- while (i < size && !field_sep (lp[i]))
- i++;
+ lp = skip_buf_matching (lp, lim, newline_or_blank, true);
+ lp = skip_buf_matching (lp, lim, newline_or_blank, false);
}
- i += MIN (skip_chars, size - i);
+ for (size_t s = skip_chars; lp < lim && s; s--)
+ lp += mcel_scan (lp, lim).len;
- return line->buffer + i;
+ return lp;
}
/* Return false if two strings OLD and NEW match, true if not.
tests/misc/mktemp.pl \
tests/misc/arch.sh \
tests/misc/join.pl \
+ tests/misc/join-utf8.sh \
tests/pr/pr-tests.pl \
tests/pwd/pwd-option.sh \
tests/chcon/chcon-fail.sh \
--- /dev/null
+#!/bin/sh
+# Test join in a UTF-8 locale.
+
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ join
+
+test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+fail=0
+
+# Separators whose UTF-8 encodings are 1, 2, 3 and 4 bytes long.
+vertical_line='|'
+multiplication_sign='×'
+en_dash='–'
+old_Persian_word_divider='𐏐'
+
+for s in \
+ "$vertical_line" \
+ "$multiplication_sign" \
+ "$en_dash" \
+ "$old_Persian_word_divider"
+do
+ # Build two inputs that share some keys and each have an unpaired key.
+ printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
+ framework_failure_
+ printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
+ framework_failure_
+ join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
+ # The output must use the same separator, with 'ouch' for missing fields.
+ printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
+ "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
+ framework_failure_
+ compare exp out || fail=1
+done
+
+Exit $fail