From 6bbf04db6335a97adf06ad31ab6cea9292ad7d2c Mon Sep 17 00:00:00 2001 From: Collin Funk Date: Sat, 7 Feb 2026 11:15:23 -0800 Subject: [PATCH] nl: support multi-byte section delimiters * NEWS: Mention the improvement. * src/nl.c: Include mcel.h. (DEFAULT_SECTION_DELIMITERS): Resize to fit 2 multi-byte characters. (section_del_len): New variable. (check_section): Compare against section_del_len instead of 2. (main): Support multi-byte characters for the -d option. * tests/nl/multibyte.sh: New file. * tests/nl/nl.sh: New file, moved from tests/misc/nl.sh. * tests/local.mk (all_tests): Add the new test. Adjust the existing tests file name. * cfg.mk (exclude_file_name_regexp--sc_space_tab): Adjust the existing tests file name. --- NEWS | 4 +++ cfg.mk | 2 +- src/nl.c | 33 ++++++++++++++++------- tests/local.mk | 3 ++- tests/nl/multibyte.sh | 56 ++++++++++++++++++++++++++++++++++++++++ tests/{misc => nl}/nl.sh | 0 6 files changed, 87 insertions(+), 11 deletions(-) create mode 100755 tests/nl/multibyte.sh rename tests/{misc => nl}/nl.sh (100%) diff --git a/NEWS b/NEWS index 2af46eac30..6087494f3d 100644 --- a/NEWS +++ b/NEWS @@ -12,6 +12,10 @@ GNU coreutils NEWS -*- outline -*- 'date --date' now parses dot delimited dd.mm.yy format common in Europe. This is in addition to the already supported mm/dd/yy and yy-mm-dd formats. +** Improvements + + 'nl' now supports multi-byte --section-delimiter characters. + ** Build-related ./configure --enable-single-binary=hardlinks is now supported on systems diff --git a/cfg.mk b/cfg.mk index b394f6698e..27b63f93b0 100644 --- a/cfg.mk +++ b/cfg.mk @@ -905,7 +905,7 @@ update-copyright-env = \ # List syntax-check exemptions. 
exclude_file_name_regexp--sc_space_tab = \ - ^(tests/pr/|tests/misc/nl\.sh$$|gl/.*\.diff$$|man/help2man$$) + ^(tests/pr/|tests/nl/nl\.sh$$|gl/.*\.diff$$|man/help2man$$) exclude_file_name_regexp--sc_bindtextdomain = \ ^(gl/.*|lib/euidaccess-stat|src/make-prime-list|src/cksum_crc)\.c$$ exclude_file_name_regexp--sc_trailing_blank = \ diff --git a/src/nl.c b/src/nl.c index 805de34911..1b7a08bb27 100644 --- a/src/nl.c +++ b/src/nl.c @@ -29,6 +29,7 @@ #include "fadvise.h" #include "linebuffer.h" +#include "mcel.h" #include "quote.h" #include "xdectoint.h" @@ -52,7 +53,7 @@ static char const FORMAT_RIGHT_LZ[] = "%0*jd%s"; static char const FORMAT_LEFT[] = "%-*jd%s"; /* Default section delimiter characters. */ -static char DEFAULT_SECTION_DELIMITERS[] = "\\:"; +static char DEFAULT_SECTION_DELIMITERS[MCEL_LEN_MAX * 2 + 1] = "\\:"; /* Types of input lines: either one of the section delimiters, or text to output. */ @@ -96,6 +97,9 @@ static char const *separator_str = "\t"; /* Input section delimiter string (-d). */ static char *section_del = DEFAULT_SECTION_DELIMITERS; +/* Input section delimiter length. */ +static size_t section_del_len; + /* Header delimiter string. */ static char *header_del = NULL; @@ -404,8 +408,8 @@ check_section (void) { size_t len = line_buf.length - 1; - if (len < 2 || footer_del_len < 2 - || !memeq (line_buf.buffer, section_del, 2)) + if (len < section_del_len || footer_del_len < section_del_len + || !memeq (line_buf.buffer, section_del, section_del_len)) return Text; if (len == header_del_len && memeq (line_buf.buffer, header_del, header_del_len)) @@ -578,14 +582,25 @@ main (int argc, char **argv) break; case 'd': len = strlen (optarg); - if (len == 1 || len == 2) /* POSIX. 
*/ + if (1 < MB_CUR_MAX) { - char *p = section_del; - while (*optarg) - *p++ = *optarg++; + char const *p = optarg; + char const *lim = p + len; + int n_chars = 0; + for (; p < lim && n_chars < 2; ++n_chars) + p += mcel_scan (p, lim).len; + if (n_chars == 1) + memcpy (mempcpy (section_del, optarg, len), ":", sizeof ":"); + else + section_del = optarg; } else - section_del = optarg; /* GNU extension. */ + { + if (len == 1) + *section_del = *optarg; + else + section_del = optarg; + } break; case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); @@ -599,7 +614,7 @@ main (int argc, char **argv) usage (EXIT_FAILURE); /* Initialize the section delimiters. */ - len = strlen (section_del); + section_del_len = len = strlen (section_del); header_del_len = len * 3; header_del = xmalloc (header_del_len + 1); diff --git a/tests/local.mk b/tests/local.mk index c7435d9af2..b7ed37c50b 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -367,7 +367,8 @@ all_tests = \ tests/misc/mknod.sh \ tests/nice/nice.sh \ tests/nice/nice-fail.sh \ - tests/misc/nl.sh \ + tests/nl/nl.sh \ + tests/nl/multibyte.sh \ tests/misc/nohup.sh \ tests/nproc/nproc-avail.sh \ tests/nproc/nproc-positive.sh \ diff --git a/tests/nl/multibyte.sh b/tests/nl/multibyte.sh new file mode 100755 index 0000000000..e1eb40467e --- /dev/null +++ b/tests/nl/multibyte.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# Test nl with multibyte section delimiters. + +# Copyright (C) 2026 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+ +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ nl printf + +test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available" + +cat <<\EOF > exp || framework_failure_ + + 1 a + + 2 b + + 3 c +EOF + +test_nl_multibyte () +{ + { + export LC_ALL="$LOCALE_FR_UTF8" + # A missing second character implies ':'. + env printf "$2$2$2\na\n$2$2\nb\n$2\nc\n" > inp || framework_failure_ + nl -p -ha -fa -d $(env printf "$1") < inp > out || fail=1 + } + compare exp out +} + +# Implied ':' character. +test_nl_multibyte '\xc3' '\xc3:' || fail=1 +test_nl_multibyte '\uB250' '\uB250:' || fail=1 + +# Two characters. +test_nl_multibyte '\xc3\xc3' '\xc3\xc3' || fail=1 +test_nl_multibyte '\uB250\uB250' '\uB250\uB250' || fail=1 + +# More than 2 characters is a GNU extension. +test_nl_multibyte '\uB250\uB250\uB250' '\uB250\uB250\uB250' || fail=1 +test_nl_multibyte "$(bad_unicode)" "$(bad_unicode)" || fail=1 + +Exit $fail diff --git a/tests/misc/nl.sh b/tests/nl/nl.sh similarity index 100% rename from tests/misc/nl.sh rename to tests/nl/nl.sh -- 2.47.3