From a9f2be5bfec2bfe86c0851787312996467a653ee Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Wed, 28 Jun 2017 01:23:52 +0000
Subject: [PATCH] expr: add multibyte support

Discussed in https://bugs.gnu.org/26779 .

* NEWS: Mention the improvement.
* bootstrap.conf: Add gnulib modules mbslen,mbschr.
* src/expr.c (mbs_logical_substr): New function to return a substring
based on logical character positions (instead of bytes).
(mbs_logical_cspn): Similar to strcspn/mbscspn, but returns number of
logical characters instead of byte offset.
(mbs_offset_to_chars): New function to return number of logical
characters fitting in a given byte offset.
(docolon): Report matched logical characters instead of bytes.
(eval6): For length/substr/index operations, use logical characters
instead of bytes by calling the above new functions.
* tests/misc/expr.pl: Repeat all tests with non-C locale to detect any
regressions.
* tests/misc/expr-multibyte.pl: New tests with multibyte input.
* tests/local.mk: Add new test file.
---
 NEWS                         |   5 +
 bootstrap.conf               |   2 +
 src/expr.c                   | 174 ++++++++++++++++++++++++---
 tests/local.mk               |   1 +
 tests/misc/expr-multibyte.pl | 226 +++++++++++++++++++++++++++++++++++
 tests/misc/expr.pl           |  20 ++++
 6 files changed, 410 insertions(+), 18 deletions(-)
 create mode 100755 tests/misc/expr-multibyte.pl

diff --git a/NEWS b/NEWS
index 071be4bb49..b834fa16ca 100644
--- a/NEWS
+++ b/NEWS
@@ -44,6 +44,9 @@ GNU coreutils NEWS                                    -*- outline -*-
   as that's inconsistent with the 24 hour time format used.
   [bug introduced in coreutils-7.0]
 
+  expr now returns number of characters matched (instead of incorrect
+  number of bytes matched) with 'match'/':' operators on multibyte strings.
+
 ** New features
 
   expand and unexpand now support specifying an offset for tab stops
@@ -53,6 +56,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   split supports a new --hex-suffixes[=from] option to create files with
   lower case hexadecimal suffixes, similar to the --numeric-suffixes option.
 
+  expr supports multibyte strings for all string operations.
+
 ** Improvements
 
   mv --verbose now distinguishes rename and copy operations.
diff --git a/bootstrap.conf b/bootstrap.conf
index 30ce621ce0..4db77a3d78 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -155,6 +155,8 @@ gnulib_modules="
   mbrlen
   mbrtowc
   mbsalign
+  mbschr
+  mbslen
   mbswidth
   memcasecmp
   memchr
diff --git a/src/expr.c b/src/expr.c
index 3448fc8222..795140ce98 100644
--- a/src/expr.c
+++ b/src/expr.c
@@ -37,6 +37,7 @@
 #include "die.h"
 #include "error.h"
 #include "long-options.h"
+#include "mbuiter.h"
 #include "strnumcmp.h"
 #include "xstrtol.h"
 
@@ -192,6 +193,148 @@ static bool nomoreargs (void);
 static bool null (VALUE *v);
 static void printv (VALUE *v);
 
+
+/*
+   Find the first occurrence in the character string S of any character
+   in the character string ACCEPT.
+
+   Copied from gnulib's mbscspn, with two differences:
+   1. Returns 1-based position of first found character, or zero if not found.
+   2. Returned value is the logical character index, NOT byte offset.
+
+   Examples:
+     mbs_logical_cspn ('hello','a')  => 0
+     mbs_logical_cspn ('hello','h')  => 1
+     mbs_logical_cspn ('hello','oe') => 2
+     mbs_logical_cspn ('hello','lo') => 3
+
+   In UTF-8 \xCE\xB1 is a single character (greek alpha):
+     mbs_logical_cspn ('\xCE\xB1bc','\xCE\xB1') => 1
+     mbs_logical_cspn ('\xCE\xB1bc','c') => 3 */
+static size_t
+mbs_logical_cspn (const char *s, const char *accept)
+{
+  size_t idx = 0;
+
+  if (accept[0] == '\0')
+    return 0;
+
+  /* Multibyte locale: walk S one logical character at a time.  */
+  if (MB_CUR_MAX > 1)
+    {
+      mbui_iterator_t iter;
+
+      for (mbui_init (iter, s); mbui_avail (iter); mbui_advance (iter))
+        {
+          ++idx;
+          if (mb_len (mbui_cur (iter)) == 1)
+            {
+              if (mbschr (accept, *mbui_cur_ptr (iter)))
+                return idx;
+            }
+          else
+            {
+              mbui_iterator_t aiter;
+
+              for (mbui_init (aiter, accept);
+                   mbui_avail (aiter);
+                   mbui_advance (aiter))
+                if (mb_equal (mbui_cur (aiter), mbui_cur (iter)))
+                  return idx;
+            }
+        }
+
+      /* No character of S occurs in ACCEPT.  */
+      return 0;
+    }
+  else
+    {
+      /* Single-byte locale: convert the byte offset returned by strcspn
+         to a 1-based index, or zero if not found. */
+      size_t i = strcspn (s, accept);
+      return (s[i] ? i + 1 : 0);
+    }
+}
+
+/* Extract the substring of S that starts at logical character
+   position POS and spans at most LEN characters.
+   The first character position is 1.
+   POS and LEN refer to logical characters, not octets.
+
+   Returns a newly allocated (xmalloc/xstrdup) NUL-terminated string.
+   The returned string is empty if POS/LEN are invalid. */
+static char *
+mbs_logical_substr (const char *s, size_t pos, size_t len)
+{
+  char *v, *vlim;
+
+  size_t blen = strlen (s); /* byte length */
+  size_t llen = (MB_CUR_MAX > 1) ? mbslen (s) : blen; /* logical length */
+
+  if (llen < pos || pos == 0 || len == 0 || len == SIZE_MAX)
+    return xstrdup ("");
+
+  /* Number of characters to copy, clipped to the end of S.  */
+  size_t vlen = MIN (len, llen - pos + 1);
+
+  if (MB_CUR_MAX == 1)
+    {
+      /* Single-byte case */
+      v = xmalloc (vlen + 1);
+      vlim = mempcpy (v, s + pos - 1, vlen);
+    }
+  else
+    {
+      /* Multibyte case */
+
+      /* FIXME: this is wasteful. Some memory can be saved by counting
+         how many bytes the matching characters occupy. */
+      vlim = v = xmalloc (blen + 1);
+
+      mbui_iterator_t iter;
+      size_t idx=1;
+      for (mbui_init (iter, s);
+           mbui_avail (iter) && vlen > 0;
+           mbui_advance (iter), ++idx)
+        {
+          /* Skip until we reach the starting position */
+          if (idx < pos)
+            continue;
+
+          /* Copy one (possibly multibyte) character and count it.  */
+          --vlen;
+          vlim = mempcpy (vlim, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
+        }
+    }
+  *vlim = '\0';
+  return v;
+}
+
+/* Return the number of logical characters (possibly multibyte)
+   that are in string S in the first OFS octets.
+
+   Example in UTF-8:
+   "\xE2\x9D\xA7" is "U+2767 ROTATED FLORAL HEART BULLET".
+   In the string below, there are only two characters
+   up to the first 4 bytes (The U+2767 which occupies 3 bytes and 'x'):
+      mbs_offset_to_chars ("\xE2\x9D\xA7xyz", 4) => 2  */
+static size_t
+mbs_offset_to_chars (const char *s, size_t ofs)
+{
+  mbui_iterator_t iter;
+  size_t c = 0;
+  for (mbui_init (iter, s); mbui_avail (iter); mbui_advance (iter))
+    {
+      ptrdiff_t d = mbui_cur_ptr (iter) - s; /* byte offset of this char */
+      if (d >= ofs)
+        break;
+      ++c;
+    }
+  return c;
+}
+
+
+
 void
 usage (int status)
 {
@@ -574,7 +717,14 @@ docolon (VALUE *sv, VALUE *pv)
           v = str_value (sv->u.s + re_regs.start[1]);
         }
       else
-        v = int_value (matchlen);
+        {
+          /* In multibyte locales, convert the matched offset (=number of bytes)
+             to the number of matched characters. */
+          size_t i = (MB_CUR_MAX == 1
+                      ? matchlen
+                      : mbs_offset_to_chars (sv->u.s, matchlen));
+          v = int_value (i);
+        }
     }
   else if (matchlen == -1)
     {
@@ -650,7 +800,7 @@ eval6 (bool evaluate)
     {
       r = eval6 (evaluate);
       tostring (r);
-      v = int_value (strlen (r->u.s));
+      v = int_value (mbslen (r->u.s));
       freev (r);
       return v;
     }
@@ -676,20 +826,18 @@ eval6 (bool evaluate)
       r = eval6 (evaluate);
       tostring (l);
       tostring (r);
-      pos = strcspn (l->u.s, r->u.s);
-      v = int_value (l->u.s[pos] ? pos + 1 : 0);
+      pos = mbs_logical_cspn (l->u.s, r->u.s);
+      v = int_value (pos);
       freev (l);
       freev (r);
       return v;
     }
   else if (nextarg ("substr"))
     {
-      size_t llen;
       l = eval6 (evaluate);
       i1 = eval6 (evaluate);
       i2 = eval6 (evaluate);
       tostring (l);
-      llen = strlen (l->u.s);
 
       if (!toarith (i1) || !toarith (i2))
         v = str_value ("");
@@ -698,18 +846,8 @@ eval6 (bool evaluate)
           size_t pos = getsize (i1->u.i);
           size_t len = getsize (i2->u.i);
 
-          if (llen < pos || pos == 0 || len == 0 || len == SIZE_MAX)
-            v = str_value ("");
-          else
-            {
-              size_t vlen = MIN (len, llen - pos + 1);
-              char *vlim;
-              v = xmalloc (sizeof *v);
-              v->type = string;
-              v->u.s = xmalloc (vlen + 1);
-              vlim = mempcpy (v->u.s, l->u.s + pos - 1, vlen);
-              *vlim = '\0';
-            }
+          char *s = mbs_logical_substr (l->u.s, pos, len);
+          v = str_value (s);
         }
       freev (l);
       freev (i1);
diff --git a/tests/local.mk b/tests/local.mk
index 6112e88ed6..235bcbe65d 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -290,6 +290,7 @@ all_tests =					\
   tests/misc/env-null.sh			\
   tests/misc/expand.pl				\
   tests/misc/expr.pl				\
+  tests/misc/expr-multibyte.pl			\
   tests/misc/factor.pl				\
   tests/misc/factor-parallel.sh			\
   tests/misc/false-status.sh			\
diff --git a/tests/misc/expr-multibyte.pl b/tests/misc/expr-multibyte.pl
new file mode 100755
index 0000000000..8ef8f4051f
--- /dev/null
+++ b/tests/misc/expr-multibyte.pl
@@ -0,0 +1,226 @@
+#!/usr/bin/perl
+# Exercise expr with multibyte input
+
+# Copyright (C) 2017 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+use strict;
+
+(my $ME = $0) =~ s|.*/||;
+
+my $limits = getlimits ();
+my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW};
+
+(my $program_name = $0) =~ s|.*/||;
+my $prog = 'expr';
+
+my $locale = $ENV{LOCALE_FR_UTF8};
+! defined $locale || $locale eq 'none'
+  and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n";
+
+
+=pod
+ἔκφρασις (ekphrasis) - "expression" in Ancient Greek.
+=cut
+my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}";
+
+
+## NOTE about tests locales:
+## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"}
+## added to them automatically - results are multibyte-aware.
+## Tests starting with 'st' have the same input but will be
+## run under C locale and will be treated as single-bytes.
+## This enables interleaving C/UTF8 tests
+## (for easier comparison of expected results).
+
+my @Tests =
+  (
+   ### length expressions ###
+
+   # sanity check
+   ['mb-l1', 'length abcdef',   {OUT=>"6"}],
+   ['st-l1', 'length abcdef',   {OUT=>"6"}],
+
+   # A single multibyte character in the beginning of the string
+   # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA"
+   ['mb-l2', "length \xCE\xB1bcdef",   {OUT=>"6"}],
+   ['st-l2', "length \xCE\xB1bcdef",   {OUT=>"7"}],
+
+   # A single multibyte character in the middle of the string
+   # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA"
+   ['mb-l3', "length abc\xCE\xB4ef",   {OUT=>"6"}],
+   ['st-l3', "length abc\xCE\xB4ef",   {OUT=>"7"}],
+
+   # A single multibyte character in the end of the string
+   ['mb-l4', "length fedcb\xCE\xB1",   {OUT=>"6"}],
+   ['st-l4', "length fedcb\xCE\xB1",   {OUT=>"7"}],
+
+   # An invalid multibyte sequence
+   ['mb-l5', "length \xB1aaa",   {OUT=>"4"}],
+   ['st-l5', "length \xB1aaa",   {OUT=>"4"}],
+
+   # An incomplete multibyte sequence at the end of the string
+   ['mb-l6', "length aaa\xCE",   {OUT=>"4"}],
+   ['st-l6', "length aaa\xCE",   {OUT=>"4"}],
+
+   # A string composed entirely of multibyte characters (Greek word)
+   ['mb-l7', "length $expression",   {OUT=>"8"}],
+   ['st-l7', "length $expression",   {OUT=>"17"}],
+
+
+
+   ### index expressions ###
+
+   # sanity check
+   ['mb-i1', 'index abcdef fb',   {OUT=>"2"}],
+   ['st-i1', 'index abcdef fb',   {OUT=>"2"}],
+
+   # Search for a single-octet
+   ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"2"}],
+   ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b",   {OUT=>"3"}],
+   ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"6"}],
+   ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f",   {OUT=>"8"}],
+
+   # Search for multibyte character.
+   # In the C locale, the search string is treated as two octets.
+   # the first of them (\xCE) matches the first octet of the input string.
+   ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
+   ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
+
+   # Invalid multibyte sequence in the input string, treated as a single octet.
+   ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"4"}],
+   ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4",   {OUT=>"1"}],
+
+   # Invalid multibyte sequence in the search string, treated as a single octet.
+   # In multibyte locale, there should be no match, expr returns and prints
+   # zero, and terminates with exit-code 1 (as per POSIX).
+   ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"0"}, {EXIT=>1}],
+   ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4",   {OUT=>"6"}],
+
+   # Edge-case: invalid multibyte sequence BOTH in the input string
+   # and in the search string: expr should find a match.
+   ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4",       {OUT=>"4"}],
+
+
+   ### substr expressions ###
+
+   # sanity check
+   ['mb-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
+   ['st-s1', 'substr abcdef 2 3',   {OUT=>"bcd"}],
+
+   ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE\xB1"}],
+   ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1",   {OUT=>"\xCE"}],
+
+   ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"c\xCE\xB4"}],
+   ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2",   {OUT=>"bc"}],
+
+   ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"\xCE\xB4"}],
+   ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1",   {OUT=>"c"}],
+
+   ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"\xCE\xB4e"}],
+   ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2",   {OUT=>"c\xCE"}],
+
+   ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"f"}],
+   ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1",   {OUT=>"\xB4"}],
+
+   ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>""}, {EXIT=>1}],
+   ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1",   {OUT=>"e"}],
+
+   # Invalid multibyte sequences
+   ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"c\xB4e"}],
+   ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3",   {OUT=>"bc\xB4"}],
+
+
+   ### match expressions ###
+
+   # sanity check
+   ['mb-m1', 'match abcdef ab',   {OUT=>"2"}],
+   ['st-m1', 'match abcdef ab',   {OUT=>"2"}],
+   ['mb-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
+   ['st-m2', 'match abcdef "\(ab\)"',   {OUT=>"ab"}],
+
+   # The regex engine should match the '.' to the first multibyte character.
+   ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}],
+   ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}],
+
+   # The opposite of the previous test: two dots should only match
+   # the two octets in single-byte locale.
+   ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}],
+   ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}],
+
+   # Match with grouping - a single dot should return the two octets
+   ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}],
+   ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}],
+
+   # Invalid multibyte sequences - regex should not match in multibyte locale
+   # (POSIX requirement)
+   ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}],
+   ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}],
+
+
+   # Character classes: in the multibyte case, the regex engine understands
+   # there is a single multibyte character in the brackets.
+   # In the single byte case, the regex engine sees two octets in the character
+   # class ('\xCE' and '\xB1') - and it matches the first one.
+   ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}],
+   ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}],
+
+  );
+
+
+# Append a newline to end of each expected 'OUT' string.
+my $t;
+foreach $t (@Tests)
+  {
+    my $arg1 = $t->[1];
+    my $e;
+    foreach $e (@$t)
+      {
+        $e->{OUT} .= "\n"
+          if ref $e eq 'HASH' and exists $e->{OUT};
+      }
+  }
+
+
+# Force multibyte locale in the tests whose name starts with 'mb'.
+#
+# NOTE about the ERR_SUBST:
+# Expected error messages (for any test that defines them) are written as
+# in the C locale, quoted with the single-quote character (ASCII 0x27).
+# In UTF-8 locale, the error messages will use:
+#  'LEFT SINGLE QUOTATION MARK'  (U+2018) (UTF8: 0xE2 0x80 0x98)
+#  'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99)
+# So we replace them with ascii single-quote and the results will
+# match the expected error string.
+if ($locale ne 'C')
+  {
+    my @new;
+    foreach my $t (@Tests)
+      {
+        my ($tname) = @$t;
+        if ($tname =~ /^mb/)
+          {
+            push @$t, ({ENV => "LC_ALL=$locale"},
+                       {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"});
+          }
+      }
+  }
+
+
+my $save_temps = $ENV{DEBUG};
+my $verbose = $ENV{VERBOSE};
+
+my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose);
+exit $fail;
diff --git a/tests/misc/expr.pl b/tests/misc/expr.pl
index 06d511ebe2..c0d6f7c242 100755
--- a/tests/misc/expr.pl
+++ b/tests/misc/expr.pl
@@ -24,6 +24,10 @@ my $prog = 'expr';
 # Turn off localization of executable's output.
 @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3;
 
+my $mb_locale = $ENV{LOCALE_FR_UTF8};
+! defined $mb_locale || $mb_locale eq 'none'
+  and $mb_locale = 'C';
+
 my $big =      '98782897298723498732987928734';
 my $big_p1 =   '98782897298723498732987928735';
 my $big_sum = '197565794597446997465975857469';
@@ -191,6 +195,22 @@ foreach $t (@Tests)
       }
   }
 
+if ($mb_locale ne 'C')
+  {
+    # Duplicate each test vector, appending "-mb" to the test name and
+    # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we
+    # provide coverage for the distro-added multi-byte code paths.
+    my @new;
+    foreach my $t (@Tests)
+      {
+        my @new_t = @$t;
+        my $test_name = shift @new_t;
+
+        push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}];
+      }
+    push @Tests, @new;
+  }
+
 my $save_temps = $ENV{SAVE_TEMPS};
 my $verbose = $ENV{VERBOSE};
 
-- 
2.47.3