From a9f2be5bfec2bfe86c0851787312996467a653ee Mon Sep 17 00:00:00 2001 From: Assaf Gordon Date: Wed, 28 Jun 2017 01:23:52 +0000 Subject: [PATCH] expr: add multibyte support Discussed in https://bugs.gnu.org/26779 . * NEWS: Mention the improvement. * bootstrap.conf: Add gnulib modules mbslen,mbschr. * src/expr.c (mbs_logical_substr): New function to return a substring based on logical character positions (instead of bytes). (mbs_logical_cspn): Similar to strcspn/mbscspn, but returns number of logical characters instead of byte offset. (mbs_offset_to_chars): New function to return number of logical characters fitting in a given byte offset. (docolon): Report matched logical characters instead of bytes. (eval6): For length/substr/index operations, use logical characters instead of bytes by calling the above new functions. * tests/misc/expr.pl: Repeat all tests with non-C locale to detect any regressions. * tests/misc/expr-multibyte.pl: New tests with multibyte input. * tests/local.mk: Add new test file. --- NEWS | 5 + bootstrap.conf | 2 + src/expr.c | 174 ++++++++++++++++++++++++--- tests/local.mk | 1 + tests/misc/expr-multibyte.pl | 226 +++++++++++++++++++++++++++++++++++ tests/misc/expr.pl | 20 ++++ 6 files changed, 410 insertions(+), 18 deletions(-) create mode 100755 tests/misc/expr-multibyte.pl diff --git a/NEWS b/NEWS index 071be4bb49..b834fa16ca 100644 --- a/NEWS +++ b/NEWS @@ -44,6 +44,9 @@ GNU coreutils NEWS -*- outline -*- as that's inconsistent with the 24 hour time format used. [bug introduced in coreutils-7.0] + expr now returns number of characters matched (instead of incorrect + number of bytes matched) with 'match'/':' operators on multibyte strings. + ** New features expand and unexpand now support specifying an offset for tab stops @@ -53,6 +56,8 @@ GNU coreutils NEWS -*- outline -*- split supports a new --hex-suffixes[=from] option to create files with lower case hexadecimal suffixes, similar to the --numeric-suffixes option. + expr supports multibyte strings for all string operations. + ** Improvements mv --verbose now distinguishes rename and copy operations. diff --git a/bootstrap.conf b/bootstrap.conf index 30ce621ce0..4db77a3d78 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -155,6 +155,8 @@ gnulib_modules=" mbrlen mbrtowc mbsalign + mbschr + mbslen mbswidth memcasecmp memchr diff --git a/src/expr.c b/src/expr.c index 3448fc8222..795140ce98 100644 --- a/src/expr.c +++ b/src/expr.c @@ -37,6 +37,7 @@ #include "die.h" #include "error.h" #include "long-options.h" +#include "mbuiter.h" #include "strnumcmp.h" #include "xstrtol.h" @@ -192,6 +193,148 @@ static bool nomoreargs (void); static bool null (VALUE *v); static void printv (VALUE *v); + +/* + Find the first occurrence in the character string STRING of any character + in the character string ACCEPT. + + Copied from gnulib's mbscspn, with two differences: + 1. Returns 1-based position of first found character, or zero if not found. + 2. Returned value is the logical character index, NOT byte offset. + + Examples: + mbs_logical_cspn ('hello','a') => 0 + mbs_logical_cspn ('hello','h') => 1 + mbs_logical_cspn ('hello','oe') => 1 + mbs_logical_cspn ('hello','lo') => 3 + + In UTF-8 \xCE\xB1 is a single character (greek alpha): + mbs_logical_cspn ('\xCE\xB1bc','\xCE\xB1') => 1 + mbs_logical_cspn ('\xCE\xB1bc','c') => 3 */ +static size_t +mbs_logical_cspn (const char *s, const char *accept) +{ + size_t idx = 0; + + if (accept[0] == '\0') + return 0; + + /* General case. */ + if (MB_CUR_MAX > 1) + { + mbui_iterator_t iter; + + for (mbui_init (iter, s); mbui_avail (iter); mbui_advance (iter)) + { + ++idx; + if (mb_len (mbui_cur (iter)) == 1) + { + if (mbschr (accept, *mbui_cur_ptr (iter))) + return idx; + } + else + { + mbui_iterator_t aiter; + + for (mbui_init (aiter, accept); + mbui_avail (aiter); + mbui_advance (aiter)) + if (mb_equal (mbui_cur (aiter), mbui_cur (iter))) + return idx; + } + } + + /* not found */ + return 0; + } + else + { + /* single-byte locale, + convert returned byte offset to 1-based index or zero if not found. */ + size_t i = strcspn (s, accept); + return (s[i] ? i + 1 : 0); + } +} + +/* Extract the substring of S, from logical character + position POS and LEN characters. + first character position is 1. + POS and LEN refer to logical characters, not octets. + + Upon exit, sets v->s to the new string. + The new string might be empty if POS/LEN are invalid. */ +static char * +mbs_logical_substr (const char *s, size_t pos, size_t len) +{ + char *v, *vlim; + + size_t blen = strlen (s); /* byte length */ + size_t llen = (MB_CUR_MAX > 1) ? mbslen (s) : blen; /* logical length */ + + if (llen < pos || pos == 0 || len == 0 || len == SIZE_MAX) + return xstrdup (""); + + /* characters to copy */ + size_t vlen = MIN (len, llen - pos + 1); + + if (MB_CUR_MAX == 1) + { + /* Single-byte case */ + v = xmalloc (vlen + 1); + vlim = mempcpy (v, s + pos - 1, vlen); + } + else + { + /* Multibyte case */ + + /* FIXME: this is wasteful. Some memory can be saved by counting + how many bytes the matching characters occupy. */ + vlim = v = xmalloc (blen + 1); + + mbui_iterator_t iter; + size_t idx=1; + for (mbui_init (iter, s); + mbui_avail (iter) && vlen > 0; + mbui_advance (iter), ++idx) + { + /* Skip until we reach the starting position */ + if (idx < pos) + continue; + + /* Copy one character */ + --vlen; + vlim = mempcpy (vlim, mbui_cur_ptr (iter), mb_len (mbui_cur (iter))); + } + } + *vlim = '\0'; + return v; +} + +/* Return the number of logical characteres (possibly multibyte) + that are in string S in the first OFS octets. + + Example in UTF-8: + "\xE2\x9D\xA7" is "U+2767 ROTATED FLORAL HEART BULLET". + In the string below, there are only two characters + up to the first 4 bytes (The U+2767 which occupies 3 bytes and 'x'): + mbs_count_to_offset ("\xE2\x9D\xA7xyz", 4) => 2 */ +static size_t +mbs_offset_to_chars (const char *s, size_t ofs) +{ + mbui_iterator_t iter; + size_t c = 0; + for (mbui_init (iter, s); mbui_avail (iter); mbui_advance (iter)) + { + ptrdiff_t d = mbui_cur_ptr (iter) - s; + if (d >= ofs) + break; + ++c; + } + return c; +} + + + void usage (int status) { @@ -574,7 +717,14 @@ docolon (VALUE *sv, VALUE *pv) v = str_value (sv->u.s + re_regs.start[1]); } else - v = int_value (matchlen); + { + /* In multibyte locales, convert the matched offset (=number of bytes) + to the number of matched characters. */ + size_t i = (MB_CUR_MAX == 1 + ? matchlen + : mbs_offset_to_chars (sv->u.s, matchlen)); + v = int_value (i); + } } else if (matchlen == -1) { @@ -650,7 +800,7 @@ eval6 (bool evaluate) { r = eval6 (evaluate); tostring (r); - v = int_value (strlen (r->u.s)); + v = int_value (mbslen (r->u.s)); freev (r); return v; } @@ -676,20 +826,18 @@ eval6 (bool evaluate) r = eval6 (evaluate); tostring (l); tostring (r); - pos = strcspn (l->u.s, r->u.s); - v = int_value (l->u.s[pos] ? pos + 1 : 0); + pos = mbs_logical_cspn (l->u.s, r->u.s); + v = int_value (pos); freev (l); freev (r); return v; } else if (nextarg ("substr")) { - size_t llen; l = eval6 (evaluate); i1 = eval6 (evaluate); i2 = eval6 (evaluate); tostring (l); - llen = strlen (l->u.s); if (!toarith (i1) || !toarith (i2)) v = str_value (""); @@ -698,18 +846,8 @@ eval6 (bool evaluate) size_t pos = getsize (i1->u.i); size_t len = getsize (i2->u.i); - if (llen < pos || pos == 0 || len == 0 || len == SIZE_MAX) - v = str_value (""); - else - { - size_t vlen = MIN (len, llen - pos + 1); - char *vlim; - v = xmalloc (sizeof *v); - v->type = string; - v->u.s = xmalloc (vlen + 1); - vlim = mempcpy (v->u.s, l->u.s + pos - 1, vlen); - *vlim = '\0'; - } + char *s = mbs_logical_substr (l->u.s, pos, len); + v = str_value (s); } freev (l); freev (i1); diff --git a/tests/local.mk b/tests/local.mk index 6112e88ed6..235bcbe65d 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -290,6 +290,7 @@ all_tests = \ tests/misc/env-null.sh \ tests/misc/expand.pl \ tests/misc/expr.pl \ + tests/misc/expr-multibyte.pl \ tests/misc/factor.pl \ tests/misc/factor-parallel.sh \ tests/misc/false-status.sh \ diff --git a/tests/misc/expr-multibyte.pl b/tests/misc/expr-multibyte.pl new file mode 100755 index 0000000000..8ef8f4051f --- /dev/null +++ b/tests/misc/expr-multibyte.pl @@ -0,0 +1,226 @@ +#!/usr/bin/perl +# Exercise expr with multibyte input + +# Copyright (C) 2017 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +use strict; + +(my $ME = $0) =~ s|.*/||; + +my $limits = getlimits (); +my $UINTMAX_OFLOW = $limits->{UINTMAX_OFLOW}; + +(my $program_name = $0) =~ s|.*/||; +my $prog = 'expr'; + +my $locale = $ENV{LOCALE_FR_UTF8}; +! defined $locale || $locale eq 'none' + and CuSkip::skip "$ME: this test requires FR-UTF8 locale\n"; + + +=pod +ἔκφρασις (ekphrasis) - "expression" in Ancient Greek. +=cut +my $expression = "\x{1F14}\x{3BA}\x{3C6}\x{3C1}\x{3B1}\x{3C3}\x{3B9}\x{3C2}"; + + +## NOTE about tests locales: +## Tests starting with 'mb' will have {ENV=>"LC_ALL=$locale"} +## added to them automatically - results are multibyte-aware. +## Tests starting with 'sb' have the same input but will be +## run under C locale and will be treated as single-bytes. +## This enables interleaving C/UTF8 tests +## (for easier comparison of expected results). + +my @Tests = + ( + ### length expressions ### + + # sanity check + ['mb-l1', 'length abcdef', {OUT=>"6"}], + ['st-l1', 'length abcdef', {OUT=>"6"}], + + # A single multibyte character in the beginning of the string + # \xCE\xB1 is UTF-8 for "U+03B1 GREEK SMALL LETTER ALPHA" + ['mb-l2', "length \xCE\xB1bcdef", {OUT=>"6"}], + ['st-l2', "length \xCE\xB1bcdef", {OUT=>"7"}], + + # A single multibyte character in the middle of the string + # \xCE\xB4 is UTF-8 for "U+03B4 GREEK SMALL LETTER DELTA" + ['mb-l3', "length abc\xCE\xB4ef", {OUT=>"6"}], + ['st-l3', "length abc\xCE\xB4ef", {OUT=>"7"}], + + # A single multibyte character in the end of the string + ['mb-l4', "length fedcb\xCE\xB1", {OUT=>"6"}], + ['st-l4', "length fedcb\xCE\xB1", {OUT=>"7"}], + + # A invalid multibyte sequence + ['mb-l5', "length \xB1aaa", {OUT=>"4"}], + ['st-l5', "length \xB1aaa", {OUT=>"4"}], + + # An incomplete multibyte sequence at the end of the string + ['mb-l6', "length aaa\xCE", {OUT=>"4"}], + ['st-l6', "length aaa\xCE", {OUT=>"4"}], + + # An incomplete multibyte sequence at the end of the string + ['mb-l7', "length $expression", {OUT=>"8"}], + ['st-l7', "length $expression", {OUT=>"17"}], + + + + ### index expressions ### + + # sanity check + ['mb-i1', 'index abcdef fb', {OUT=>"2"}], + ['st-i1', 'index abcdef fb', {OUT=>"2"}], + + # Search for a single-octet + ['mb-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"2"}], + ['st-i2', "index \xCE\xB1bc\xCE\xB4ef b", {OUT=>"3"}], + ['mb-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"6"}], + ['st-i3', "index \xCE\xB1bc\xCE\xB4ef f", {OUT=>"8"}], + + # Search for multibyte character. + # In the C locale, the search string is treated as two octets. + # the first of them (\xCE) matches the first octet of the input string. + ['mb-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}], + ['st-i4', "index \xCE\xB1bc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}], + + # Invalid multibyte sequence in the input string, treated as a single octet. + ['mb-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"4"}], + ['st-i5', "index \xCEbc\xCE\xB4ef \xCE\xB4", {OUT=>"1"}], + + # Invalid multibyte sequence in the search string, treated as a single octet. + # In multibyte locale, there should be no match, expr returns and prints + # zero, and terminates with exit-code 1 (as per POSIX). + ['mb-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"0"}, {EXIT=>1}], + ['st-i6', "index \xCE\xB1bc\xCE\xB4ef \xB4", {OUT=>"6"}], + + # Edge-case: invalid multibyte sequence BOTH in the input string + # and in the search string: expr should find a match. + ['mb-i7', "index \xCE\xB1bc\xB4ef \xB4", {OUT=>"4"}], + + + ### substr expressions ### + + # sanity check + ['mb-s1', 'substr abcdef 2 3', {OUT=>"bcd"}], + ['st-s1', 'substr abcdef 2 3', {OUT=>"bcd"}], + + ['mb-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE\xB1"}], + ['st-s2', "substr \xCE\xB1bc\xCE\xB4ef 1 1", {OUT=>"\xCE"}], + + ['mb-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"c\xCE\xB4"}], + ['st-s3', "substr \xCE\xB1bc\xCE\xB4ef 3 2", {OUT=>"bc"}], + + ['mb-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"\xCE\xB4"}], + ['st-s4', "substr \xCE\xB1bc\xCE\xB4ef 4 1", {OUT=>"c"}], + + ['mb-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"\xCE\xB4e"}], + ['st-s5', "substr \xCE\xB1bc\xCE\xB4ef 4 2", {OUT=>"c\xCE"}], + + ['mb-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"f"}], + ['st-s6', "substr \xCE\xB1bc\xCE\xB4ef 6 1", {OUT=>"\xB4"}], + + ['mb-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>""}, {EXIT=>1}], + ['st-s7', "substr \xCE\xB1bc\xCE\xB4ef 7 1", {OUT=>"e"}], + + # Invalid multibyte sequences + ['mb-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"c\xB4e"}], + ['st-s8', "substr \xCE\xB1bc\xB4ef 3 3", {OUT=>"bc\xB4"}], + + + ### match expressions ### + + # sanity check + ['mb-m1', 'match abcdef ab', {OUT=>"2"}], + ['st-m1', 'match abcdef ab', {OUT=>"2"}], + ['mb-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}], + ['st-m2', 'match abcdef "\(ab\)"', {OUT=>"ab"}], + + # The regex engine should match the '.' to the first multibyte character. + ['mb-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"3"}], + ['st-m3', "match \xCE\xB1bc\xCE\xB4ef .bc", {OUT=>"0"}, {EXIT=>1}], + + # The opposite of the previous test: two dots should only match + # the two octets in single-byte locale. + ['mb-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"0"}, {EXIT=>1}], + ['st-m4', "match \xCE\xB1bc\xCE\xB4ef ..bc", {OUT=>"4"}], + + # Match with grouping - a single dot should return the two octets + ['mb-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>"\xCE\xB1b"}], + ['st-m5', "match \xCE\xB1bc\xCE\xB4ef '\\(.b\\)c'", {OUT=>""}, {EXIT=>1}], + + # Invalid multibyte sequences - regex should not match in multibyte locale + # (POSIX requirement) + ['mb-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>""}, {EXIT=>1}], + ['st-m6', "match \xCEbc\xCE\xB4ef '\\(.\\)'", {OUT=>"\xCE"}], + + + # Character classes: in the multibyte case, the regex engine understands + # there is a single multibyte characeter in the brackets. + # In the single byte case, the regex engine sees two octets in the character + # class ('\xCE' and '\xB1') - and it matches the first one. + ['mb-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE\xB1"}], + ['st-m7', "match \xCE\xB1bc\xCE\xB4e '\\([\xCE\xB1]\\)'", {OUT=>"\xCE"}], + + ); + + +# Append a newline to end of each expected 'OUT' string. +my $t; +foreach $t (@Tests) + { + my $arg1 = $t->[1]; + my $e; + foreach $e (@$t) + { + $e->{OUT} .= "\n" + if ref $e eq 'HASH' and exists $e->{OUT}; + } + } + + +# Force multibyte locale in all tests. +# +# NOTE about the ERR_SUBST: +# The error tests above (e1/e2/e3/e4) expect error messages in C locale +# having single-quote character (ASCII 0x27). +# In UTF-8 locale, the error messages will use: +# 'LEFT SINGLE QUOTATION MARK' (U+2018) (UTF8: 0xE2 0x80 0x98) +# 'RIGHT SINGLE QUOTATION MARK' (U+2019) (UTF8: 0xE2 0x80 0x99) +# So we replace them with ascii single-quote and the results will +# match the expected error string. +if ($locale ne 'C') + { + my @new; + foreach my $t (@Tests) + { + my ($tname) = @$t; + if ($tname =~ /^mb/) + { + push @$t, ({ENV => "LC_ALL=$locale"}, + {ERR_SUBST => "s/\xe2\x80[\x98\x99]/'/g"}); + } + } + } + + +my $save_temps = $ENV{DEBUG}; +my $verbose = $ENV{VERBOSE}; + +my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); +exit $fail; diff --git a/tests/misc/expr.pl b/tests/misc/expr.pl index 06d511ebe2..c0d6f7c242 100755 --- a/tests/misc/expr.pl +++ b/tests/misc/expr.pl @@ -24,6 +24,10 @@ my $prog = 'expr'; # Turn off localization of executable's output. @ENV{qw(LANGUAGE LANG LC_ALL)} = ('C') x 3; +my $mb_locale = $ENV{LOCALE_FR_UTF8}; +! defined $mb_locale || $mb_locale eq 'none' + and $mb_locale = 'C'; + my $big = '98782897298723498732987928734'; my $big_p1 = '98782897298723498732987928735'; my $big_sum = '197565794597446997465975857469'; @@ -191,6 +195,22 @@ foreach $t (@Tests) } } +if ($mb_locale ne 'C') + { + # Duplicate each test vector, appending "-mb" to the test name and + # inserting {ENV => "LC_ALL=$mb_locale"} in the copy, so that we + # provide coverage for the distro-added multi-byte code paths. + my @new; + foreach my $t (@Tests) + { + my @new_t = @$t; + my $test_name = shift @new_t; + + push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; + } + push @Tests, @new; + } + my $save_temps = $ENV{SAVE_TEMPS}; my $verbose = $ENV{VERBOSE}; -- 2.47.2