From: Paul Eggert Date: Wed, 13 Sep 2023 04:21:18 +0000 (-0500) Subject: Support multi-byte --transform='...\L...' etc X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c1e277476c4bf139cac12f26f96d6e74115a0829;p=thirdparty%2Ftar.git Support multi-byte --transform='...\L...' etc Support upcasing and downcasing in multi-byte locales. * gnulib.modules: Add c32rtomb, c32tolower, c32toupper, mbrtoc32-regular. * src/transform.c: Do not include ctype.h. Include mcel.h. (stk, stk_init): Move up. (run_case_conv): Return void, not char *. Append result to stk directly; this avoids the need for a separate allocation. All callers changed. Do not assume a single-byte locale. * tests/xform04.at: New test. * tests/Makefile.am (TESTSUITE_AT): * tests/testsuite.at: Add it. --- diff --git a/NEWS b/NEWS index 4b8094e5..e58bc3d0 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,4 @@ -GNU tar NEWS - User visible changes. 2023-09-10 +GNU tar NEWS - User visible changes. 2023-09-12 Please send GNU tar bug reports to <bug-tar@gnu.org> version TBD @@ -33,6 +33,9 @@ used, command output will be parsed using strptime(3). ** When diagnosing invalid extended headers tar now quotes control characters. +** Transformations that change case (e.g., --transform='s/.*/\L&/') + now work correctly with multi-byte characters. + version 1.35 - Sergey Poznyakoff, 2023-07-18 diff --git a/gnulib.modules b/gnulib.modules index bd703a8a..f941bf5d 100644 --- a/gnulib.modules +++ b/gnulib.modules @@ -25,6 +25,9 @@ argp-version-etc attribute backupfile c-ctype +c32rtomb +c32tolower +c32toupper closeout configmake dirname @@ -64,6 +67,7 @@ lchown linkat localcharset manywarnings +mbrtoc32-regular mcel-prefer mkdirat mkdtemp diff --git a/src/transform.c b/src/transform.c index 65dba791..857faf86 100644 --- a/src/transform.c +++ b/src/transform.c @@ -15,8 +15,8 @@ with this program. If not, see <http://www.gnu.org/licenses/>.
*/ #include <system.h> -#include <ctype.h> #include <regex.h> +#include <mcel.h> #include "common.h" enum transform_type @@ -417,51 +417,44 @@ set_transform_expr (const char *expr) expr = parse_transform_expr (expr); } + +static struct obstack stk; +static bool stk_init; + /* Run case conversion specified by CASE_CTL on array PTR of SIZE - characters. Returns pointer to statically allocated storage. */ -static char * + characters. Append the result to STK. */ +static void run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size) { - static char *case_ctl_buffer; - static size_t case_ctl_bufsize; - char *p; - - if (case_ctl_bufsize < size) + char const *p = ptr, *plim = ptr + size; + mbstate_t mbs; mbszero (&mbs); + while (p < plim) { - case_ctl_bufsize = size; - case_ctl_buffer = xrealloc (case_ctl_buffer, case_ctl_bufsize); + mcel_t g = mcel_scan (p, plim); + char32_t ch; + switch (case_ctl) + { + case ctl_upcase: case ctl_upcase_next: ch = c32toupper (g.ch); break; + case ctl_locase: case ctl_locase_next: ch = c32tolower (g.ch); break; + default: ch = g.ch; break; + } + if (ch == g.ch) + obstack_grow (&stk, p, g.len); + else + { + obstack_make_room (&stk, MB_LEN_MAX); + mbstate_t ombs; mbszero (&ombs); + size_t outbytes = c32rtomb (obstack_next_free (&stk), ch, &ombs); + obstack_blank_fast (&stk, outbytes); + } + p += g.len; + if (case_ctl != ctl_upcase && case_ctl != ctl_locase) + break; } - memcpy (case_ctl_buffer, ptr, size); - switch (case_ctl) - { - case ctl_upcase_next: - case_ctl_buffer[0] = toupper ((unsigned char) case_ctl_buffer[0]); - break; - - case ctl_locase_next: - case_ctl_buffer[0] = tolower ((unsigned char) case_ctl_buffer[0]); - break; - case ctl_upcase: - for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++) - *p = toupper ((unsigned char) *p); - break; - - case ctl_locase: - for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++) - *p = tolower ((unsigned char) *p); - break; - - case ctl_stop: - break; - } - return case_ctl_buffer; + obstack_grow (&stk, p, plim -
p); } - -static struct obstack stk; -static bool stk_init; - static void _single_transform_name_to_obstack (struct transform *tf, char *input) { @@ -484,7 +477,6 @@ _single_transform_name_to_obstack (struct transform *tf, char *input) while (*input) { size_t disp; - char *ptr; rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0); @@ -510,16 +502,10 @@ _single_transform_name_to_obstack (struct transform *tf, char *input) switch (segm->type) { case segm_literal: /* Literal segment */ - if (case_ctl == ctl_stop) - ptr = segm->v.literal.ptr; - else - { - ptr = run_case_conv (case_ctl, - segm->v.literal.ptr, - segm->v.literal.size); - CASE_CTL_RESET(); - } - obstack_grow (&stk, ptr, segm->v.literal.size); + run_case_conv (case_ctl, + segm->v.literal.ptr, + segm->v.literal.size); + CASE_CTL_RESET (); break; case segm_backref: /* Back-reference segment */ @@ -528,14 +514,9 @@ _single_transform_name_to_obstack (struct transform *tf, char *input) { size_t size = rmp[segm->v.ref].rm_eo - rmp[segm->v.ref].rm_so; - ptr = input + rmp[segm->v.ref].rm_so; - if (case_ctl != ctl_stop) - { - ptr = run_case_conv (case_ctl, ptr, size); - CASE_CTL_RESET(); - } - - obstack_grow (&stk, ptr, size); + run_case_conv (case_ctl, + input + rmp[segm->v.ref].rm_so, size); + CASE_CTL_RESET (); } break; diff --git a/tests/Makefile.am b/tests/Makefile.am index 57c9696f..20a162af 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -287,7 +287,8 @@ TESTSUITE_AT = \ xform-h.at\ xform01.at\ xform02.at\ - xform03.at + xform03.at\ + xform04.at distclean-local: -rm -rf download diff --git a/tests/testsuite.at b/tests/testsuite.at index 7cfa636f..e738b1ce 100644 --- a/tests/testsuite.at +++ b/tests/testsuite.at @@ -293,6 +293,7 @@ m4_include([xform-h.at]) m4_include([xform01.at]) m4_include([xform02.at]) m4_include([xform03.at]) +m4_include([xform04.at]) AT_BANNER([Exclude]) m4_include([exclude.at]) diff --git a/tests/xform04.at b/tests/xform04.at new file mode 100644 index 
00000000..4c6e0327 --- /dev/null +++ b/tests/xform04.at @@ -0,0 +1,48 @@ +# Process this file with autom4te to create testsuite. -*- Autotest -*- + +# Test suite for GNU tar. +# Copyright 2023 Free Software Foundation, Inc. + +# This file is part of GNU tar. + +# GNU tar is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. + +# GNU tar is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# Transformations can change the number of bytes when downcasing. + +AT_SETUP([transformations and multi-byte downcasing]) +AT_KEYWORDS([transform xform xform04]) + +AT_TAR_CHECK([ +if test "`(locale charmap) 2>/dev/null`" != UTF-8; then + for locale in en_US.UTF-8 `(locale -a) 2>/dev/null` not-found; do + case $locale in + *.[[Uu][Tt][Ff]]*8) + if test "`(LC_ALL=$locale locale charmap) 2>/dev/null`" = UTF-8; then + LC_ALL=$locale + export LC_ALL + break + fi;; + not-found) + AT_SKIP_TEST;; + esac + done +fi + +genfile --file Aa.Ⱥⱥ +tar -cvf /dev/null --transform='s/.*/\L&-\U&/' --show-transformed-name Aa.Ⱥⱥ], +[0], +[aa.ⱥⱥ-AA.ȺȺ +]) + +AT_CLEANUP