git.ipfire.org Git - thirdparty/tar.git/commitdiff
Support multi-byte --transform='...\L...' etc
author Paul Eggert <eggert@cs.ucla.edu>
Wed, 13 Sep 2023 04:21:18 +0000 (23:21 -0500)
committer Paul Eggert <eggert@cs.ucla.edu>
Wed, 13 Sep 2023 04:23:41 +0000 (23:23 -0500)
Support upcasing and downcasing in multi-byte locales.
* gnulib.modules: Add c32rtomb, c32tolower, c32toupper,
mbrtoc32-regular.
* src/transform.c: Do not include ctype.h.  Include mcel.h.
(stk, stk_init): Move up.
(run_case_conv): Return void, not char *.  Append result to
stk directly; this avoids the need for a separate allocation.
All callers changed.  Do not assume a single-byte locale.
* tests/xform04.at: New test.
* tests/Makefile.am (TESTSUITE_AT):
* tests/testsuite.at: Add it.

NEWS
gnulib.modules
src/transform.c
tests/Makefile.am
tests/testsuite.at
tests/xform04.at [new file with mode: 0644]

diff --git a/NEWS b/NEWS
index 4b8094e59f599777b7a043878e0b2728fb6282bc..e58bc3d029e20f076a980c6431d7d42dd1b667f3 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU tar NEWS - User visible changes. 2023-09-10
+GNU tar NEWS - User visible changes. 2023-09-12
 Please send GNU tar bug reports to <bug-tar@gnu.org>
 \f
 version TBD
@@ -33,6 +33,9 @@ used, command output will be parsed using strptime(3).
 
 ** When diagnosing invalid extended headers tar now quotes control characters.
 
+** Transformations that change case (e.g., --transform='s/.*/\L&/')
+   now work correctly with multi-byte characters.
+
 \f
 version 1.35 - Sergey Poznyakoff, 2023-07-18
 
diff --git a/gnulib.modules b/gnulib.modules
index bd703a8a840173e3cf41f3a122d583f95d47c87f..f941bf5d60cf727b8aa2622fbc440e51b085552e 100644 (file)
--- a/gnulib.modules
+++ b/gnulib.modules
@@ -25,6 +25,9 @@ argp-version-etc
 attribute
 backupfile
 c-ctype
+c32rtomb
+c32tolower
+c32toupper
 closeout
 configmake
 dirname
@@ -64,6 +67,7 @@ lchown
 linkat
 localcharset
 manywarnings
+mbrtoc32-regular
 mcel-prefer
 mkdirat
 mkdtemp
diff --git a/src/transform.c b/src/transform.c
index 65dba79126cf70ad61b0ee719cff24de48de14dd..857faf863f7e0a9417337b0c58094d75b3716dbb 100644 (file)
--- a/src/transform.c
+++ b/src/transform.c
@@ -15,8 +15,8 @@
    with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #include <system.h>
-#include <ctype.h>
 #include <regex.h>
+#include <mcel.h>
 #include "common.h"
 
 enum transform_type
@@ -417,51 +417,44 @@ set_transform_expr (const char *expr)
     expr = parse_transform_expr (expr);
 }
 
+\f
+static struct obstack stk;
+static bool stk_init;
+
 /* Run case conversion specified by CASE_CTL on array PTR of SIZE
-   characters. Returns pointer to statically allocated storage. */
-static char *
+   characters.  Append the result to STK.  */
+static void
 run_case_conv (enum case_ctl_type case_ctl, char *ptr, size_t size)
 {
-  static char *case_ctl_buffer;
-  static size_t case_ctl_bufsize;
-  char *p;
-
-  if (case_ctl_bufsize < size)
+  char const *p = ptr, *plim = ptr + size;
+  mbstate_t mbs; mbszero (&mbs);
+  while (p < plim)
     {
-      case_ctl_bufsize = size;
-      case_ctl_buffer = xrealloc (case_ctl_buffer, case_ctl_bufsize);
+      mcel_t g = mcel_scan (p, plim);
+      char32_t ch;
+      switch (case_ctl)
+       {
+       case ctl_upcase: case ctl_upcase_next: ch = c32toupper (g.ch); break;
+       case ctl_locase: case ctl_locase_next: ch = c32tolower (g.ch); break;
+       default: ch = g.ch; break;
+       }
+      if (ch == g.ch)
+       obstack_grow (&stk, p, g.len);
+      else
+       {
+         obstack_make_room (&stk, MB_LEN_MAX);
+         mbstate_t ombs; mbszero (&ombs);
+         size_t outbytes = c32rtomb (obstack_next_free (&stk), ch, &ombs);
+         obstack_blank_fast (&stk, outbytes);
+       }
+      p += g.len;
+      if (case_ctl != ctl_upcase && case_ctl != ctl_locase)
+       break;
     }
-  memcpy (case_ctl_buffer, ptr, size);
-  switch (case_ctl)
-    {
-    case ctl_upcase_next:
-      case_ctl_buffer[0] = toupper ((unsigned char) case_ctl_buffer[0]);
-      break;
-
-    case ctl_locase_next:
-      case_ctl_buffer[0] = tolower ((unsigned char) case_ctl_buffer[0]);
-      break;
 
-    case ctl_upcase:
-      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
-       *p = toupper ((unsigned char) *p);
-      break;
-
-    case ctl_locase:
-      for (p = case_ctl_buffer; p < case_ctl_buffer + size; p++)
-       *p = tolower ((unsigned char) *p);
-      break;
-
-    case ctl_stop:
-      break;
-    }
-  return case_ctl_buffer;
+  obstack_grow (&stk, p, plim - p);
 }
 
-\f
-static struct obstack stk;
-static bool stk_init;
-
 static void
 _single_transform_name_to_obstack (struct transform *tf, char *input)
 {
@@ -484,7 +477,6 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
   while (*input)
     {
       size_t disp;
-      char *ptr;
 
       rc = regexec (&tf->regex, input, tf->regex.re_nsub + 1, rmp, 0);
 
@@ -510,16 +502,10 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
              switch (segm->type)
                {
                case segm_literal:    /* Literal segment */
-                 if (case_ctl == ctl_stop)
-                   ptr = segm->v.literal.ptr;
-                 else
-                   {
-                     ptr = run_case_conv (case_ctl,
-                                          segm->v.literal.ptr,
-                                          segm->v.literal.size);
-                     CASE_CTL_RESET();
-                   }
-                 obstack_grow (&stk, ptr, segm->v.literal.size);
+                 run_case_conv (case_ctl,
+                                segm->v.literal.ptr,
+                                segm->v.literal.size);
+                 CASE_CTL_RESET ();
                  break;
 
                case segm_backref:    /* Back-reference segment */
@@ -528,14 +514,9 @@ _single_transform_name_to_obstack (struct transform *tf, char *input)
                    {
                      size_t size = rmp[segm->v.ref].rm_eo
                                      - rmp[segm->v.ref].rm_so;
-                     ptr = input + rmp[segm->v.ref].rm_so;
-                     if (case_ctl != ctl_stop)
-                       {
-                         ptr = run_case_conv (case_ctl, ptr, size);
-                         CASE_CTL_RESET();
-                       }
-
-                     obstack_grow (&stk, ptr, size);
+                     run_case_conv (case_ctl,
+                                    input + rmp[segm->v.ref].rm_so, size);
+                     CASE_CTL_RESET ();
                    }
                  break;
 
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 57c9696fd30a817ccdb0e0a07cc4050d4026517a..20a162afcad9afcbd912355152e4179daa4c3690 100644 (file)
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -287,7 +287,8 @@ TESTSUITE_AT = \
  xform-h.at\
  xform01.at\
  xform02.at\
- xform03.at
+ xform03.at\
+ xform04.at
 
 distclean-local:
        -rm -rf download
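diff --git a/tests/testsuite.at b/tests/testsuite.at
index 7cfa636fb362f41657ac0175c6d2ebcd0ce9b1b1..e738b1ce4dbd73b148efb913b5169a23cd0e05d4 100644 (file)
--- a/tests/testsuite.at
+++ b/tests/testsuite.at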
@@ -293,6 +293,7 @@ m4_include([xform-h.at])
 m4_include([xform01.at])
 m4_include([xform02.at])
 m4_include([xform03.at])
+m4_include([xform04.at])
 
 AT_BANNER([Exclude])
 m4_include([exclude.at])
diff --git a/tests/xform04.at b/tests/xform04.at
new file mode 100644 (file)
index 0000000..4c6e032
--- /dev/null
@@ -0,0 +1,48 @@
+# Process this file with autom4te to create testsuite. -*- Autotest -*-
+
+# Test suite for GNU tar.
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This file is part of GNU tar.
+
+# GNU tar is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# GNU tar is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Transformations can change the number of bytes when downcasing.
+
+AT_SETUP([transformations and multi-byte downcasing])
+AT_KEYWORDS([transform xform xform04])
+
+AT_TAR_CHECK([
+if test "`(locale charmap) 2>/dev/null`" != UTF-8; then
+  for locale in en_US.UTF-8 `(locale -a) 2>/dev/null` not-found; do
+    case $locale in
+      *.[[Uu][Tt][Ff]]*8)
+       if test "`(LC_ALL=$locale locale charmap) 2>/dev/null`" = UTF-8; then
+         LC_ALL=$locale
+         export LC_ALL
+         break
+       fi;;
+      not-found)
+       AT_SKIP_TEST;;
+    esac
+  done
+fi
+
+genfile --file Aa.Ⱥⱥ
+tar -cvf /dev/null --transform='s/.*/\L&-\U&/' --show-transformed-name Aa.Ⱥⱥ],
+[0],
+[aa.ⱥⱥ-AA.ȺȺ
+])
+
+AT_CLEANUP