Add tuklib_mbstr_nonprint to mask non-printable characters

author Lasse Collin <lasse.collin@tukaani.org>

Wed, 18 Dec 2024 12:00:09 +0000 (14:00 +0200)

committer Lasse Collin <lasse.collin@tukaani.org>

Wed, 18 Dec 2024 15:09:32 +0000 (17:09 +0200)
author Lasse Collin <lasse.collin@tukaani.org>
Wed, 18 Dec 2024 12:00:09 +0000 (14:00 +0200)
committer Lasse Collin <lasse.collin@tukaani.org>
Wed, 18 Dec 2024 15:09:32 +0000 (17:09 +0200)
diff --git a/src/Makefile.am b/src/Makefile.am

index cb5aed40505083ec303f674c6c8c9f21c33df20f..15eee834a1d71cf66eb65701900e95b459e46418 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -29,6 +29,8 @@ EXTRA_DIST = \
         common/tuklib_integer.h \
         common/tuklib_mbstr.h \
         common/tuklib_mbstr_fw.c \
+       common/tuklib_mbstr_nonprint.c \
+       common/tuklib_mbstr_nonprint.h \
         common/tuklib_mbstr_width.c \
         common/tuklib_mbstr_wrap.c \
         common/tuklib_mbstr_wrap.h \
diff --git a/src/common/tuklib_mbstr_nonprint.c b/src/common/tuklib_mbstr_nonprint.c

new file mode 100644 (file)

index 0000000..cac10bf
--- /dev/null
+++ b/src/common/tuklib_mbstr_nonprint.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_mbstr_nonprint.c
+/// \brief      Find and replace non-printable characters with question marks
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_mbstr_nonprint.h"
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_MBRTOWC
+#      include <wchar.h>
+#      include <wctype.h>
+#else
+#      include <ctype.h>
+#endif
+
+
+static bool
+is_next_printable(const char *str, size_t len, size_t *next_len)
+{
+#ifdef HAVE_MBRTOWC
+       // This assumes that character sets with locking shift states aren't
+       // used, and thus mbsinit() is never needed.
+       mbstate_t ps;
+       memset(&ps, 0, sizeof(ps));
+
+       wchar_t wc;
+       *next_len = mbrtowc(&wc, str, len, &ps);
+
+       if (*next_len == (size_t)-2) {
+               // Incomplete multibyte sequence: Treat the whole sequence
+               // as a single non-printable multibyte character that ends
+               // the string.
+               *next_len = len;
+               return false;
+       }
+
+       // Check more broadly than just ret == (size_t)-1 to be safe
+       // in case mbrtowc() returns something weird. This check
+       // covers (size_t)-1 (that is, SIZE_MAX) too because len is from
+       // strlen() and the terminating '\0' isn't part of the length.
+       if (*next_len < 1 || *next_len > len) {
+               // Invalid multibyte sequence: Treat the first byte as
+               // a non-printable single-byte character. Decoding will
+               // be restarted from the next byte on the next call to
+               // this function.
+               *next_len = 1;
+               return false;
+       }
+
+#      if defined(_WIN32) && !defined(__CYGWIN__)
+       // On Windows, wchar_t stores UTF-16 code units, thus characters
+       // outside the Basic Multilingual Plane (BMP) don't fit into
+       // a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns
+       // successfully when the input is a non-BMP character but the
+       // output is the replacement character U+FFFD.
+       //
+       // iswprint() returns 0 for U+FFFD on Windows for some reason. Treat
+       // U+FFFD as printable and thus also all non-BMP chars as printable.
+       if (wc == 0xFFFD)
+               return true;
+#      endif
+
+       return iswprint((wint_t)wc) != 0;
+#else
+       (void)len;
+       *next_len = 1;
+       return isprint((unsigned char)str[0]) != 0;
+#endif
+}
+
+
+static bool
+has_nonprint(const char *str, size_t len)
+{
+       for (size_t i = 0; i < len; ) {
+               size_t next_len;
+               if (!is_next_printable(str + i, len - i, &next_len))
+                       return true;
+
+               i += next_len;
+       }
+
+       return false;
+}
+
+
+extern bool
+tuklib_has_nonprint(const char *str)
+{
+       return has_nonprint(str, strlen(str));
+}
+
+
+extern const char *
+tuklib_mask_nonprint_r(const char *str, char **mem)
+{
+       // Free the old string, if any.
+       free(*mem);
+       *mem = NULL;
+
+       // If the whole input string contains only printable characters,
+       // return the input string.
+       const size_t len = strlen(str);
+       if (!has_nonprint(str, len))
+               return str;
+
+       // Allocate memory for the masked string. Since we use the single-byte
+       // character '?' to mask non-printable characters, it's possible that
+       // a few bytes less memory would be needed in reality if multibyte
+       // characters are masked.
+       //
+       // If allocation fails, return "???" because it should be safer than
+       // returning the unmasked string.
+       *mem = malloc(len + 1);
+       if (*mem == NULL)
+               return "???";
+
+       // Replace all non-printable characters with '?'.
+       char *dest = *mem;
+
+       for (size_t i = 0; i < len; ) {
+               size_t next_len;
+               if (is_next_printable(str + i, len - i, &next_len)) {
+                       memcpy(dest, str + i, next_len);
+                       dest += next_len;
+               } else {
+                       *dest++ = '?';
+               }
+
+               i += next_len;
+       }
+
+       *dest = '\0';
+
+       return *mem;
+}
+
+
+extern const char *
+tuklib_mask_nonprint(const char *str)
+{
+       static char *mem = NULL;
+       return tuklib_mask_nonprint_r(str, &mem);
+}
diff --git a/src/common/tuklib_mbstr_nonprint.h b/src/common/tuklib_mbstr_nonprint.h

new file mode 100644 (file)

index 0000000..7c2bef1
--- /dev/null
+++ b/src/common/tuklib_mbstr_nonprint.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_mbstr_nonprint.h
+/// \brief      Find and replace non-printable characters with question marks
+///
+/// If mbrtowc(3) is available, it and iswprint(3) is used to check if all
+/// characters are printable. Otherwise single-byte character set is assumed
+/// and isprint(3) is used.
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_MBSTR_NONPRINT_H
+#define TUKLIB_MBSTR_NONPRINT_H
+
+#include "tuklib_common.h"
+TUKLIB_DECLS_BEGIN
+
+#define tuklib_has_nonprint TUKLIB_SYMBOL(tuklib_has_nonprint)
+extern bool tuklib_has_nonprint(const char *str);
+///<
+/// \brief      Check if a string contains any non-printable characters
+///
+/// \return     false if str contains only valid multibyte characters and
+///             iswprint(3) returns non-zero for all of them; true otherwise
+///
+/// \note       In case mbrtowc(3) isn't available, single-byte character set
+///             is assumed and isprint(3) is used instead of iswprint(3).
+
+#define tuklib_mask_nonprint_r TUKLIB_SYMBOL(tuklib_mask_nonprint_r)
+extern const char *tuklib_mask_nonprint_r(const char *str, char **mem);
+///<
+/// \brief      Replace non-printable characters with question marks
+///
+/// \param      str     Untrusted string, for example, a filename
+/// \param      mem     This function always calls free(*mem) to free the old
+///                     allocation and then sets *mem = NULL. Before the first
+///                     call, *mem should be initialized to NULL. If this
+///                     function needs to allocate memory for a modified
+///                     string, a pointer to the allocated memory will be
+///                     stored to *mem. Otherwise *mem will remain NULL.
+///
+/// \return     If tuklib_has_nonprint(str) returns false, this function
+///             returns str. Otherwise memory is allocated to hold a modified
+///             string and a pointer to that is returned. The pointer to the
+///             allocated memory is also stored to *mem. A modified string
+///             has the problematic characters replaced by '?'. If memory
+///             allocation fails, "???" is returned and *mem is NULL.
+
+#define tuklib_mask_nonprint TUKLIB_SYMBOL(tuklib_mask_nonprint)
+extern const char *tuklib_mask_nonprint(const char *str);
+///<
+/// \brief      Replace non-printable characters with question marks
+///
+/// This is a convenience function for single-threaded use. This calls
+/// tuklib_mask_nonprint_r() using an internal static variable to hold
+/// the possible allocation.
+///
+/// \param      str     Untrusted string, for example, a filename
+///
+/// \return     See tuklib_mask_nonprint_r().
+///
+/// \note       This function is not thread safe!
+
+TUKLIB_DECLS_END
+#endif
author	Lasse Collin <lasse.collin@tukaani.org>
	Wed, 18 Dec 2024 12:00:09 +0000 (14:00 +0200)
committer	Lasse Collin <lasse.collin@tukaani.org>
	Wed, 18 Dec 2024 15:09:32 +0000 (17:09 +0200)
src/Makefile.am		patch \| blob \| blame \| history
src/common/tuklib_mbstr_nonprint.c	[new file with mode: 0644]	patch \| blob
src/common/tuklib_mbstr_nonprint.h	[new file with mode: 0644]	patch \| blob