--- /dev/null
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file tuklib_mbstr_nonprint.c
+/// \brief Find and replace non-printable characters with question marks
+//
+// Author: Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_mbstr_nonprint.h"
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_MBRTOWC
+# include <wchar.h>
+# include <wctype.h>
+#else
+# include <ctype.h>
+#endif
+
+
+static bool
+is_next_printable(const char *str, size_t len, size_t *next_len)
+{
+#ifdef HAVE_MBRTOWC
+ // This assumes that character sets with locking shift states aren't
+ // used, and thus mbsinit() is never needed.
+ mbstate_t ps;
+ memset(&ps, 0, sizeof(ps));
+
+ wchar_t wc;
+ *next_len = mbrtowc(&wc, str, len, &ps);
+
+ if (*next_len == (size_t)-2) {
+ // Incomplete multibyte sequence: Treat the whole sequence
+ // as a single non-printable multibyte character that ends
+ // the string.
+ *next_len = len;
+ return false;
+ }
+
+ // Check more broadly than just ret == (size_t)-1 to be safe
+ // in case mbrtowc() returns something weird. This check
+ // covers (size_t)-1 (that is, SIZE_MAX) too because len is from
+ // strlen() and the terminating '\0' isn't part of the length.
+ if (*next_len < 1 || *next_len > len) {
+ // Invalid multibyte sequence: Treat the first byte as
+ // a non-printable single-byte character. Decoding will
+ // be restarted from the next byte on the next call to
+ // this function.
+ *next_len = 1;
+ return false;
+ }
+
+# if defined(_WIN32) && !defined(__CYGWIN__)
+ // On Windows, wchar_t stores UTF-16 code units, thus characters
+ // outside the Basic Multilingual Plane (BMP) don't fit into
+ // a single wchar_t. In an UTF-8 locale, UCRT's mbrtowc() returns
+ // successfully when the input is a non-BMP character but the
+ // output is the replacement character U+FFFD.
+ //
+ // iswprint() returns 0 for U+FFFD on Windows for some reason. Treat
+ // U+FFFD as printable and thus also all non-BMP chars as printable.
+ if (wc == 0xFFFD)
+ return true;
+# endif
+
+ return iswprint((wint_t)wc) != 0;
+#else
+ (void)len;
+ *next_len = 1;
+ return isprint((unsigned char)str[0]) != 0;
+#endif
+}
+
+
+static bool
+has_nonprint(const char *str, size_t len)
+{
+ for (size_t i = 0; i < len; ) {
+ size_t next_len;
+ if (!is_next_printable(str + i, len - i, &next_len))
+ return true;
+
+ i += next_len;
+ }
+
+ return false;
+}
+
+
+extern bool
+tuklib_has_nonprint(const char *str)
+{
+ return has_nonprint(str, strlen(str));
+}
+
+
+extern const char *
+tuklib_mask_nonprint_r(const char *str, char **mem)
+{
+ // Free the old string, if any.
+ free(*mem);
+ *mem = NULL;
+
+ // If the whole input string contains only printable characters,
+ // return the input string.
+ const size_t len = strlen(str);
+ if (!has_nonprint(str, len))
+ return str;
+
+ // Allocate memory for the masked string. Since we use the single-byte
+ // character '?' to mask non-printable characters, it's possible that
+ // a few bytes less memory would be needed in reality if multibyte
+ // characters are masked.
+ //
+ // If allocation fails, return "???" because it should be safer than
+ // returning the unmasked string.
+ *mem = malloc(len + 1);
+ if (*mem == NULL)
+ return "???";
+
+ // Replace all non-printable characters with '?'.
+ char *dest = *mem;
+
+ for (size_t i = 0; i < len; ) {
+ size_t next_len;
+ if (is_next_printable(str + i, len - i, &next_len)) {
+ memcpy(dest, str + i, next_len);
+ dest += next_len;
+ } else {
+ *dest++ = '?';
+ }
+
+ i += next_len;
+ }
+
+ *dest = '\0';
+
+ return *mem;
+}
+
+
+extern const char *
+tuklib_mask_nonprint(const char *str)
+{
+ static char *mem = NULL;
+ return tuklib_mask_nonprint_r(str, &mem);
+}
--- /dev/null
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file tuklib_mbstr_nonprint.h
+/// \brief Find and replace non-printable characters with question marks
+///
+/// If mbrtowc(3) is available, it and iswprint(3) is used to check if all
+/// characters are printable. Otherwise single-byte character set is assumed
+/// and isprint(3) is used.
+//
+// Author: Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_MBSTR_NONPRINT_H
+#define TUKLIB_MBSTR_NONPRINT_H
+
+#include "tuklib_common.h"
+TUKLIB_DECLS_BEGIN
+
+#define tuklib_has_nonprint TUKLIB_SYMBOL(tuklib_has_nonprint)
+extern bool tuklib_has_nonprint(const char *str);
+///<
+/// \brief Check if a string contains any non-printable characters
+///
+/// \return false if str contains only valid multibyte characters and
+/// iswprint(3) returns non-zero for all of them; true otherwise
+///
+/// \note In case mbrtowc(3) isn't available, single-byte character set
+/// is assumed and isprint(3) is used instead of iswprint(3).
+
+#define tuklib_mask_nonprint_r TUKLIB_SYMBOL(tuklib_mask_nonprint_r)
+extern const char *tuklib_mask_nonprint_r(const char *str, char **mem);
+///<
+/// \brief Replace non-printable characters with question marks
+///
+/// \param str Untrusted string, for example, a filename
+/// \param mem This function always calls free(*mem) to free the old
+/// allocation and then sets *mem = NULL. Before the first
+/// call, *mem should be initialized to NULL. If this
+/// function needs to allocate memory for a modified
+/// string, a pointer to the allocated memory will be
+/// stored to *mem. Otherwise *mem will remain NULL.
+///
+/// \return If tuklib_has_nonprint(str) returns false, this function
+/// returns str. Otherwise memory is allocated to hold a modified
+/// string and a pointer to that is returned. The pointer to the
+/// allocated memory is also stored to *mem. A modified string
+/// has the problematic characters replaced by '?'. If memory
+/// allocation fails, "???" is returned and *mem is NULL.
+
+#define tuklib_mask_nonprint TUKLIB_SYMBOL(tuklib_mask_nonprint)
+extern const char *tuklib_mask_nonprint(const char *str);
+///<
+/// \brief Replace non-printable characters with question marks
+///
+/// This is a convenience function for single-threaded use. This calls
+/// tuklib_mask_nonprint_r() using an internal static variable to hold
+/// the possible allocation.
+///
+/// \param str Untrusted string, for example, a filename
+///
+/// \return See tuklib_mask_nonprint_r().
+///
+/// \note This function is not thread safe!
+
+TUKLIB_DECLS_END
+#endif