Add tuklib_mbstr_wrap for automatic word wrapping

author Lasse Collin <lasse.collin@tukaani.org>

Mon, 16 Dec 2024 16:43:52 +0000 (18:43 +0200)

committer Lasse Collin <lasse.collin@tukaani.org>

Wed, 18 Dec 2024 15:09:31 +0000 (17:09 +0200)
author Lasse Collin <lasse.collin@tukaani.org>
Mon, 16 Dec 2024 16:43:52 +0000 (18:43 +0200)
committer Lasse Collin <lasse.collin@tukaani.org>
Wed, 18 Dec 2024 15:09:31 +0000 (17:09 +0200)
diff --git a/cmake/tuklib_mbstr.cmake b/cmake/tuklib_mbstr.cmake

index 71e16cc5a3e02ca4cdf4f5bffb0f8e7bad8a9266..bd234cbc96bbfee2f3b089fba33657a39a6762e5 100644 (file)
--- a/cmake/tuklib_mbstr.cmake
+++ b/cmake/tuklib_mbstr.cmake
@@ -18,4 +18,8 @@ function(tuklib_mbstr TARGET_OR_ALL)
      # NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux.
      check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH)
      tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH)
+
+    # NOTE: vasprintf() requires _GNU_SOURCE on GNU/Linux.
+    check_symbol_exists(vasprintf stdio.h HAVE_VASPRINTF)
+    tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_VASPRINTF)
  endfunction()
diff --git a/m4/tuklib_mbstr.m4 b/m4/tuklib_mbstr.m4

index 01398347c7545f114f5976f5cfec584774506984..98b1d7a3d9c50aaa25f968fda72314de6f699a67 100644 (file)
--- a/m4/tuklib_mbstr.m4
+++ b/m4/tuklib_mbstr.m4
@@ -27,5 +27,5 @@
  AC_DEFUN_ONCE([TUKLIB_MBSTR], [
  AC_REQUIRE([TUKLIB_COMMON])
  AC_FUNC_MBRTOWC
-AC_CHECK_FUNCS([wcwidth])
+AC_CHECK_FUNCS([wcwidth vasprintf])
  ])dnl
diff --git a/src/Makefile.am b/src/Makefile.am

index e1fcb5a92f4b7f4456ec7b4f40ab71afd2fd470a..cb5aed40505083ec303f674c6c8c9f21c33df20f 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -30,6 +30,8 @@ EXTRA_DIST = \
         common/tuklib_mbstr.h \
         common/tuklib_mbstr_fw.c \
         common/tuklib_mbstr_width.c \
+       common/tuklib_mbstr_wrap.c \
+       common/tuklib_mbstr_wrap.h \
         common/tuklib_open_stdxxx.c \
         common/tuklib_open_stdxxx.h \
         common/tuklib_physmem.c \
diff --git a/src/common/tuklib_gettext.h b/src/common/tuklib_gettext.h

index 4021c98fa9becf978c60190eae47067c19fe7040..2ee91cb20871cdd0ebb4787813473689c3209924 100644 (file)
--- a/src/common/tuklib_gettext.h
+++ b/src/common/tuklib_gettext.h
@@ -68,4 +68,15 @@
  #endif
  #define N_(msgid) msgid
  
+// Optional: Strings that are word wrapped using tuklib_mbstr_wrap may be
+// marked with W_("foo) in the source code. xgettext can then add a comment
+// to all such strings to inform translators. The following option needs to
+// be added to XGETTEXT_OPTIONS in po/Makevars or in an equivalent place:
+//
+// '--keyword=W_:1,"This is word wrapped at spaces. The Unicode character U+00A0 works as a non-breaking space. Tab (\t) is interpret as a zero-width space (the tab itself is not displayed); U+200B is NOT supported. Manual word wrapping with \n is supported but requires care."'
+//
+// NOTE: The double-quotes in the --keyword argument above must be passed to
+// xgettext as is, thus one needs the single-quotes in Makevars.
+#define W_(msgid) _(msgid)
+
  #endif
diff --git a/src/common/tuklib_mbstr_wrap.c b/src/common/tuklib_mbstr_wrap.c

new file mode 100644 (file)

index 0000000..4cc559d
--- /dev/null
+++ b/src/common/tuklib_mbstr_wrap.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_mbstr_wrap.c
+/// \brief      Word wraps a string and prints it to a FILE stream
+///
+/// This depends on tuklib_mbstr_width.c.
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_mbstr.h"
+#include "tuklib_mbstr_wrap.h"
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+
+extern int
+tuklib_wraps(FILE *outfile, const struct tuklib_wrap_opt *opt, const char *str)
+{
+       // left_cont may be less than left_margin. In that case, if the first
+       // word is extremely long, it will stay on the first line even if
+       // the line then gets overlong.
+       //
+       // On the other hand, left2_cont < left2_margin isn't allowed because
+       // it could result in inconsistent behavior when a very long word
+       // comes right after a \v.
+       //
+       // It is fine to have left2_margin < left_margin although it would be
+       // an odd use case.
+       if (!(opt->left_margin < opt->right_margin
+                       && opt->left_cont < opt->right_margin
+                       && opt->left2_margin <= opt->left2_cont
+                       && opt->left2_cont < opt->right_margin))
+               return TUKLIB_WRAP_ERR_OPT;
+
+       // This is set to TUKLIB_WRAP_WARN_OVERLONG if one or more
+       // output lines extend past opt->right_margin columns.
+       int warn_overlong = 0;
+
+       // Indentation of the first output line after \n or \r.
+       // \v sets this to opt->left2_margin.
+       // \r resets this back to the original value.
+       size_t first_indent = opt->left_margin;
+
+       // Indentation of the output lines that occur due to word wrapping.
+       // \v sets this to opt->left2_cont and \r back to the original value.
+       size_t cont_indent = opt->left_cont;
+
+       // If word wrapping occurs, the newline isn't printed unless more
+       // text would be put on the continuation line. This is also used
+       // when \v needs to start on a new line.
+       bool pending_newline = false;
+
+       // Spaces are printed only when there is something else to put
+       // after the spaces on the line. This avoids unwanted empty lines
+       // in the output and makes it possible to ignore possible spaces
+       // before a \v character.
+       size_t pending_spaces = first_indent;
+
+       // Current output column. When cur_col == pending_spaces, nothing
+       // has been actually printed to the current output line.
+       size_t cur_col = pending_spaces;
+
+       while (true) {
+               // Number of bytes until the *next* line-break opportunity.
+               size_t len = 0;
+
+               // Number of columns until the *next* line-break opportunity.
+               size_t width = 0;
+
+               // Text between a pair of \b characters is treated as
+               // an unbreakable block even if it contains spaces.
+               // It must not contain any control characters before
+               // the closing \b.
+               bool unbreakable = false;
+
+               while (true) {
+                       // Find the next character that we handle specially.
+                       // In an unbreakable block, search only for the
+                       // closing \b; if missing, the unbreakable block
+                       // extends to the end of the string.
+                       const size_t n = strcspn(str + len,
+                                       unbreakable ? "\b" : " \t\n\r\v\b");
+
+                       // Calculate how many columns the characters need.
+                       const size_t w = tuklib_mbstr_width_mem(str + len, n);
+                       if (w == (size_t)-1)
+                               return TUKLIB_WRAP_ERR_STR;
+
+                       width += w;
+                       len += n;
+
+                       // \b isn't a line-break opportunity so it has to
+                       // be handled here. For simplicity, empty blocks
+                       // are treated as zero-width characters.
+                       if (str[len] == '\b') {
+                               ++len;
+                               unbreakable = !unbreakable;
+                               continue;
+                       }
+
+                       break;
+               }
+
+               // Determine if adding this chunk of text would make the
+               // current output line exceed opt->right_margin columns.
+               const bool too_long = cur_col + width > opt->right_margin;
+
+               // Wrap the line if needed. However:
+               //
+               //   - Don't wrap if the current column is less than where
+               //     the continuation line would begin. In that case
+               //     the chunk wouldn't fit on the next line either so
+               //     we just have to produce an overlong line.
+               //
+               //   - Don't wrap if so far the line only contains spaces.
+               //     Wrapping in that case would leave a weird empty line.
+               //     NOTE: This "only contains spaces" condition is the
+               //     reason why left2_margin > left2_cont isn't allowed.
+               if (too_long && cur_col > cont_indent
+                               && cur_col > pending_spaces) {
+                       // There might be trailing spaces or zero-width spaces
+                       // which need to be ignored to keep the output pretty.
+                       //
+                       // Spaces need to be ignored because in some
+                       // writing styles there are two spaces after
+                       // a full stop. Example string:
+                       //
+                       //     "Foo bar.  Abc def."
+                       //              ^
+                       // If the first space after the first full stop
+                       // triggers word wrapping, both spaces must be
+                       // ignored. Otherwise the next line would be
+                       // indented too much.
+                       //
+                       // Zero-width spaces are ignored the same way
+                       // because they are meaningless if an adjacent
+                       // character is a space.
+                       while (*str == ' ' || *str == '\t')
+                               ++str;
+
+                       // Don't print the newline here; only mark it as
+                       // pending. This avoids an unwanted empty line if
+                       // there is a \n or \r or \0 after the spaces have
+                       // been ignored.
+                       pending_newline = true;
+                       pending_spaces = cont_indent;
+                       cur_col = pending_spaces;
+
+                       // Since str may have been incremented due to the
+                       // ignored spaces, the loop needs to be restarted.
+                       continue;
+               }
+
+               // Print the current chunk of text before the next
+               // line-break opportunity. If the chunk was empty,
+               // don't print anything so that the pending newline
+               // and pending spaces aren't printed on their own.
+               if (len > 0) {
+                       if (pending_newline) {
+                               pending_newline = false;
+                               if (putc('\n', outfile) == EOF)
+                                       return TUKLIB_WRAP_ERR_IO;
+                       }
+
+                       while (pending_spaces > 0) {
+                               if (putc(' ', outfile) == EOF)
+                                       return TUKLIB_WRAP_ERR_IO;
+
+                               --pending_spaces;
+                       }
+
+                       for (size_t i = 0; i < len; ++i) {
+                               // Ignore unbreakable block characters (\b).
+                               const int c = (unsigned char)str[i];
+                               if (c != '\b' && putc(c, outfile) == EOF)
+                                       return TUKLIB_WRAP_ERR_IO;
+                       }
+
+                       str += len;
+                       cur_col += width;
+
+                       // Remember if the line got overlong. If no other
+                       // errors occur, we return warn_overlong. It might
+                       // help in catching problematic strings.
+                       if (too_long)
+                               warn_overlong = TUKLIB_WRAP_WARN_OVERLONG;
+               }
+
+               // Handle the special character after the chunk of text.
+               switch (*str) {
+               case ' ':
+                       // Regular space.
+                       ++cur_col;
+                       ++pending_spaces;
+                       break;
+
+               case '\v':
+                       // Set the alternative indentation settings.
+                       first_indent = opt->left2_margin;
+                       cont_indent = opt->left2_cont;
+
+                       if (first_indent > cur_col) {
+                               // Add one or more spaces to reach
+                               // the column specified in first_indent.
+                               pending_spaces += first_indent - cur_col;
+                       } else {
+                               // There is no room to add even one space
+                               // before reaching the column first_indent.
+                               pending_newline = true;
+                               pending_spaces = first_indent;
+                       }
+
+                       cur_col = first_indent;
+                       break;
+
+               case '\0': // Implicit newline at the end of the string.
+               case '\r': // Newline that also resets the effect of \v.
+               case '\n': // Newline without resetting the indentation mode.
+                       if (putc('\n', outfile) == EOF)
+                               return TUKLIB_WRAP_ERR_IO;
+
+                       if (*str == '\0')
+                               return warn_overlong;
+
+                       if (*str == '\r') {
+                               first_indent = opt->left_margin;
+                               cont_indent = opt->left_cont;
+                       }
+
+                       pending_newline = false;
+                       pending_spaces = first_indent;
+                       cur_col = first_indent;
+                       break;
+               }
+
+               // Skip the specially-handled character.
+               ++str;
+       }
+}
+
+
+extern int
+tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
+               const char *fmt, ...)
+{
+       va_list ap;
+       char *buf;
+
+#ifdef HAVE_VASPRINTF
+       va_start(ap, fmt);
+       const int n = vasprintf(&buf, fmt, ap);
+       va_end(ap);
+       if (n == -1)
+               return TUKLIB_WRAP_ERR_FORMAT;
+#else
+       // Fixed buffer size is dumb but in practice one shouldn't need
+       // huge strings for *formatted* output. This simple method is safe
+       // with pre-C99 vsnprintf() implementations too which don't return
+       // the required buffer size (they return -1 or buf_size - 1) or
+       // which might not null-terminate the buffer in case it's too small.
+       const size_t buf_size = 128 * 1024;
+       buf = malloc(buf_size);
+       if (buf == NULL)
+               return TUKLIB_WRAP_ERR_FORMAT;
+
+       va_start(ap, fmt);
+       const int n = vsnprintf(buf, buf_size, fmt, ap);
+       va_end(ap);
+
+       if (n <= 0 || n >= (int)(buf_size - 1)) {
+               free(buf);
+               return TUKLIB_WRAP_ERR_FORMAT;
+       }
+#endif
+
+       const int ret = tuklib_wraps(stream, opt, buf);
+       free(buf);
+       return ret;
+}
diff --git a/src/common/tuklib_mbstr_wrap.h b/src/common/tuklib_mbstr_wrap.h

new file mode 100644 (file)

index 0000000..e20ffda
--- /dev/null
+++ b/src/common/tuklib_mbstr_wrap.h
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       tuklib_mbstr_wrap.h
+/// \brief      Word wrapping for multibyte strings
+///
+/// The word wrapping functions are intended to be usable, for example,
+/// for printing --help text in command line tools. While manually-wrapped
+/// --help text allows precise formatting, such freedom requires translators
+/// to count spaces and determine where line breaks should occur. It's
+/// tedious and error prone, and experience has shown that only some
+/// translators do it well. Automatic word wrapping is less flexible but
+/// results in polished-enough look with less effort from everyone.
+/// Right-to-left languages and languages that don't use spaces between
+/// words will still need extra effort though.
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_MBSTR_WRAP_H
+#define TUKLIB_MBSTR_WRAP_H
+
+#include "tuklib_common.h"
+#include <stdio.h>
+
+TUKLIB_DECLS_BEGIN
+
+/// One or more output lines exceeded right_margin.
+/// This only a warning; everything was still printed successfully.
+#define TUKLIB_WRAP_WARN_OVERLONG   0x01
+
+/// Error writing to to the output FILE. The error flag in the FILE
+/// should have been set as well.
+#define TUKLIB_WRAP_ERR_IO          0x02
+
+/// Invalid options in struct tuklib_wrap_opt.
+/// Nothing was printed.
+#define TUKLIB_WRAP_ERR_OPT         0x04
+
+/// Invalid or unsupported multibyte character in the input string:
+/// either mbrtowc() failed or wcwidth() returned a negative value.
+#define TUKLIB_WRAP_ERR_STR         0x08
+
+/// Only tuklib_wrapf(): Error in converting the format string.
+/// It's either a memory allocation failure or something bad with the
+/// format string or arguments.
+#define TUKLIB_WRAP_ERR_FORMAT      0x10
+
+/// Options for tuklib_wraps() and tuklib_wrapf()
+struct tuklib_wrap_opt {
+       /// Indentation of the first output line after `\n` or `\r`.
+       /// This can be anything less than right_margin.
+       unsigned short left_margin;
+
+       /// Column where word-wrapped continuation lines start.
+       /// This can be anything less than right_margin.
+       unsigned short left_cont;
+
+       /// Column where the text after `\v` will start, either on the current
+       /// line (when there is room to add at least one space) or on a new
+       /// empty line.
+       unsigned short left2_margin;
+
+       /// Like left_cont but for text after a `\v`. However, this must
+       /// be greater than or equal to left2_margin in addition to being
+       /// less than right_margin.
+       unsigned short left2_cont;
+
+       /// For 80-column terminals, it is recommended to use 79 here for
+       /// maximum portability. 80 will work most of the time but it will
+       /// result in unwanted empty lines in the rare case where a terminal
+       /// moves the cursor to the beginning of the next line immediately
+       /// when the last column has been used.
+       unsigned short right_margin;
+};
+
+#define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps)
+extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt,
+               const char *str);
+///<
+/// \brief      Word wrap a multibyte string and write it to a FILE
+///
+/// Word wrapping is done only at spaces and at the special control characters
+/// described below. Multiple consecutive spaces are handled properly: strings
+/// that have two (or more) spaces after a full sentence will look good even
+/// when the spaces occur at a word wrapping boundary. Trailing spaces are
+/// ignored at the end of a line or at the end of a string.
+///
+/// The following control characters have been repurposed:
+///
+///   - `\t` = Zero-width space allows a line break without producing any
+///            output by itself. This can be useful after hard hyphens as
+///            hyphens aren't otherwise used for line breaking. This can also
+///            be useful in languages that don't use spaces between words.
+///            (The Unicode character U+200B isn't supported.)
+///   - `\b` = Text between a pair of `\b` characters is treated as an
+///            unbreakable block (not wrapped even if there are spaces).
+///            For example, a non-breaking space can be done like
+///            in `"123\b \bMiB"`. Control characters (like `\n` or `\t`)
+///            aren't allowed before the closing `\b`. If closing `\b` is
+///            missing, the block extends to the end of the string. Empty
+///            blocks are treated as zero-width characters. If line breaks
+///            are possible around an empty block (like in `"foo \b\b bar"`
+///            or `"foo \b"`), it can result in weird output.
+///   - `\v` = Change to alternative indentation (left2_margin).
+///   - `\r` = Reset back to the initial indentation and add a newline.
+///            The next line will be indented by left_margin.
+///   - `\n` = Add a newline without resetting the effect of `\v`. The
+///            next line will be indented by left_margin or left2_margin
+///            (not left_cont or left2_cont).
+///
+/// Only `\n` should appear in translatable strings. `\t` works too but
+/// even that might confuse some translators even if there is a TRANSLATORS
+/// comment explaining its meaning.
+///
+/// To use the other control characters in messages, one should use
+/// tuklib_wrapf() with appropriate printf format string to combine
+/// translatable strings with non-translatable portions. For example:
+///
+/// \code{.c}
+/// static const struct tuklib_wrap_opt wrap2 = { 2,  2, 22, 22, 79 };
+/// int e = 0;
+/// ...
+/// e |= tuklib_wrapf(stdout, &wrap2,
+///                   "-h, --help\v%s\r"
+///                   "    --version\v%s",
+///                   W_("display this help and exit"),
+///                   W_("display version information and exit"));
+/// ...
+/// if (e != 0) {
+///     // Handle warning or error.
+///     ...
+/// }
+/// \endcode
+///
+/// Control characters other than `\n` and `\t` are unusable in
+/// translatable strings:
+///
+///   - Gettext tools show annoying warnings if C escape sequences other
+///     than `\n` or `\t` are seen. (Otherwise they still work perfectly
+///     fine though.)
+///
+///   - While at least Poedit and Lokalize support all escapes, some
+///     editors only support `\n` and `\t`.
+///
+///   - They could confuse some translators, resulting in broken
+///     translations.
+///
+/// Using non-control characters would solve some issues but it wouldn't
+/// help with the unfortunate real-world issue that some translators would
+/// likely have trouble understanding a new syntax. The Gettext manual
+/// specifically warns about this, see the subheading "No unusual markup"
+/// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width
+/// space is such custom markup, most translators will never need it.)
+///
+/// Translators can use the Unicode character U+00A0 (or U+202F) if they
+/// need a non-breaking space. For example, in French a non-breaking space
+/// may be needed before colons and question marks (U+00A0 is common in
+/// real-world French PO files).
+///
+/// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`)
+/// can work if one tells xgettext that input encoding is UTF-8, one
+/// ensures that the C compiler uses UTF-8 as the input charset, and one
+/// is certain that the program is *always* run under an UTF-8 locale.
+/// Unfortunately a portable program cannot make this kind of assumptions,
+/// which means that there is no pretty way to have a non-breaking space in
+/// a translatable string.
+///
+/// Optional: To tell translators which strings are automatically word
+/// wrapped, see the macro `W_` in tuklib_gettext.h.
+///
+/// \param      stream      Output FILE stream. For decent performance, it
+///                         should be in buffered mode because this function
+///                         writes the output one byte at a time with fputc().
+/// \param      opt         Word wrapping options.
+/// \param      str         Null-terminated multibyte string that is in
+///                         the encoding used by the current locale.
+///
+/// \return     Returns 0 on success. If an error or warning occurs, one of
+///             TUKLIB_WRAP_* codes is returned. Those codes are powers
+///             of two. When warning/error detection can be delayed, the
+///             return values can be accumulated from multiple calls using
+///             bitwise-or into a single variable which can be checked after
+///             all strings have (hopefully) been printed.
+
+#define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf)
+extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
+               const char *fmt, ...);
+///<
+/// \brief      Format and word-wrap a multibyte string and write it to a FILE
+///
+/// This is like tuklib_wraps() except that this takes a printf
+/// format string.
+///
+/// \note       On platforms that lack vasprintf(), the intermediate
+///             result from vsnprintf() must fit into a 128 KiB buffer.
+///             TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but
+///             only on platforms that lack vasprintf().
+
+TUKLIB_DECLS_END
+#endif
author	Lasse Collin <lasse.collin@tukaani.org>
	Mon, 16 Dec 2024 16:43:52 +0000 (18:43 +0200)
committer	Lasse Collin <lasse.collin@tukaani.org>
	Wed, 18 Dec 2024 15:09:31 +0000 (17:09 +0200)
cmake/tuklib_mbstr.cmake		patch \| blob \| blame \| history
m4/tuklib_mbstr.m4		patch \| blob \| blame \| history
src/Makefile.am		patch \| blob \| blame \| history
src/common/tuklib_gettext.h		patch \| blob \| blame \| history
src/common/tuklib_mbstr_wrap.c	[new file with mode: 0644]	patch \| blob
src/common/tuklib_mbstr_wrap.h	[new file with mode: 0644]	patch \| blob