# NOTE: wcwidth() requires _GNU_SOURCE or _XOPEN_SOURCE on GNU/Linux.
check_symbol_exists(wcwidth wchar.h HAVE_WCWIDTH)
tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_WCWIDTH)
+
+ # NOTE: vasprintf() requires _GNU_SOURCE on GNU/Linux.
+ check_symbol_exists(vasprintf stdio.h HAVE_VASPRINTF)
+ tuklib_add_definition_if("${TARGET_OR_ALL}" HAVE_VASPRINTF)
endfunction()
AC_DEFUN_ONCE([TUKLIB_MBSTR], [
AC_REQUIRE([TUKLIB_COMMON])
AC_FUNC_MBRTOWC
-AC_CHECK_FUNCS([wcwidth])
+AC_CHECK_FUNCS([wcwidth vasprintf])
])dnl
common/tuklib_mbstr.h \
common/tuklib_mbstr_fw.c \
common/tuklib_mbstr_width.c \
+ common/tuklib_mbstr_wrap.c \
+ common/tuklib_mbstr_wrap.h \
common/tuklib_open_stdxxx.c \
common/tuklib_open_stdxxx.h \
common/tuklib_physmem.c \
#endif
#define N_(msgid) msgid
+// Optional: Strings that are word wrapped using tuklib_mbstr_wrap may be
+// marked with W_("foo) in the source code. xgettext can then add a comment
+// to all such strings to inform translators. The following option needs to
+// be added to XGETTEXT_OPTIONS in po/Makevars or in an equivalent place:
+//
+// '--keyword=W_:1,"This is word wrapped at spaces. The Unicode character U+00A0 works as a non-breaking space. Tab (\t) is interpret as a zero-width space (the tab itself is not displayed); U+200B is NOT supported. Manual word wrapping with \n is supported but requires care."'
+//
+// NOTE: The double-quotes in the --keyword argument above must be passed to
+// xgettext as is, thus one needs the single-quotes in Makevars.
+#define W_(msgid) _(msgid)
+
#endif
--- /dev/null
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file tuklib_mbstr_wrap.c
+/// \brief Word wraps a string and prints it to a FILE stream
+///
+/// This depends on tuklib_mbstr_width.c.
+//
+// Author: Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "tuklib_mbstr.h"
+#include "tuklib_mbstr_wrap.h"
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+
+extern int
+tuklib_wraps(FILE *outfile, const struct tuklib_wrap_opt *opt, const char *str)
+{
+ // left_cont may be less than left_margin. In that case, if the first
+ // word is extremely long, it will stay on the first line even if
+ // the line then gets overlong.
+ //
+ // On the other hand, left2_cont < left2_margin isn't allowed because
+ // it could result in inconsistent behavior when a very long word
+ // comes right after a \v.
+ //
+ // It is fine to have left2_margin < left_margin although it would be
+ // an odd use case.
+ if (!(opt->left_margin < opt->right_margin
+ && opt->left_cont < opt->right_margin
+ && opt->left2_margin <= opt->left2_cont
+ && opt->left2_cont < opt->right_margin))
+ return TUKLIB_WRAP_ERR_OPT;
+
+ // This is set to TUKLIB_WRAP_WARN_OVERLONG if one or more
+ // output lines extend past opt->right_margin columns.
+ int warn_overlong = 0;
+
+ // Indentation of the first output line after \n or \r.
+ // \v sets this to opt->left2_margin.
+ // \r resets this back to the original value.
+ size_t first_indent = opt->left_margin;
+
+ // Indentation of the output lines that occur due to word wrapping.
+ // \v sets this to opt->left2_cont and \r back to the original value.
+ size_t cont_indent = opt->left_cont;
+
+ // If word wrapping occurs, the newline isn't printed unless more
+ // text would be put on the continuation line. This is also used
+ // when \v needs to start on a new line.
+ bool pending_newline = false;
+
+ // Spaces are printed only when there is something else to put
+ // after the spaces on the line. This avoids unwanted empty lines
+ // in the output and makes it possible to ignore possible spaces
+ // before a \v character.
+ size_t pending_spaces = first_indent;
+
+ // Current output column. When cur_col == pending_spaces, nothing
+ // has been actually printed to the current output line.
+ size_t cur_col = pending_spaces;
+
+ while (true) {
+ // Number of bytes until the *next* line-break opportunity.
+ size_t len = 0;
+
+ // Number of columns until the *next* line-break opportunity.
+ size_t width = 0;
+
+ // Text between a pair of \b characters is treated as
+ // an unbreakable block even if it contains spaces.
+ // It must not contain any control characters before
+ // the closing \b.
+ bool unbreakable = false;
+
+ while (true) {
+ // Find the next character that we handle specially.
+ // In an unbreakable block, search only for the
+ // closing \b; if missing, the unbreakable block
+ // extends to the end of the string.
+ const size_t n = strcspn(str + len,
+ unbreakable ? "\b" : " \t\n\r\v\b");
+
+ // Calculate how many columns the characters need.
+ const size_t w = tuklib_mbstr_width_mem(str + len, n);
+ if (w == (size_t)-1)
+ return TUKLIB_WRAP_ERR_STR;
+
+ width += w;
+ len += n;
+
+ // \b isn't a line-break opportunity so it has to
+ // be handled here. For simplicity, empty blocks
+ // are treated as zero-width characters.
+ if (str[len] == '\b') {
+ ++len;
+ unbreakable = !unbreakable;
+ continue;
+ }
+
+ break;
+ }
+
+ // Determine if adding this chunk of text would make the
+ // current output line exceed opt->right_margin columns.
+ const bool too_long = cur_col + width > opt->right_margin;
+
+ // Wrap the line if needed. However:
+ //
+ // - Don't wrap if the current column is less than where
+ // the continuation line would begin. In that case
+ // the chunk wouldn't fit on the next line either so
+ // we just have to produce an overlong line.
+ //
+ // - Don't wrap if so far the line only contains spaces.
+ // Wrapping in that case would leave a weird empty line.
+ // NOTE: This "only contains spaces" condition is the
+ // reason why left2_margin > left2_cont isn't allowed.
+ if (too_long && cur_col > cont_indent
+ && cur_col > pending_spaces) {
+ // There might be trailing spaces or zero-width spaces
+ // which need to be ignored to keep the output pretty.
+ //
+ // Spaces need to be ignored because in some
+ // writing styles there are two spaces after
+ // a full stop. Example string:
+ //
+ // "Foo bar. Abc def."
+ // ^
+ // If the first space after the first full stop
+ // triggers word wrapping, both spaces must be
+ // ignored. Otherwise the next line would be
+ // indented too much.
+ //
+ // Zero-width spaces are ignored the same way
+ // because they are meaningless if an adjacent
+ // character is a space.
+ while (*str == ' ' || *str == '\t')
+ ++str;
+
+ // Don't print the newline here; only mark it as
+ // pending. This avoids an unwanted empty line if
+ // there is a \n or \r or \0 after the spaces have
+ // been ignored.
+ pending_newline = true;
+ pending_spaces = cont_indent;
+ cur_col = pending_spaces;
+
+ // Since str may have been incremented due to the
+ // ignored spaces, the loop needs to be restarted.
+ continue;
+ }
+
+ // Print the current chunk of text before the next
+ // line-break opportunity. If the chunk was empty,
+ // don't print anything so that the pending newline
+ // and pending spaces aren't printed on their own.
+ if (len > 0) {
+ if (pending_newline) {
+ pending_newline = false;
+ if (putc('\n', outfile) == EOF)
+ return TUKLIB_WRAP_ERR_IO;
+ }
+
+ while (pending_spaces > 0) {
+ if (putc(' ', outfile) == EOF)
+ return TUKLIB_WRAP_ERR_IO;
+
+ --pending_spaces;
+ }
+
+ for (size_t i = 0; i < len; ++i) {
+ // Ignore unbreakable block characters (\b).
+ const int c = (unsigned char)str[i];
+ if (c != '\b' && putc(c, outfile) == EOF)
+ return TUKLIB_WRAP_ERR_IO;
+ }
+
+ str += len;
+ cur_col += width;
+
+ // Remember if the line got overlong. If no other
+ // errors occur, we return warn_overlong. It might
+ // help in catching problematic strings.
+ if (too_long)
+ warn_overlong = TUKLIB_WRAP_WARN_OVERLONG;
+ }
+
+ // Handle the special character after the chunk of text.
+ switch (*str) {
+ case ' ':
+ // Regular space.
+ ++cur_col;
+ ++pending_spaces;
+ break;
+
+ case '\v':
+ // Set the alternative indentation settings.
+ first_indent = opt->left2_margin;
+ cont_indent = opt->left2_cont;
+
+ if (first_indent > cur_col) {
+ // Add one or more spaces to reach
+ // the column specified in first_indent.
+ pending_spaces += first_indent - cur_col;
+ } else {
+ // There is no room to add even one space
+ // before reaching the column first_indent.
+ pending_newline = true;
+ pending_spaces = first_indent;
+ }
+
+ cur_col = first_indent;
+ break;
+
+ case '\0': // Implicit newline at the end of the string.
+ case '\r': // Newline that also resets the effect of \v.
+ case '\n': // Newline without resetting the indentation mode.
+ if (putc('\n', outfile) == EOF)
+ return TUKLIB_WRAP_ERR_IO;
+
+ if (*str == '\0')
+ return warn_overlong;
+
+ if (*str == '\r') {
+ first_indent = opt->left_margin;
+ cont_indent = opt->left_cont;
+ }
+
+ pending_newline = false;
+ pending_spaces = first_indent;
+ cur_col = first_indent;
+ break;
+ }
+
+ // Skip the specially-handled character.
+ ++str;
+ }
+}
+
+
+extern int
+tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
+ const char *fmt, ...)
+{
+ va_list ap;
+ char *buf;
+
+#ifdef HAVE_VASPRINTF
+ va_start(ap, fmt);
+ const int n = vasprintf(&buf, fmt, ap);
+ va_end(ap);
+ if (n == -1)
+ return TUKLIB_WRAP_ERR_FORMAT;
+#else
+ // Fixed buffer size is dumb but in practice one shouldn't need
+ // huge strings for *formatted* output. This simple method is safe
+ // with pre-C99 vsnprintf() implementations too which don't return
+ // the required buffer size (they return -1 or buf_size - 1) or
+ // which might not null-terminate the buffer in case it's too small.
+ const size_t buf_size = 128 * 1024;
+ buf = malloc(buf_size);
+ if (buf == NULL)
+ return TUKLIB_WRAP_ERR_FORMAT;
+
+ va_start(ap, fmt);
+ const int n = vsnprintf(buf, buf_size, fmt, ap);
+ va_end(ap);
+
+ if (n <= 0 || n >= (int)(buf_size - 1)) {
+ free(buf);
+ return TUKLIB_WRAP_ERR_FORMAT;
+ }
+#endif
+
+ const int ret = tuklib_wraps(stream, opt, buf);
+ free(buf);
+ return ret;
+}
--- /dev/null
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file tuklib_mbstr_wrap.h
+/// \brief Word wrapping for multibyte strings
+///
+/// The word wrapping functions are intended to be usable, for example,
+/// for printing --help text in command line tools. While manually-wrapped
+/// --help text allows precise formatting, such freedom requires translators
+/// to count spaces and determine where line breaks should occur. It's
+/// tedious and error prone, and experience has shown that only some
+/// translators do it well. Automatic word wrapping is less flexible but
+/// results in polished-enough look with less effort from everyone.
+/// Right-to-left languages and languages that don't use spaces between
+/// words will still need extra effort though.
+//
+// Author: Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef TUKLIB_MBSTR_WRAP_H
+#define TUKLIB_MBSTR_WRAP_H
+
+#include "tuklib_common.h"
+#include <stdio.h>
+
+TUKLIB_DECLS_BEGIN
+
+/// One or more output lines exceeded right_margin.
+/// This only a warning; everything was still printed successfully.
+#define TUKLIB_WRAP_WARN_OVERLONG 0x01
+
+/// Error writing to to the output FILE. The error flag in the FILE
+/// should have been set as well.
+#define TUKLIB_WRAP_ERR_IO 0x02
+
+/// Invalid options in struct tuklib_wrap_opt.
+/// Nothing was printed.
+#define TUKLIB_WRAP_ERR_OPT 0x04
+
+/// Invalid or unsupported multibyte character in the input string:
+/// either mbrtowc() failed or wcwidth() returned a negative value.
+#define TUKLIB_WRAP_ERR_STR 0x08
+
+/// Only tuklib_wrapf(): Error in converting the format string.
+/// It's either a memory allocation failure or something bad with the
+/// format string or arguments.
+#define TUKLIB_WRAP_ERR_FORMAT 0x10
+
+/// Options for tuklib_wraps() and tuklib_wrapf()
+struct tuklib_wrap_opt {
+ /// Indentation of the first output line after `\n` or `\r`.
+ /// This can be anything less than right_margin.
+ unsigned short left_margin;
+
+ /// Column where word-wrapped continuation lines start.
+ /// This can be anything less than right_margin.
+ unsigned short left_cont;
+
+ /// Column where the text after `\v` will start, either on the current
+ /// line (when there is room to add at least one space) or on a new
+ /// empty line.
+ unsigned short left2_margin;
+
+ /// Like left_cont but for text after a `\v`. However, this must
+ /// be greater than or equal to left2_margin in addition to being
+ /// less than right_margin.
+ unsigned short left2_cont;
+
+ /// For 80-column terminals, it is recommended to use 79 here for
+ /// maximum portability. 80 will work most of the time but it will
+ /// result in unwanted empty lines in the rare case where a terminal
+ /// moves the cursor to the beginning of the next line immediately
+ /// when the last column has been used.
+ unsigned short right_margin;
+};
+
+#define tuklib_wraps TUKLIB_SYMBOL(tuklib_wraps)
+extern int tuklib_wraps(FILE *stream, const struct tuklib_wrap_opt *opt,
+ const char *str);
+///<
+/// \brief Word wrap a multibyte string and write it to a FILE
+///
+/// Word wrapping is done only at spaces and at the special control characters
+/// described below. Multiple consecutive spaces are handled properly: strings
+/// that have two (or more) spaces after a full sentence will look good even
+/// when the spaces occur at a word wrapping boundary. Trailing spaces are
+/// ignored at the end of a line or at the end of a string.
+///
+/// The following control characters have been repurposed:
+///
+/// - `\t` = Zero-width space allows a line break without producing any
+/// output by itself. This can be useful after hard hyphens as
+/// hyphens aren't otherwise used for line breaking. This can also
+/// be useful in languages that don't use spaces between words.
+/// (The Unicode character U+200B isn't supported.)
+/// - `\b` = Text between a pair of `\b` characters is treated as an
+/// unbreakable block (not wrapped even if there are spaces).
+/// For example, a non-breaking space can be done like
+/// in `"123\b \bMiB"`. Control characters (like `\n` or `\t`)
+/// aren't allowed before the closing `\b`. If closing `\b` is
+/// missing, the block extends to the end of the string. Empty
+/// blocks are treated as zero-width characters. If line breaks
+/// are possible around an empty block (like in `"foo \b\b bar"`
+/// or `"foo \b"`), it can result in weird output.
+/// - `\v` = Change to alternative indentation (left2_margin).
+/// - `\r` = Reset back to the initial indentation and add a newline.
+/// The next line will be indented by left_margin.
+/// - `\n` = Add a newline without resetting the effect of `\v`. The
+/// next line will be indented by left_margin or left2_margin
+/// (not left_cont or left2_cont).
+///
+/// Only `\n` should appear in translatable strings. `\t` works too but
+/// even that might confuse some translators even if there is a TRANSLATORS
+/// comment explaining its meaning.
+///
+/// To use the other control characters in messages, one should use
+/// tuklib_wrapf() with appropriate printf format string to combine
+/// translatable strings with non-translatable portions. For example:
+///
+/// \code{.c}
+/// static const struct tuklib_wrap_opt wrap2 = { 2, 2, 22, 22, 79 };
+/// int e = 0;
+/// ...
+/// e |= tuklib_wrapf(stdout, &wrap2,
+/// "-h, --help\v%s\r"
+/// " --version\v%s",
+/// W_("display this help and exit"),
+/// W_("display version information and exit"));
+/// ...
+/// if (e != 0) {
+/// // Handle warning or error.
+/// ...
+/// }
+/// \endcode
+///
+/// Control characters other than `\n` and `\t` are unusable in
+/// translatable strings:
+///
+/// - Gettext tools show annoying warnings if C escape sequences other
+/// than `\n` or `\t` are seen. (Otherwise they still work perfectly
+/// fine though.)
+///
+/// - While at least Poedit and Lokalize support all escapes, some
+/// editors only support `\n` and `\t`.
+///
+/// - They could confuse some translators, resulting in broken
+/// translations.
+///
+/// Using non-control characters would solve some issues but it wouldn't
+/// help with the unfortunate real-world issue that some translators would
+/// likely have trouble understanding a new syntax. The Gettext manual
+/// specifically warns about this, see the subheading "No unusual markup"
+/// in `info (gettext)Preparing Strings`. (While using `\t` for zero-width
+/// space is such custom markup, most translators will never need it.)
+///
+/// Translators can use the Unicode character U+00A0 (or U+202F) if they
+/// need a non-breaking space. For example, in French a non-breaking space
+/// may be needed before colons and question marks (U+00A0 is common in
+/// real-world French PO files).
+///
+/// Using a non-ASCII char in a string in the C code (like `"123\u00A0MiB"`)
+/// can work if one tells xgettext that input encoding is UTF-8, one
+/// ensures that the C compiler uses UTF-8 as the input charset, and one
+/// is certain that the program is *always* run under an UTF-8 locale.
+/// Unfortunately a portable program cannot make this kind of assumptions,
+/// which means that there is no pretty way to have a non-breaking space in
+/// a translatable string.
+///
+/// Optional: To tell translators which strings are automatically word
+/// wrapped, see the macro `W_` in tuklib_gettext.h.
+///
+/// \param stream Output FILE stream. For decent performance, it
+/// should be in buffered mode because this function
+/// writes the output one byte at a time with fputc().
+/// \param opt Word wrapping options.
+/// \param str Null-terminated multibyte string that is in
+/// the encoding used by the current locale.
+///
+/// \return Returns 0 on success. If an error or warning occurs, one of
+/// TUKLIB_WRAP_* codes is returned. Those codes are powers
+/// of two. When warning/error detection can be delayed, the
+/// return values can be accumulated from multiple calls using
+/// bitwise-or into a single variable which can be checked after
+/// all strings have (hopefully) been printed.
+
+#define tuklib_wrapf TUKLIB_SYMBOL(tuklib_wrapf)
+extern int tuklib_wrapf(FILE *stream, const struct tuklib_wrap_opt *opt,
+ const char *fmt, ...);
+///<
+/// \brief Format and word-wrap a multibyte string and write it to a FILE
+///
+/// This is like tuklib_wraps() except that this takes a printf
+/// format string.
+///
+/// \note On platforms that lack vasprintf(), the intermediate
+/// result from vsnprintf() must fit into a 128 KiB buffer.
+/// TUKLIB_WRAP_ERR_FORMAT is returned if it doesn't but
+/// only on platforms that lack vasprintf().
+
+TUKLIB_DECLS_END
+#endif