out the UTF8 parser, so that it can be shared between
valid_utf8_string() and printable(). Wietse Venema, with
tests by Viktor Dukhovni. Files: util/valid_utf8_string.c,
- util/printable.c, util/parse_utf8_char.c, util/printable.in,
+ util/printable.c, util/parse_utf8_char.h, util/printable.in,
util/printable.ref.
+
+20231010
+
+ Cleanup: printable() once again uses a single-pass algorithm.
+ Converted printable() test files to built-in test cases with
+ proper logging, and removed the printable() test files and
+ git metadata. Added similar tests for the valid_utf8_string()
+ function. Files: util/valid_utf8_string.c, util/printable.c,
+ util/parse_utf8_char.h, util/Makefile.in.
SPKI
peerpkey
rpk
+ep
+inlined
* Patches change both the patchlevel and the release date. Snapshots have no
* patchlevel; they change the release date only.
*/
-#define MAIL_RELEASE_DATE "20231008"
+#define MAIL_RELEASE_DATE "20231010"
#define MAIL_VERSION_NUMBER "3.9"
#ifdef SNAPSHOT
+++ /dev/null
-printable.in binary
strcasecmp_utf8_test vbuf_print_test miss_endif_cidr_test \
miss_endif_regexp_test split_qnameval_test vstring_test \
vstream_test byte_mask_tests mystrtok_test known_tcp_ports_test \
- binhash_test argv_test inet_prefix_top_test printable_test
+ binhash_test argv_test inet_prefix_top_test printable_test \
+ valid_utf8_string_test
dict_tests: all dict_test \
dict_pcre_tests dict_cidr_test dict_thash_test dict_static_test \
# diff unescape.in unescape.tmp
rm -f unescape.tmp
-printable_test: printable printable.in
- $(SHLIB_ENV) ${VALGRIND} ./printable <printable.in > printable.tmp
- diff -b printable.ref printable.tmp
- rm -f printable.tmp
+printable_test: printable
+ $(SHLIB_ENV) ${VALGRIND} ./printable
+
+valid_utf8_string_test: valid_utf8_string
+ $(SHLIB_ENV) ${VALGRIND} ./valid_utf8_string
hex_quote_test: hex_quote
$(SHLIB_ENV) ${VALGRIND} ./hex_quote <hex_quote.c | od -cb >hex_quote.tmp
/* SYNOPSIS
/* #include <parse_utf8_char.h>
/*
-/* char *parse_utf8_char(str, len)
+/* char *parse_utf8_char(str, end)
/* const char *str;
-/* ssize_t len;
+/* const char *end;
/* DESCRIPTION
-/* parse_utf8_char() determines if the \fBlen\fR bytes starting
-/* at \fBstr\fR begin with a complete UTF-8 multi-byte character
-/* as defined in RFC 3629. That is, it contains a proper
-/* encoding of code points U+0000..U+10FFFF, excluding over-long
-/* encodings and excluding U+D800..U+DFFF surrogates.
+/* parse_utf8_char() determines if the byte sequence starting
+/* at \fBstr\fR begins with a complete UTF-8 character as
+/* defined in RFC 3629. That is, a proper encoding of code
+/* points U+0000..U+10FFFF, excluding over-long encodings and
+/* excluding U+D800..U+DFFF surrogates.
/*
-/* When the \fBlen\fR bytes starting at \fBstr\fR begin with
-/* a complete UTF-8 multi-byte character, this function returns
-/* a pointer to the last byte in that character. Otherwise,
-/* it returns a null pointer.
+/* When the byte sequence starting at \fBstr\fR begins with a
+/* complete UTF-8 character, this function returns a pointer
+/* to the last byte in that character. Otherwise, it returns
+/* a null pointer.
+/*
+/* The \fBend\fR argument is either null (the byte sequence
+/* starting at \fBstr\fR must be null terminated), or \fBend
+/* - str\fR specifies the length of the byte sequence.
/* BUGS
/* Code points in the range U+FDD0..U+FDEF and ending in FFFE
/* or FFFF are non-characters in UNICODE. This function does
/*
* Optimized for correct input, time, space, and for CPUs that have a
- * decent number of registers.
+ * decent number of registers. Other implementation considerations:
+ *
+ * - In the UTF-8 encoding, a non-leading byte is never null. Therefore,
+ * this function will correctly reject a partial UTF-8 character at the
+ * end of a null-terminated string.
+ *
+ * - If the "end" argument is a null constant, and if this function is
+ * inlined, then an optimizing compiler should propagate the constant
+ * through the "ep" variable, and eliminate any code branches that
+ * require ep != 0.
*/
/* Single-byte encodings. */
if (EXPECTED((c0 = *cp) <= 0x7f) /* we know that c0 >= 0x0 */ ) {
else if (EXPECTED(c0 <= 0xdf) /* we know that c0 >= 0x80 */ ) {
/* Exclude over-long encodings. */
if (UNEXPECTED(c0 < 0xc2)
- || UNEXPECTED(cp + 1 >= ep)
+ || UNEXPECTED(ep && cp + 1 >= ep)
/* Require UTF-8 tail byte. */
|| UNEXPECTED(((ch = *++cp) & 0xc0) != 0x80))
return (0);
}
/* Three-byte encodings. */
else if (EXPECTED(c0 <= 0xef) /* we know that c0 >= 0xe0 */ ) {
- if (UNEXPECTED(cp + 2 >= ep)
+ if (UNEXPECTED(ep && cp + 2 >= ep)
/* Exclude over-long encodings. */
|| UNEXPECTED((ch = *++cp) < (c0 == 0xe0 ? 0xa0 : 0x80))
/* Exclude U+D800..U+DFFF. */
}
/* Four-byte encodings. */
else if (EXPECTED(c0 <= 0xf4) /* we know that c0 >= 0xf0 */ ) {
- if (UNEXPECTED(cp + 3 >= ep)
+ if (UNEXPECTED(ep && cp + 3 >= ep)
/* Exclude over-long encodings. */
|| UNEXPECTED((ch = *++cp) < (c0 == 0xf0 ? 0x90 : 0x80))
/* Exclude code points above U+10FFFF. */
char *printable_except(char *string, int replacement, const char *except)
{
char *cp;
- char *ep = string + strlen(string);
char *last;
int ch;
if (util_utf8_enable == 0) {
if (ISASCII(ch) && PRINT_OR_EXCEPT(ch))
continue;
- } else if ((last = parse_utf8_char(cp, ep)) == cp) { /* ASCII */
+ } else if ((last = parse_utf8_char(cp, 0)) == cp) { /* ASCII */
if (PRINT_OR_EXCEPT(ch))
continue;
- } else if (last > cp) { /* Other UTF8 */
+ } else if (last != 0) { /* Other UTF8 */
cp = last;
continue;
}
#include <stdlib.h>
#include <string.h>
#include <msg.h>
-#include <vstring_vstream.h>
+#include <msg_vstream.h>
+#include <mymalloc.h>
+#include <vstream.h>
+
+ /*
+ * Test cases for 1-, 2-, and 3-byte encodings. Originally contributed by
+ * Viktor Dukhovni, and annotated using translate.google.com.
+ *
+ * XXX Need a test for 4-byte encodings, preferably with strings that can be
+ * displayed.
+ */
+struct testcase {
+ const char *name;		/* label printed in the RUN/PASS/FAIL lines */
+ const char *input;		/* string that is copied and passed to printable() */
+ const char *expected;		/* required string content after printable() */
+};
+static const struct testcase testcases[] = {	/* name, input, expected printable(input, '?') result */
+ {"Printable ASCII",		/* expected equals input */
+ "printable", "printable"
+ },
+ {"ASCII with control character",	/* the \b is replaced with '?' */
+ "non\bn-printable", "non?n-printable"
+ },
+ {"Latin accented text, no error",	/* expected equals input */
+ "na\303\257ve", "na\303\257ve"
+ },
+ {"Latin text, with error",	/* the \303 that is followed by 'v' is replaced with '?' */
+ "na\303ve", "na?ve"
+ },
+ {"Viktor, Cyrillic, no error",	/* expected equals input */
+ "\320\262\320\270\320\272\321\202\320\276\321\200",
+ "\320\262\320\270\320\272\321\202\320\276\321\200"
+ },
+ {"Viktor, Cyrillic, two errors",	/* the unpaired \320 and the unpaired \272 each become '?' */
+ "\320\262\320\320\272\272\321\202\320\276\321\200",
+ "\320\262?\320\272?\321\202\320\276\321\200"
+ },
+ {"Viktor, Hebrew, no error",	/* expected equals input */
+ "\327\225\327\231\327\247\327\230\327\225\326\274\327\250",
+ "\327\225\327\231\327\247\327\230\327\225\326\274\327\250"
+ },
+ {"Viktor, Hebrew, with error",	/* the stray \231 is replaced with '?' */
+ "\327\225\231\327\247\327\230\327\225\326\274\327\250",
+ "\327\225?\327\247\327\230\327\225\326\274\327\250"
+ },
+ {"Chinese (Simplified), no error",	/* expected equals input */
+ "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345\221\212",
+ "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345\221\212"
+ },
+ {"Chinese (Simplified), with errors",	/* four bytes become '?', including the final \345 */
+ "\344\270\255\345\344\272\222\350\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345",
+ "\344\270\255?\344\272\222??\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245?"
+ },
+};
int main(int argc, char **argv)
{
- VSTRING *in = vstring_alloc(10);
+ const struct testcase *tp;
+ int pass;
+ int fail;
+#define NUM_TESTS (sizeof(testcases)/sizeof(testcases[0]))	/* number of entries in testcases[] */
+
+ msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
util_utf8_enable = 1;
- while (vstring_fgets_nonl(in, VSTREAM_IN)) {
- printable(vstring_str(in), '?');
- vstream_fwrite(VSTREAM_OUT, vstring_str(in), VSTRING_LEN(in));
- VSTREAM_PUTC('\n', VSTREAM_OUT);
+ for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
+ char *input;
+ char *actual;
+
+ /*
+ * Notes:
+ *
+ * - The input is modified, therefore it must be copied.
+ *
+ * - The msg(3) functions use printable() which interferes when logging
+ * inputs and outputs. Use vstream_fprintf() instead.
+ */
+ vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name);
+ input = mystrdup(tp->input);
+ actual = printable(input, '?');
+
+ if (strcmp(actual, tp->expected) == 0) {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, want and got: >%s<\n",
+ tp->input, actual);
+ vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
+ pass++;
+ } else {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, want: >%s<, got: >%s<\n",
+ tp->input, tp->expected, actual);
+ vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
+ fail++;
+ }
+ myfree(input);
}
- vstream_fflush(VSTREAM_OUT);
- exit(0);
+ msg_info("PASS=%d FAIL=%d", pass, fail);
+ return (fail > 0);
}
#endif
+++ /dev/null
-printable
-non\bn-printable
-naïve
-naÃve
-виктор
-вÐкºÑ\82оÑ\80
-ויקטוּר
-×\95\99×§×\98×\95ּר
-中国互联网络发展状况统计报告
-ä¸åäº\92è\94ç½\91ç»\9cå\8f\91å±\95ç\8a¶å\86µç»\9f计æ\8a¥å
+++ /dev/null
-printable
-non?n-printable
-naïve
-na?ve
-виктор
-в?к?тор
-ויקטוּר
-ו?קטוּר
-中国互联网络发展状况统计报告
-中?互??网络发展状况统计报?
if (len < 0)
return (0);
- if (len <= 0)
+ if (len == 0)
return (1);
/*
*/
#ifdef TEST
#include <stdlib.h>
+#include <string.h>
+#include <msg.h>
#include <vstream.h>
-#include <vstring.h>
-#include <vstring_vstream.h>
+#include <msg_vstream.h>
-#define STR(x) vstring_str(x)
-#define LEN(x) VSTRING_LEN(x)
+ /*
+ * Test cases for 1-, 2-, and 3-byte encodings. See printable() tests for
+ * provenance.
+ *
+ * XXX Need a test for 4-byte encodings, preferably with strings that can be
+ * displayed.
+ */
+struct testcase {
+ const char *name;		/* label printed in the RUN/PASS/FAIL lines */
+ const char *input;		/* string passed to valid_utf8_string() */
+ int expected;			/* required result; logged as "valid" when non-zero */
+};
-int main(void)
+static const struct testcase testcases[] = {	/* name, input, expected valid_utf8_string() result */
+ {"Printable ASCII",
+ "printable", 1,
+ },
+ {"Latin accented text, no error",
+ "na\303\257ve", 1,
+ },
+ {"Latin text, with error",	/* \303 is followed by 'v' */
+ "na\303ve", 0,
+ },
+ {"Viktor, Cyrillic, no error",
+ "\320\262\320\270\320\272\321\202\320\276\321\200", 1,
+ },
+ {"Viktor, Cyrillic, two errors",	/* \320 followed by \320, and \272 followed by \272 */
+ "\320\262\320\320\272\272\321\202\320\276\321\200", 0,
+ },
+ {"Viktor, Hebrew, no error",
+ "\327\225\327\231\327\247\327\230\327\225\326\274\327\250", 1,
+ },
+ {"Viktor, Hebrew, with error",	/* stray \231 after the complete \327\225 */
+ "\327\225\231\327\247\327\230\327\225\326\274\327\250", 0,
+ },
+ {"Chinese (Simplified), no error",
+ "\344\270\255\345\233\275\344\272\222\350\201\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345\221\212", 1,
+ },
+ {"Chinese (Simplified), with errors",	/* same input bytes as the printable() test of this name */
+ "\344\270\255\345\344\272\222\350\224\347\275\221\347"
+ "\273\234\345\217\221\345\261\225\347\212\266\345\206\265\347\273"
+ "\237\350\256\241\346\212\245\345", 0,
+ },
+};
+
+int main(int argc, char **argv)
{
- VSTRING *buf = vstring_alloc(1);
+ const struct testcase *tp;
+ int pass;
+ int fail;
+
+#define NUM_TESTS (sizeof(testcases)/sizeof(testcases[0]))	/* number of entries in testcases[] */
+
+ msg_vstream_init(basename(argv[0]), VSTREAM_ERR);
+ util_utf8_enable = 1;
+
+ for (pass = fail = 0, tp = testcases; tp < testcases + NUM_TESTS; tp++) {
+ int actual;
+
+ /*
+ * Notes:
+ *
+ * - The msg(3) functions use printable() which interferes when logging
+ * inputs and outputs. Use vstream_fprintf() instead.
+ */
+ vstream_fprintf(VSTREAM_ERR, "RUN %s\n", tp->name);
+ actual = valid_utf8_string(tp->input, strlen(tp->input));
- while (vstring_get_nonl(buf, VSTREAM_IN) != VSTREAM_EOF) {
- vstream_printf("%c", (LEN(buf) && !valid_utf8_string(STR(buf), LEN(buf))) ?
- '!' : ' ');
- vstream_fwrite(VSTREAM_OUT, STR(buf), LEN(buf));
- vstream_printf("\n");
+ if (actual == tp->expected) {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, want and got: >%s<\n",
+ tp->input, actual ? "valid" : "not valid");
+ vstream_fprintf(VSTREAM_ERR, "PASS %s\n", tp->name);
+ pass++;
+ } else {
+ vstream_fprintf(VSTREAM_ERR, "input: >%s<, want: >%s<, got: >%s<\n",
+ tp->input, tp->expected ? "valid" : "not valid",
+ actual ? "valid" : "not valid");
+ vstream_fprintf(VSTREAM_ERR, "FAIL %s\n", tp->name);
+ fail++;
+ }
}
- vstream_fflush(VSTREAM_OUT);
- vstring_free(buf);
- exit(0);
+ msg_info("PASS=%d FAIL=%d", pass, fail);
+ return (fail > 0);
}
#endif