From: Tom Yu
Date: Wed, 23 Dec 2015 21:49:24 +0000 (-0500)
Subject: Add tests for UTF-8 conversions
X-Git-Tag: krb5-1.15-beta1~304
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4807e229ade234e417dbf9d589a3602218082453;p=thirdparty%2Fkrb5.git

Add tests for UTF-8 conversions
---

diff --git a/.gitignore b/.gitignore
index ac718e2cb6..d42401969b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -492,5 +492,6 @@ local.properties
 /src/util/support/t_path
 /src/util/support/t_path_win
 /src/util/support/t_unal
+/src/util/support/t_utf8
 
 /src/util/verto/rename.h
diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in
index 51817621be..5d38b0326b 100644
--- a/src/util/support/Makefile.in
+++ b/src/util/support/Makefile.in
@@ -127,7 +127,8 @@ SRCS=\
 	$(srcdir)/base64.c \
 	$(srcdir)/json.c \
 	$(srcdir)/bcmp.c \
-	$(srcdir)/strerror_r.c
+	$(srcdir)/strerror_r.c \
+	$(srcdir)/t_utf8.c
 
 SHLIB_EXPDEPS =
 # Add -lm if dumping thread stats, for sqrt.
@@ -199,7 +200,10 @@ t_json: $(T_JSON_OBJS)
 t_unal: t_unal.o
 	$(CC_LINK) -o t_unal t_unal.o
 
-TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal
+t_utf8: t_utf8.o utf8.o
+	$(CC_LINK) -o t_utf8 t_utf8.o utf8.o
+
+TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8
 
 check-unix:: $(TEST_PROGS)
 	./t_k5buf
@@ -208,11 +212,12 @@ check-unix:: $(TEST_PROGS)
 	./t_base64
 	./t_json
 	./t_unal
+	./t_utf8
 
 clean::
 	$(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win
 	$(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64
-	$(RM) t_json.o t_json libkrb5support.exports
+	$(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8
 
 @lib_frag@
 @libobj_frag@
diff --git a/src/util/support/t_utf8.c b/src/util/support/t_utf8.c
new file mode 100644
index 0000000000..583270165a
--- /dev/null
+++ b/src/util/support/t_utf8.c
@@ -0,0 +1,226 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* util/support/t_utf8.c - test UTF-8 boundary conditions */
+/*
+ * Copyright (C) 2015 by the Massachusetts Institute of Technology.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "k5-platform.h"
+#include "k5-utf8.h"
+
+/*
+ * Convenience macro to allow testing of old encodings.
+ *
+ * "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point
+ * was U+7FFFFFFF instead of U+10FFFF.
+ */
+#ifdef OLDENCODINGS
+#define L(x) (x)
+#else
+#define L(x) 0
+#endif
+
+/*
+ * len is 0 for invalid encoding prefixes (krb5int_utf8_charlen2() partially
+ * enforces the validity of the first two bytes, based on masking the second
+ * byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the
+ * range between U+110000 and U+13FFFF).
+ *
+ * ucs is 0 for invalid encodings (including ones with valid prefixes according
+ * to krb5int_utf8_charlen2(); krb5int_utf8_to_ucs4() will still fail on them
+ * because it checks more things.) Code points above U+10FFFF are excluded by
+ * the actual test code and remain in the table for possibly testing the old
+ * implementation that didn't exclude them.
+ *
+ * Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the
+ * surrogate pair range.
+ */
+struct testcase {
+    const char *p;
+    krb5_ucs4 ucs;
+    int len;
+} testcases[] = {
+    { "\x7f", 0x0000007f, 1 },             /* Lowest 1-byte encoding */
+    { "\xc0\x80", 0x00000000, 0 },         /* Invalid 2-byte encoding */
+    { "\xc2\x80", 0x00000080, 2 },         /* Lowest valid 2-byte encoding */
+    { "\xdf\xbf", 0x000007ff, 2 },         /* Highest valid 2-byte encoding*/
+    { "\xdf\xff", 0x00000000, 2 },         /* Invalid 2-byte encoding*/
+    { "\xe0\x80\x80", 0x00000000, 0 },     /* Invalid 3-byte encoding */
+    { "\xe0\xa0\x80", 0x00000800, 3 },     /* Lowest valid 3-byte encoding */
+    { "\xef\xbf\xbf", 0x0000ffff, 3 },     /* Highest valid 3-byte encoding */
+    { "\xef\xff\xff", 0x00000000, 3 },     /* Invalid 3-byte encoding */
+    { "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */
+    { "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */
+    { "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */
+    /* Next higher 4-byte encoding (old) */
+    { "\xf4\x90\x80\x80", 0x00110000, 4 },
+    /* Highest 4-byte encoding starting with 0xf4 (old) */
+    { "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 },
+    /* Next higher 4-byte prefix byte (old) */
+    { "\xf5\x80\x80\x80", 0x00140000, L(4) },
+    /* Highest valid 4-byte encoding (old) */
+    { "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) },
+    /* Invalid 4-byte encoding */
+    { "\xf7\xff\xff\xff", 0x00000000, L(4) },
+    /* Invalid 5-byte encoding */
+    { "\xf8\x80\x80\x80\x80", 0x00000000, 0 },
+    /* Lowest valid 5-byte encoding (old) */
+    { "\xf8\x88\x80\x80\x80", 0x00200000, L(5) },
+    /* Highest valid 5-byte encoding (old) */
+    { "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) },
+    /* Invalid 5-byte encoding */
+    { "\xfb\xff\xff\xff\xff", 0x00000000, L(5) },
+    /* Invalid 6-byte encoding */
+    { "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 },
+    /* Lowest valid 6-byte encoding (old) */
+    { "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) },
+    /* Highest valid 6-byte encoding (old) */
+    { "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) },
+    /* Invalid 6-byte encoding */
+    { "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) },
+};
+
+/* Print each byte of the NUL-terminated string p as two hex digits. */
+static void
+printhex(const char *p)
+{
+    for (; *p != '\0'; p++) {
+        printf("%02x ", (unsigned char)*p);
+    }
+}
+
+/* Print a test case: its bytes in hex, then its code point and length. */
+static void
+printtest(struct testcase *t)
+{
+    printhex(t->p);
+    printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len);
+}
+
+/*
+ * Check krb5int_utf8_charlen2() and krb5int_utf8_to_ucs4() against t.  If
+ * high4 is nonzero, decoding is expected to fail and the decoded value is not
+ * compared.  Return 0 if every check passes, 1 (after printing why) if not.
+ */
+static int
+test_decode(struct testcase *t, int high4)
+{
+    int len, status = 0;
+    krb5_ucs4 u = 0;
+
+    len = krb5int_utf8_charlen2(t->p);
+    if (len != t->len) {
+        printf("expected len=%d, got len=%d\n", t->len, len);
+        status = 1;
+    }
+    if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) {
+        printf("unexpected success in utf8_to_ucs4\n");
+        status = 1;
+    }
+    if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) {
+        printf("unexpected failure in utf8_to_ucs4\n");
+        status = 1;
+    }
+    if (t->ucs != u && !high4) {
+        printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs,
+               (unsigned long)u);
+        status = 1;
+    }
+    return status;
+}
+
+/*
+ * Check that krb5int_ucs4_to_utf8() encodes t->ucs to the bytes in t->p, or
+ * returns 0 when high4 is nonzero.  Return 0 if the check passes, 1 (after
+ * printing why) if not.
+ */
+static int
+test_encode(struct testcase *t, int high4)
+{
+    size_t size;
+    char buf[7];
+
+    memset(buf, 0, sizeof(buf));
+    size = krb5int_ucs4_to_utf8(t->ucs, buf);
+    if (high4 && size != 0) {
+        printf("unexpected success beyond U+10FFFF\n");
+        return 1;
+    }
+    if (!high4 && size == 0) {
+        printf("unexpected zero size on encode\n");
+        return 1;
+    }
+    if (size != 0 && strcmp(t->p, buf) != 0) {
+        printf("expected ");
+        printhex(t->p);
+        printf("got ");
+        printhex(buf);
+        printf("\n");
+        return 1;
+    }
+    return 0;
+}
+
+/*
+ * Run the decode checks on every test case and the encode checks on those
+ * with a nonzero ucs.  With "-v" as the only argument, print each case first.
+ * Exit with status 0 if all checks pass, 1 otherwise.
+ */
+int
+main(int argc, char **argv)
+{
+    size_t ncases = sizeof(testcases) / sizeof(testcases[0]);
+    size_t i;
+    struct testcase *t;
+    int status = 0, verbose = 0;
+    /* Is this a "high" 4-byte encoding above U+10FFFF? */
+    int high4;
+
+    if (argc == 2 && strcmp(argv[1], "-v") == 0)
+        verbose = 1;
+    for (i = 0; i < ncases; i++) {
+        t = &testcases[i];
+        if (verbose)
+            printtest(t);
+#ifndef OLDENCODINGS
+        high4 = t->ucs > 0x10ffff;
+#else
+        high4 = 0;
+#endif
+        if (test_decode(t, high4) != 0)
+            status = 1;
+        if (t->ucs == 0)
+            continue;
+        if (test_encode(t, high4) != 0)
+            status = 1;
+    }
+    return status;
+}