From: Sean Bright
Date: Mon, 13 Jul 2020 20:06:14 +0000 (-0400)
Subject: utf8.c: Add UTF-8 validation and utility functions
X-Git-Tag: 17.7.0-rc1~8
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=87e79e6f9cce3292284135cdd0f6d2f2162f1dc7;p=thirdparty%2Fasterisk.git

utf8.c: Add UTF-8 validation and utility functions

There are various places in Asterisk - specifically in regards to
database integration - where having some kind of UTF-8 validation would
be beneficial. This patch adds:

* Functions to validate that a given string contains only valid UTF-8
  sequences.

* A function to copy a string (similar to ast_copy_string) stopping when
  an invalid UTF-8 sequence is encountered.

* A UTF-8 validator that allows for progressive validation.

All of this is based on the excellent UTF-8 decoder by Björn Höhrmann.
More information is available here:

    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/

The API was written in such a way that should allow us to replace the
implementation later should we determine that we need something more
comprehensive.

Change-Id: I3555d787a79e7c780a7800cd26e0b5056368abf9
---

diff --git a/include/asterisk/utf8.h b/include/asterisk/utf8.h
new file mode 100644
index 0000000000..c021f055b0
--- /dev/null
+++ b/include/asterisk/utf8.h
@@ -0,0 +1,188 @@
+/*
+ * Asterisk -- An open source telephony toolkit.
+ *
+ * Copyright (C) 2020, Sean Bright
+ *
+ * Sean Bright
+ *
+ * See http://www.asterisk.org for more information about
+ * the Asterisk project. Please do not directly contact
+ * any of the maintainers of this project for assistance;
+ * the project provides a web site, mailing lists and IRC
+ * channels for your use.
+ *
+ * This program is free software, distributed under the terms of
+ * the GNU General Public License Version 2. See the LICENSE file
+ * at the top of the source tree.
+ */
+
+/*! \file
+ *
+ * \brief UTF-8 information and validation functions
+ */
+
+#ifndef ASTERISK_UTF8_H
+#define ASTERISK_UTF8_H
+
+/*!
+ * \brief Check if a zero-terminated string is valid UTF-8
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * \param str The zero-terminated string to check
+ *
+ * \retval 0 if the string is not valid UTF-8
+ * \retval Non-zero if the string is valid UTF-8
+ */
+int ast_utf8_is_valid(const char *str);
+
+/*!
+ * \brief Check if the first \a size bytes of a string are valid UTF-8
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * Similar to \a ast_utf8_is_valid() but checks the first \a size bytes or until
+ * a zero byte is reached, whichever comes first.
+ *
+ * \param str The string to check
+ * \param size The number of bytes to evaluate
+ *
+ * \retval 0 if the string is not valid UTF-8
+ * \retval Non-zero if the string is valid UTF-8
+ */
+int ast_utf8_is_validn(const char *str, size_t size);
+
+/*!
+ * \brief Copy a string safely ensuring valid UTF-8
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * This is similar to \a ast_copy_string, but it will only copy valid UTF-8
+ * sequences from the source string into the destination buffer. If an invalid
+ * UTF-8 sequence is encountered, or the available space in the destination
+ * buffer is exhausted in the middle of an otherwise valid UTF-8 sequence, the
+ * destination buffer will be truncated to ensure that it only contains valid
+ * UTF-8.
+ *
+ * \param dst The destination buffer.
+ * \param src The source string
+ * \param size The size of the destination buffer
+ * \return Nothing.
+ */
+void ast_utf8_copy_string(char *dst, const char *src, size_t size);
+
+enum ast_utf8_validation_result {
+	/*! \brief The consumed sequence is valid UTF-8
+	 *
+	 * The bytes consumed thus far by the validator represent a valid sequence of
+	 * UTF-8 bytes. If additional bytes are fed into the validator, it can
+	 * transition into either \a AST_UTF8_INVALID or \a AST_UTF8_UNKNOWN
+	 */
+	AST_UTF8_VALID,
+
+	/*! \brief The consumed sequence is invalid UTF-8
+	 *
+	 * The bytes consumed thus far by the validator represent an invalid sequence
+	 * of UTF-8 bytes. Feeding additional bytes into the validator will not
+	 * change its state.
+	 */
+	AST_UTF8_INVALID,
+
+	/*! \brief The validator is in an intermediate state
+	 *
+	 * The validator is in the process of validating a multibyte UTF-8 sequence
+	 * and requires additional data to be fed into it to determine validity. If
+	 * additional bytes are fed into the validator, it can transition into either
+	 * \a AST_UTF8_VALID or \a AST_UTF8_INVALID. If you have no additional data
+	 * to feed into the validator the UTF-8 sequence is invalid.
+	 */
+	AST_UTF8_UNKNOWN,
+};
+
+/*!
+ * \brief Opaque type for UTF-8 validator state.
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ */
+struct ast_utf8_validator;
+
+/*!
+ * \brief Create a new UTF-8 validator
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * \param[out] validator The validator instance
+ *
+ * \retval 0 on success
+ * \retval -1 on failure
+ */
+int ast_utf8_validator_new(struct ast_utf8_validator **validator);
+
+/*!
+ * \brief Feed a zero-terminated string into the UTF-8 validator
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * \param validator The validator instance
+ * \param data The zero-terminated string to feed into the validator
+ *
+ * \return The \ref ast_utf8_validation_result indicating the current state of
+ * the validator.
+ */
+enum ast_utf8_validation_result ast_utf8_validator_feed(
+	struct ast_utf8_validator *validator, const char *data);
+
+/*!
+ * \brief Feed a string into the UTF-8 validator
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * Similar to \a ast_utf8_validator_feed but will stop feeding in data if a zero
+ * byte is encountered or \a size bytes have been read.
+ *
+ * \param validator The validator instance
+ * \param data The string to feed into the validator
+ * \param size The number of bytes to feed into the validator
+ *
+ * \return The \ref ast_utf8_validation_result indicating the current state of
+ * the validator.
+ */
+enum ast_utf8_validation_result ast_utf8_validator_feedn(
+	struct ast_utf8_validator *validator, const char *data, size_t size);
+
+/*!
+ * \brief Get the current UTF-8 validator state
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * \param validator The validator instance
+ *
+ * \return The \ref ast_utf8_validation_result indicating the current state of
+ * the validator.
+ */
+enum ast_utf8_validation_result ast_utf8_validator_state(
+	struct ast_utf8_validator *validator);
+
+/*!
+ * \brief Reset the state of a UTF-8 validator
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * Resets the provided UTF-8 validator to its initial state so that it can be
+ * reused.
+ *
+ * \param validator The validator instance to reset
+ */
+void ast_utf8_validator_reset(
+	struct ast_utf8_validator *validator);
+
+/*!
+ * \brief Destroy a UTF-8 validator
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * \param validator The validator instance to destroy
+ */
+void ast_utf8_validator_destroy(struct ast_utf8_validator *validator);
+
+/*!
+ * \brief Register UTF-8 tests
+ * \since 13.36.0, 16.13.0, 17.7.0, 18.0.0
+ *
+ * Does nothing unless TEST_FRAMEWORK is defined.
+ *
+ * \return Always returns 0
+ */
+int ast_utf8_init(void);
+
+#endif /* ASTERISK_UTF8_H */
diff --git a/main/asterisk.c b/main/asterisk.c
index 3e13d11f41..a028611dba 100644
--- a/main/asterisk.c
+++ b/main/asterisk.c
@@ -242,6 +242,7 @@ int daemon(int, int); /* defined in libresolv of all places */
 #include "asterisk/media_cache.h"
 #include "asterisk/astdb.h"
 #include "asterisk/options.h"
+#include "asterisk/utf8.h"
 
 #include "../defaults.h"
 
@@ -4068,6 +4069,7 @@ static void asterisk_daemon(int isroot, const char *runuser, const char *rungrou
 	check_init(ast_json_init(), "libjansson");
 	ast_ulaw_init();
 	ast_alaw_init();
+	ast_utf8_init();
 	tdd_init();
 	callerid_init();
 	ast_builtins_init();
diff --git a/main/utf8.c b/main/utf8.c
new file mode 100644
index 0000000000..ab4338ba4c
--- /dev/null
+++ b/main/utf8.c
@@ -0,0 +1,392 @@
+/*
+ * Asterisk -- An open source telephony toolkit.
+ *
+ * Copyright (C) 2020, Sean Bright
+ *
+ * Sean Bright
+ *
+ * See http://www.asterisk.org for more information about
+ * the Asterisk project. Please do not directly contact
+ * any of the maintainers of this project for assistance;
+ * the project provides a web site, mailing lists and IRC
+ * channels for your use.
+ *
+ * This program is free software, distributed under the terms of
+ * the GNU General Public License Version 2. See the LICENSE file
+ * at the top of the source tree.
+ */
+
+/*! \file
+ *
+ * \brief UTF-8 information and validation functions
+ */
+
+/*** MODULEINFO
+	<support_level>core</support_level>
+***/
+
+#include "asterisk.h"
+
+#include "asterisk/utils.h"
+#include "asterisk/utf8.h"
+#include "asterisk/test.h"
+
+/*
+ * BEGIN THIRD PARTY CODE
+ *
+ * Copyright (c) 2008-2010 Björn Höhrmann
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+ */
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+	/* The first part of the table maps bytes to character classes to
+	 * reduce the size of the transition table and create bitmasks. */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+	/* The second part is a transition table that maps a combination
+	 * of a state of the automaton and a character class to a state. */
+	0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+	12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+#if 0
+/* We can bring this back if we need the codepoint? */
+static inline uint32_t decode(uint32_t *state, uint32_t *codep, uint32_t byte) {
+	uint32_t type = utf8d[byte];
+
+	*codep = (*state != UTF8_ACCEPT) ?
+		(byte & 0x3fu) | (*codep << 6) :
+		(0xff >> type) & (byte);
+
+	*state = utf8d[256 + *state + type];
+	return *state;
+}
+#endif
+
+/*
+ * Advance the DFA by one input byte: map the byte to its character class,
+ * then look up the next state for (*state, class) and store it in *state.
+ */
+static inline uint32_t decode(uint32_t *state, uint32_t byte) {
+	uint32_t type = utf8d[byte];
+	*state = utf8d[256 + *state + type];
+	return *state;
+}
+
+/*
+ * END THIRD PARTY CODE
+ *
+ * See copyright notice above.
+ */
+
+int ast_utf8_is_valid(const char *src)
+{
+	uint32_t state = UTF8_ACCEPT;
+
+	/* UTF8_REJECT only transitions to itself, so no early exit is needed */
+	while (*src) {
+		decode(&state, (uint8_t) *src++);
+	}
+
+	return state == UTF8_ACCEPT;
+}
+
+int ast_utf8_is_validn(const char *src, size_t size)
+{
+	uint32_t state = UTF8_ACCEPT;
+
+	while (size && *src) {
+		decode(&state, (uint8_t) *src++);
+		size--;
+	}
+
+	return state == UTF8_ACCEPT;
+}
+
+void ast_utf8_copy_string(char *dst, const char *src, size_t size)
+{
+	uint32_t state = UTF8_ACCEPT;
+	char *last_good = dst;
+
+	ast_assert(size > 0);
+
+	if (!size) {
+		/* There is no room for even the terminating 0 byte */
+		return;
+	}
+
+	while (size && *src) {
+		if (decode(&state, (uint8_t) *src) == UTF8_REJECT) {
+			/* We _could_ replace with U+FFFD and try to recover, but for now
+			 * we treat this the same as if we had run out of space */
+			break;
+		}
+
+		*dst++ = *src++;
+		size--;
+
+		if (size && state == UTF8_ACCEPT) {
+			/* last_good is where we will ultimately write the 0 byte */
+			last_good = dst;
+		}
+	}
+
+	*last_good = '\0';
+}
+
+struct ast_utf8_validator {
+	uint32_t state;
+};
+
+int ast_utf8_validator_new(struct ast_utf8_validator **validator)
+{
+	struct ast_utf8_validator *tmp = ast_malloc(sizeof(*tmp));
+
+	if (!tmp) {
+		return -1;
+	}
+
+	tmp->state = UTF8_ACCEPT;
+	*validator = tmp;
+	return 0;
+}
+
+enum ast_utf8_validation_result ast_utf8_validator_state(
+	struct ast_utf8_validator *validator)
+{
+	switch (validator->state) {
+	case UTF8_ACCEPT:
+		return AST_UTF8_VALID;
+	case UTF8_REJECT:
+		return AST_UTF8_INVALID;
+	default:
+		return AST_UTF8_UNKNOWN;
+	}
+}
+
+enum ast_utf8_validation_result ast_utf8_validator_feed(
+	struct ast_utf8_validator *validator, const char *data)
+{
+	while (*data) {
+		decode(&validator->state, (uint8_t) *data++);
+	}
+
+	return ast_utf8_validator_state(validator);
+}
+
+enum ast_utf8_validation_result ast_utf8_validator_feedn(
+	struct ast_utf8_validator *validator, const char *data, size_t size)
+{
+	while (size && *data) {
+		decode(&validator->state, (uint8_t) *data++);
+		size--;
+	}
+
+	return ast_utf8_validator_state(validator);
+}
+
+void ast_utf8_validator_reset(struct ast_utf8_validator *validator)
+{
+	validator->state = UTF8_ACCEPT;
+}
+
+void ast_utf8_validator_destroy(struct ast_utf8_validator *validator)
+{
+	ast_free(validator);
+}
+
+#ifdef TEST_FRAMEWORK
+
+AST_TEST_DEFINE(test_utf8_is_valid)
+{
+	switch (cmd) {
+	case TEST_INIT:
+		info->name = "is_valid";
+		info->category = "/main/utf8/";
+		info->summary = "Test ast_utf8_is_valid and ast_utf8_is_validn";
+		info->description =
+			"Tests UTF-8 string validation code.";
+		return AST_TEST_NOT_RUN;
+	case TEST_EXECUTE:
+		break;
+	}
+
+	/* Valid UTF-8 */
+	ast_test_validate(test, ast_utf8_is_valid("Asterisk"));
+	ast_test_validate(test, ast_utf8_is_valid("\xce\xbb"));
+	ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b"));
+	ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e"));
+
+	/* Valid with leading */
+	ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e"));
+
+	/* Valid with trailing */
+	ast_test_validate(test, ast_utf8_is_valid("Asterisk aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("\xce\xbb aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("\xe2\x8a\x9b aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("\xf0\x9f\x93\x9e aaa"));
+
+	/* Valid with leading and trailing */
+	ast_test_validate(test, ast_utf8_is_valid("aaa Asterisk aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xce\xbb aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xe2\x8a\x9b aaa"));
+	ast_test_validate(test, ast_utf8_is_valid("aaa \xf0\x9f\x93\x9e aaa"));
+
+	/* Valid if limited by number of bytes */
+	ast_test_validate(test, ast_utf8_is_validn("Asterisk" "\xff", strlen("Asterisk")));
+	ast_test_validate(test, ast_utf8_is_validn("\xce\xbb" "\xff", strlen("\xce\xbb")));
+	ast_test_validate(test, ast_utf8_is_validn("\xe2\x8a\x9b" "\xff", strlen("\xe2\x8a\x9b")));
+	ast_test_validate(test, ast_utf8_is_validn("\xf0\x9f\x93\x9e" "\xff", strlen("\xf0\x9f\x93\x9e")));
+
+	/* Invalid */
+	ast_test_validate(test, !ast_utf8_is_valid("\xc0\x8a")); /* Overlong */
+	ast_test_validate(test, !ast_utf8_is_valid("98.6\xa7")); /* 'High ASCII' */
+	ast_test_validate(test, !ast_utf8_is_valid("\xc3\x28"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xa0\xa1"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xe2\x28\xa1"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xe2\x82\x28"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\xbc"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x90\x28\xbc"));
+	ast_test_validate(test, !ast_utf8_is_valid("\xf0\x28\x8c\x28"));
+
+	return AST_TEST_PASS;
+}
+
+static int test_copy_and_compare(const char *src, size_t dst_len, const char *cmp)
+{
+	char dst[dst_len];
+	ast_utf8_copy_string(dst, src, dst_len);
+	return strcmp(dst, cmp) == 0;
+}
+
+AST_TEST_DEFINE(test_utf8_copy_string)
+{
+	switch (cmd) {
+	case TEST_INIT:
+		info->name = "copy_string";
+		info->category = "/main/utf8/";
+		info->summary = "Test ast_utf8_copy_string";
+		info->description =
+			"Tests UTF-8 string copying code.";
+		return AST_TEST_NOT_RUN;
+	case TEST_EXECUTE:
+		break;
+	}
+
+	ast_test_validate(test, test_copy_and_compare("Asterisk", 6, "Aster"));
+	ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 11, "Asterisk "));
+	ast_test_validate(test, test_copy_and_compare("Asterisk \xc2\xae", 12, "Asterisk \xc2\xae"));
+	ast_test_validate(test, test_copy_and_compare("Asterisk \xc0\x8a", 12, "Asterisk "));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 1, ""));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 2, ""));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 3, "\xce\xbb"));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 4, "\xce\xbb "));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 5, "\xce\xbb x"));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 6, "\xce\xbb xy"));
+	ast_test_validate(test, test_copy_and_compare("\xce\xbb xyz", 7, "\xce\xbb xyz"));
+
+	return AST_TEST_PASS;
+}
+
+AST_TEST_DEFINE(test_utf8_validator)
+{
+	struct ast_utf8_validator *validator;
+
+	switch (cmd) {
+	case TEST_INIT:
+		info->name = "utf8_validator";
+		info->category = "/main/utf8/";
+		info->summary = "Test ast_utf8_validator";
+		info->description =
+			"Tests UTF-8 progressive validator code.";
+		return AST_TEST_NOT_RUN;
+	case TEST_EXECUTE:
+		break;
+	}
+
+	if (ast_utf8_validator_new(&validator)) {
+		return AST_TEST_FAIL;
+	}
+
+	/* NOTE(review): presumably ast_test_validate() returns on failure, which
+	 * would skip the destroy below and leak validator - confirm in test.h */
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "Asterisk") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc2") == AST_UTF8_UNKNOWN);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xae") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "Private") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "Branch") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "Exchange") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xe2") == AST_UTF8_UNKNOWN);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\x84") == AST_UTF8_UNKNOWN);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xbb") == AST_UTF8_VALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "\xc0\x8a") == AST_UTF8_INVALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
+	ast_test_validate(test, ast_utf8_validator_feed(validator, "valid") == AST_UTF8_INVALID);
+
+	ast_utf8_validator_destroy(validator);
+
+	return AST_TEST_PASS;
+}
+
+static void test_utf8_shutdown(void)
+{
+	AST_TEST_UNREGISTER(test_utf8_is_valid);
+	AST_TEST_UNREGISTER(test_utf8_copy_string);
+	AST_TEST_UNREGISTER(test_utf8_validator);
+}
+
+int ast_utf8_init(void)
+{
+	AST_TEST_REGISTER(test_utf8_is_valid);
+	AST_TEST_REGISTER(test_utf8_copy_string);
+	AST_TEST_REGISTER(test_utf8_validator);
+
+	ast_register_cleanup(test_utf8_shutdown);
+
+	return 0;
+}
+
+#else /* !TEST_FRAMEWORK */
+
+int ast_utf8_init(void)
+{
+	return 0;
+}
+
+#endif