From: Francis Dupont Date: Fri, 10 Jul 2020 21:05:16 +0000 (+0200) Subject: [#1304] Added latin1 -> UTF-8 encoder X-Git-Tag: Kea-1.9.0~156 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=306d488efc2ee7f394a7c54fab1b6231b25029c8;p=thirdparty%2Fkea.git [#1304] Added latin1 -> UTF-8 encoder --- diff --git a/src/lib/util/Makefile.am b/src/lib/util/Makefile.am index adf6735f26..c8085603e7 100644 --- a/src/lib/util/Makefile.am +++ b/src/lib/util/Makefile.am @@ -42,6 +42,7 @@ libkea_util_la_SOURCES += encode/base32hex_from_binary.h libkea_util_la_SOURCES += encode/base_n.cc encode/hex.h libkea_util_la_SOURCES += encode/binary_from_base32hex.h libkea_util_la_SOURCES += encode/binary_from_base16.h +libkea_util_la_SOURCES += encode/utf8.cc encode/utf8.h libkea_util_la_SOURCES += random/qid_gen.h random/qid_gen.cc libkea_util_la_SOURCES += random/random_number_generator.h @@ -94,7 +95,8 @@ libkea_util_encode_include_HEADERS = \ encode/base64.h \ encode/binary_from_base16.h \ encode/binary_from_base32hex.h \ - encode/hex.h + encode/hex.h \ + encode/utf8.h libkea_util_io_includedir = $(pkgincludedir)/util/io libkea_util_io_include_HEADERS = \ diff --git a/src/lib/util/encode/utf8.cc b/src/lib/util/encode/utf8.cc new file mode 100644 index 0000000000..0c0aadf7e7 --- /dev/null +++ b/src/lib/util/encode/utf8.cc @@ -0,0 +1,33 @@ +// Copyright (C) 2020 Internet Systems Consortium, Inc. ("ISC") +// +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+
+#include <util/encode/utf8.h>
+
+namespace isc {
+namespace util {
+namespace encode {
+
+std::vector<uint8_t> encodeUtf8(const std::string& value) {
+    std::vector<uint8_t> result;
+    // At least one output byte per input byte; avoids early regrowth.
+    result.reserve(value.size());
+    for (const char c : value) {
+        // Go through uint8_t: plain char may be signed.
+        const uint8_t ch = static_cast<uint8_t>(c);
+        if (ch < 0x80) {
+            result.push_back(ch);
+        } else {
+            // Two-byte form: 110000xx 10xxxxxx.
+            result.push_back(0xc0 | (ch >> 6));
+            result.push_back(0x80 | (ch & 0x3f));
+        }
+    }
+    return (result);
+}
+
+} // namespace encode
+} // namespace util
+} // namespace isc
diff --git a/src/lib/util/encode/utf8.h b/src/lib/util/encode/utf8.h
new file mode 100644
index 0000000000..9eda47175e
--- /dev/null
+++ b/src/lib/util/encode/utf8.h
@@ -0,0 +1,27 @@
+// Copyright (C) 2020 Internet Systems Consortium, Inc. ("ISC")
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef UTF8_H
+#define UTF8_H 1
+
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+namespace isc {
+namespace util {
+namespace encode {
+/// @brief Encode a latin1 (ISO-8859-1) string into UTF-8.
+///
+/// @param value A string in latin1: each byte is one code point (0x00-0xff).
+/// @return A vector holding the UTF-8 bytes (1 per ASCII byte, 2 otherwise).
+std::vector<uint8_t> encodeUtf8(const std::string& value);
+
+} // namespace encode
+} // namespace util
+} // namespace isc
+
+#endif // UTF8_H
diff --git a/src/lib/util/tests/Makefile.am b/src/lib/util/tests/Makefile.am
index b04d07278f..f5b3598f07 100644
--- a/src/lib/util/tests/Makefile.am
+++ b/src/lib/util/tests/Makefile.am
@@ -61,11 +61,11 @@ run_unittests_SOURCES += readwrite_mutex_unittest.cc
 run_unittests_SOURCES += signal_set_unittest.cc
 run_unittests_SOURCES += stopwatch_unittest.cc
 run_unittests_SOURCES += unlock_guard_unittests.cc
+run_unittests_SOURCES += utf8_unittest.cc
 run_unittests_SOURCES += versioned_csv_file_unittest.cc
 run_unittests_SOURCES += watch_socket_unittests.cc
 run_unittests_SOURCES += watched_thread_unittest.cc
 
-
 run_unittests_CPPFLAGS = $(AM_CPPFLAGS) $(GTEST_INCLUDES)
 run_unittests_LDFLAGS = $(AM_LDFLAGS) $(GTEST_LDFLAGS)
 
diff --git a/src/lib/util/tests/utf8_unittest.cc b/src/lib/util/tests/utf8_unittest.cc
new file mode 100644
index 0000000000..168f38bba3
--- /dev/null
+++ b/src/lib/util/tests/utf8_unittest.cc
@@ -0,0 +1,50 @@
+// Copyright (C) 2020 Internet Systems Consortium, Inc. ("ISC")
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <config.h>
+
+#include <util/encode/utf8.h>
+
+#include <gtest/gtest.h>
+
+using namespace isc::util;
+using namespace isc::util::encode;
+using namespace std;
+
+namespace {
+
+// Verify that pure ASCII input is passed through unchanged.
+TEST(Utf8Test, foobar) {
+    string str("foobar");
+    vector<uint8_t> vec8 = encodeUtf8(str);
+    ASSERT_FALSE(vec8.empty());
+    const char* start = reinterpret_cast<const char*>(&vec8[0]);
+    string str8(start, start + vec8.size());
+    EXPECT_EQ(str, str8);
+}
+
+// Verify that non-ASCII characters are encoded as expected.
+TEST(Utf8Test, eightc) {
+    string str("-\x8c-");
+    vector<uint8_t> vec8 = encodeUtf8(str);
+    ASSERT_FALSE(vec8.empty());
+    const char* start = reinterpret_cast<const char*>(&vec8[0]);
+    string str8(start, start + vec8.size());
+    string expected("-\xc2\x8c-");
+    EXPECT_EQ(expected, str8);
+}
+
+// Verify control characters pass through (explicit length keeps the NUL).
+TEST(Utf8Test, control) {
+    string str("fo\x00\n\bar", 7);
+    vector<uint8_t> vec8 = encodeUtf8(str);
+    ASSERT_EQ(7U, vec8.size());
+    const char* start = reinterpret_cast<const char*>(&vec8[0]);
+    string str8(start, start + vec8.size());
+    EXPECT_EQ(str, str8);
+}
+
+}