From: Thomas Markwalder Date: Fri, 6 Jul 2018 17:14:05 +0000 (-0400) Subject: [5680] Added sanitizeString() function to libutil X-Git-Tag: ha_phase2~39 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=39ab2f5d15f5eef162d23b2905198d794affe270;p=thirdparty%2Fkea.git [5680] Added sanitizeString() function to libutil configure.ac Added logic to detect usable C++11 regex src/lib/util/strutil.* sanitizeString() - new function that replaces all occurrences of invalid chars in a string with a specified replacement src/lib/util/tests/strutil_unittest.cc TEST(StringUtilTest, sanitizeString) - new test --- diff --git a/configure.ac b/configure.ac index bcf1c4cf33..dfa0b66459 100644 --- a/configure.ac +++ b/configure.ac @@ -571,6 +571,20 @@ AC_TRY_COMPILE([ AC_DEFINE(HAVE_SA_LEN, 1, [Define to 1 if sockaddr has a sa_len member, and corresponding sin_len and sun_len])], AC_MSG_RESULT(no)) +AC_MSG_CHECKING(for usuable C++11 regex) +AC_TRY_RUN([ +#include +#include +int main() { + const std::regex regex(".*"); + const std::string string = "This should match!"; + const auto result = std::regex_search(string, regex); + return result ? EXIT_SUCCESS : EXIT_FAILURE; +}], + [AC_MSG_RESULT(yes) + AC_DEFINE(USE_REGEX, 1, [Define to 1 if C++11 regex is usable])], + AC_MSG_RESULT(no)) + enable_gtest="no" GTEST_INCLUDES= diff --git a/src/lib/util/strutil.cc b/src/lib/util/strutil.cc index 63197eb7bb..b250d36bf4 100644 --- a/src/lib/util/strutil.cc +++ b/src/lib/util/strutil.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC") +// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC") // // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this @@ -14,9 +14,20 @@ #include #include +#include #include -#include +// Early versions of C++11 regex were buggy, use it if we +// can; otherwise, we fall back to regcomp/regexec. 
For more info see: +// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions +#ifdef USE_REGEX +#include +#else +#include +#include +#endif + +#include using namespace std; @@ -288,6 +299,81 @@ decodeFormattedHexString(const std::string& hex_string, } } +std::string +sanitizeString(const std::string& original, + const std::string& invalidChars, + const std::string& replacement) { +#ifdef USE_REGEX + std::regex rexpr; + try { + rexpr = std::regex(invalidChars, std::regex::extended); + } catch (const std::exception& ex) { + isc_throw(isc::BadValue, "invalid regex: '" + << invalidChars << "', " << ex.what()); + } + + std::stringstream result; + try { + std::regex_replace(std::ostream_iterator(result), + original.begin(), original.end(), + rexpr, replacement); + } catch (const std::exception& ex) { + isc_throw(isc::BadValue, "replacing '" << invalidChars << "' with '" + << replacement << "' in '" << original << "' failed: ," + << ex.what()); + } + + return (result.str()); +#else + // Compile the expression. + regex_t rex; + int ec = regcomp(&rex, invalidChars.c_str(), REG_EXTENDED); + if (ec) { + char errbuf[512] = ""; + static_cast(regerror(ec, &rex, errbuf, sizeof(errbuf))); + isc_throw(isc::BadValue, "invalid regex: '" << invalidChars + << "', " << errbuf); + } + + // Iterate over original string, match by match. + const char* origStr = original.c_str(); + const char* startFrom = origStr; + const char* endAt = origStr + strlen(origStr); + regmatch_t matches[2]; // n matches + 1 + stringstream result; + + while (startFrom < endAt) { + // Look for the next match + if (regexec(&rex, startFrom, 1, matches, 0) == REG_NOMATCH) { + // No matches, so add in the remainder + result << startFrom; + break; + } + + // Shouldn't happen, but one never knows eh? 
+ if (matches[0].rm_so == -1) { + isc_throw(isc::Unexpected, "matched but so is -1?"); + } + + // Add everything from starting point up to the current match + const char* matchAt = startFrom + matches[0].rm_so; + while (startFrom < matchAt) { + result << *startFrom; + ++startFrom; + } + + // Add in the replacement + result << replacement; + + // Move past the match. + ++startFrom; + } + + regfree(&rex); + return (result.str()); +#endif +} + } // namespace str } // namespace util } // namespace isc diff --git a/src/lib/util/strutil.h b/src/lib/util/strutil.h index 1dd5de01de..c31fd38ac3 100644 --- a/src/lib/util/strutil.h +++ b/src/lib/util/strutil.h @@ -255,6 +255,25 @@ void decodeFormattedHexString(const std::string& hex_string, std::vector& binary); +/// \brief Replaces all occurrences of a character set in a string +/// +/// This function runs a given string through a regular expression, +/// replacing all "matches" of that expression with the specified string. +/// +/// \param original the string to sanitize +/// \param invalidChars string containing a regular expression (POSIX +/// extended syntax) that describes the characters to replace. If you +/// wanted to sanitize hostnames for example, you could specify the +/// inversion of valid characters "[^A-Za-z0-9_-]". +/// \param replacement string of zero or more characters to use as the +/// replacement for invalid characters. 
+/// \return the new, sanitized string +/// \throw BadValue if given an invalid regular expression, Unexpected if +/// an error occurs executing the expression +std::string +sanitizeString(const std::string& original, + const std::string& invalidChars, + const std::string& replacement); } // namespace str } // namespace util diff --git a/src/lib/util/tests/strutil_unittest.cc b/src/lib/util/tests/strutil_unittest.cc index e518acb8fe..c84537120c 100644 --- a/src/lib/util/tests/strutil_unittest.cc +++ b/src/lib/util/tests/strutil_unittest.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC") +// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC") // // This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this @@ -463,4 +463,37 @@ TEST(StringUtilTest, decodeFormattedHexString) { isc::BadValue); } +// Verifies sanitizeString() function +TEST(StringUtilTest, sanitizeString) { + std::string sanitized; + + // Bad regular expression should throw. + ASSERT_THROW (sanitized = sanitizeString("just a string", "[bogus-regex",""), + BadValue); + + // A string of all valid chars should return an identical string. + ASSERT_NO_THROW (sanitized = sanitizeString("-_A--B__Cabc34567_-", "[^A-Ca-c3-7_-]","x")); + EXPECT_EQ(sanitized, "-_A--B__Cabc34567_-"); + + // Replacing with a character should work. + ASSERT_NO_THROW (sanitized = sanitizeString("A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]","*")); + EXPECT_EQ(sanitized, "A*b*c*JoE3*_x*B*Y*e"); + + // Removing (i.e. replacing with an "empty" string) should work. + ASSERT_NO_THROW (sanitized = sanitizeString("A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]","")); + EXPECT_EQ(sanitized, "AbcJoE3_xBYe"); + + // More than one non-matching in a row should work. 
+ ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]","x")); + EXPECT_EQ(sanitized, "xxAxxBxxCxx"); + + // Removing more than one non-matching in a row should work. + ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]","")); + EXPECT_EQ(sanitized, "ABC"); + + // Replacing with a string should work. + ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]","xyz")); + EXPECT_EQ(sanitized, "xyzxyzAxyzxyzBxyzxyzCxyzxyz"); +} + } // end of anonymous namespace