[5680] Added sanitizeString() function to libutil

author Thomas Markwalder <tmark@isc.org>

Fri, 6 Jul 2018 17:14:05 +0000 (13:14 -0400)

committer Tomek Mrugalski <tomasz@isc.org>

Fri, 27 Jul 2018 11:54:10 +0000 (13:54 +0200)
author Thomas Markwalder <tmark@isc.org>
Fri, 6 Jul 2018 17:14:05 +0000 (13:14 -0400)
committer Tomek Mrugalski <tomasz@isc.org>
Fri, 27 Jul 2018 11:54:10 +0000 (13:54 +0200)
diff --git a/configure.ac b/configure.ac

index bcf1c4cf33d44735a65f9efba8c13cd34bd9a56c..dfa0b664599a72a2bcc91bfc82a24ce644d59ed1 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -571,6 +571,20 @@ AC_TRY_COMPILE([
          AC_DEFINE(HAVE_SA_LEN, 1, [Define to 1 if sockaddr has a sa_len member, and corresponding sin_len and sun_len])],
          AC_MSG_RESULT(no))
  
+# Define USE_REGEX only if a trivial C++11 regex search works at run time.
+AC_MSG_CHECKING(for usable C++11 regex)
+AC_TRY_RUN([
+#include <cstdlib>
+#include <regex>
+int main() {
+  const std::regex regex(".*");
+  const char* const text = "This should match!";
+  return std::regex_search(text, regex) ? EXIT_SUCCESS : EXIT_FAILURE;
+}],
+        [AC_MSG_RESULT(yes)
+        AC_DEFINE(USE_REGEX, 1, [Define to 1 if C++11 regex is usable])],
+        [AC_MSG_RESULT(no)], [AC_MSG_RESULT([no (cross compiling)])])
+
  enable_gtest="no"
  GTEST_INCLUDES=
  
diff --git a/src/lib/util/strutil.cc b/src/lib/util/strutil.cc

index 63197eb7bba6ac4ea2288899d40192c40f46dcfa..b250d36bf4213595a7cf36963b6f2b4aee9182e5 100644 (file)
--- a/src/lib/util/strutil.cc
+++ b/src/lib/util/strutil.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC")
  //
  // This Source Code Form is subject to the terms of the Mozilla Public
  // License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -14,9 +14,20 @@
  #include <boost/algorithm/string/split.hpp>
  
  #include <numeric>
+#include <iostream>
  #include <sstream>
-#include <string.h>
  
+// Early implementations of C++11 <regex> were buggy, so we use it only when
+// configure found it usable; otherwise we fall back to regcomp/regexec.  See:
+// https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
+#ifdef USE_REGEX
+#include <regex>
+#else
+#include <sys/types.h>
+#include <regex.h>
+#endif
+
+#include <string.h>
  
  using namespace std;
  
@@ -288,6 +299,81 @@ decodeFormattedHexString(const std::string& hex_string,
      }
  }
  
+std::string
+sanitizeString(const std::string& original,
+                            const std::string& invalidChars,
+                            const std::string& replacement) {
+#ifdef USE_REGEX
+    std::regex rexpr;
+    try {
+        rexpr = std::regex(invalidChars, std::regex::extended);
+    } catch (const std::exception& ex) {
+        isc_throw(isc::BadValue, "invalid regex: '"
+                  << invalidChars << "', " << ex.what());
+    }
+
+    std::stringstream result;
+    try {
+        std::regex_replace(std::ostream_iterator<char>(result),
+                           original.begin(), original.end(),
+                           rexpr, replacement);
+    } catch (const std::exception& ex) {
+        isc_throw(isc::BadValue, "replacing '" << invalidChars << "' with '"
+                   << replacement << "' in '" << original << "' failed: ,"
+                   << ex.what());
+    }
+
+    return (result.str());
+#else
+    // Compile the expression.
+    regex_t rex;
+    int ec = regcomp(&rex, invalidChars.c_str(), REG_EXTENDED);
+    if (ec) {
+        char errbuf[512] = "";
+        static_cast<void>(regerror(ec, &rex, errbuf, sizeof(errbuf)));
+        isc_throw(isc::BadValue, "invalid regex: '" << invalidChars
+                  << "', " << errbuf);
+    }
+
+    // Iterate over original string, match by match.
+    const char* origStr = original.c_str();
+    const char* startFrom = origStr;
+    const char* endAt = origStr + strlen(origStr);
+    regmatch_t matches[2];  // n matches + 1
+    stringstream result;
+
+    while (startFrom < endAt) {
+        // Look for the next match
+        if (regexec(&rex, startFrom, 1, matches, 0) == REG_NOMATCH) {
+            // No matches, so add in the remainder
+            result << startFrom;
+            break;
+        }
+
+        // Shouldn't happen, but one never knows eh?
+        if (matches[0].rm_so == -1) {
+            isc_throw(isc::Unexpected, "matched but so is -1?");
+        }
+
+        // Add everything from starting point up to the current match
+        const char* matchAt = startFrom + matches[0].rm_so;
+        while (startFrom < matchAt) {
+            result << *startFrom;
+            ++startFrom;
+        }
+
+        // Add in the replacement
+        result << replacement;
+
+        // Move past the match.
+        ++startFrom;
+    }
+
+    regfree(&rex);
+    return (result.str());
+#endif
+}
+
  } // namespace str
  } // namespace util
  } // namespace isc
diff --git a/src/lib/util/strutil.h b/src/lib/util/strutil.h

index 1dd5de01de933735ec468280b44402256e1eb00a..c31fd38ac356392cece2c3f6adb763113b96f7c2 100644 (file)
--- a/src/lib/util/strutil.h
+++ b/src/lib/util/strutil.h
@@ -255,6 +255,25 @@ void
  decodeFormattedHexString(const std::string& hex_string,
                           std::vector<uint8_t>& binary);
  
+/// \brief Replaces all occurrences of a character set in a string
+///
+/// This function runs a given string through a regular expression,
+/// replacing all "matches" of that expression with the specified string.
+///
+/// \param original the string to sanitize
+/// \param invalidChars  string containing a regular expression (POSIX
+/// extended syntax) that describes the characters to replace.  If you
+/// wanted to sanitize hostnames for example, you could specify the
+/// inversion of valid characters "[^A-Za-z0-9_-]".
+/// \param replacement string of zero or more characters to use as the
+/// replacement for invalid characters; an empty string removes them.
+/// \return the new, sanitized string
+/// \throw BadValue if the expression is invalid or (C++11 regex build) the
+/// replacement fails, Unexpected if (POSIX build) a match has no offset
+std::string
+sanitizeString(const std::string& original,
+               const std::string& invalidChars,
+               const std::string& replacement);
  
  } // namespace str
  } // namespace util
diff --git a/src/lib/util/tests/strutil_unittest.cc b/src/lib/util/tests/strutil_unittest.cc

index e518acb8fe3260ee64d6ea66b56dfde212fee7ee..c84537120ca10a26f4d3ef2fb2d62d101a11dd77 100644 (file)
--- a/src/lib/util/tests/strutil_unittest.cc
+++ b/src/lib/util/tests/strutil_unittest.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2011-2017 Internet Systems Consortium, Inc. ("ISC")
+// Copyright (C) 2011-2018 Internet Systems Consortium, Inc. ("ISC")
  //
  // This Source Code Form is subject to the terms of the Mozilla Public
  // License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -463,4 +463,37 @@ TEST(StringUtilTest, decodeFormattedHexString) {
                   isc::BadValue);
  }
  
+// Verifies sanitizeString(): bad regex handling, replacement and removal.
+TEST(StringUtilTest, sanitizeString) {
+    std::string sanitized;
+
+    // A malformed regular expression (unterminated bracket) should throw.
+    ASSERT_THROW (sanitized = sanitizeString("just a string", "[bogus-regex",""),
+                  BadValue);
+
+    // A string of all valid chars should return an identical string.
+    ASSERT_NO_THROW (sanitized = sanitizeString("-_A--B__Cabc34567_-", "[^A-Ca-c3-7_-]","x"));
+    EXPECT_EQ(sanitized, "-_A--B__Cabc34567_-");
+
+    // Replacing with a character should work ("\12" is an octal newline).
+    ASSERT_NO_THROW (sanitized = sanitizeString("A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]","*"));
+    EXPECT_EQ(sanitized, "A*b*c*JoE3*_x*B*Y*e");
+
+    // Removing (i.e. replacing with an "empty" string) should work.
+    ASSERT_NO_THROW (sanitized = sanitizeString("A[b]c\12JoE3-_x!B$Y#e", "[^A-Za-z0-9_]",""));
+    EXPECT_EQ(sanitized, "AbcJoE3_xBYe");
+
+    // More than one invalid character in a row should work.
+    ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]","x"));
+    EXPECT_EQ(sanitized, "xxAxxBxxCxx");
+
+    // Removing more than one invalid character in a row should work.
+    ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]",""));
+    EXPECT_EQ(sanitized, "ABC");
+
+    // Replacing each invalid character with a multi-character string should work.
+    ASSERT_NO_THROW (sanitized = sanitizeString("%%A%%B%%C%%", "[^A-Za-z0-9_]","xyz"));
+    EXPECT_EQ(sanitized, "xyzxyzAxyzxyzBxyzxyzCxyzxyz");
+}
+
  } // end of anonymous namespace
author	Thomas Markwalder <tmark@isc.org>
	Fri, 6 Jul 2018 17:14:05 +0000 (13:14 -0400)
committer	Tomek Mrugalski <tomasz@isc.org>
	Fri, 27 Jul 2018 11:54:10 +0000 (13:54 +0200)
configure.ac		patch \| blob \| blame \| history
src/lib/util/strutil.cc		patch \| blob \| blame \| history
src/lib/util/strutil.h		patch \| blob \| blame \| history
src/lib/util/tests/strutil_unittest.cc		patch \| blob \| blame \| history