From: Joel Rosdahl <joel@rosdahl.net>
Date: Mon, 5 Jul 2021 12:41:48 +0000 (+0200)
Subject: Add mode parameter to split_into_* and Tokenizer
X-Git-Tag: v4.4~158
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=443a570e6d70220406fa5fcf15a296dabb71239e;p=thirdparty%2Fccache.git

Add mode parameter to split_into_* and Tokenizer
---

diff --git a/src/Util.cpp b/src/Util.cpp
index e4ad44673..81b0d86bb 100644
--- a/src/Util.cpp
+++ b/src/Util.cpp
@@ -26,7 +26,6 @@
 #include "TemporaryFile.hpp"
 #include "fmtmacros.hpp"
 
-#include <util/Tokenizer.hpp>
 #include <util/path_utils.hpp>
 
 extern "C" {
@@ -139,10 +138,12 @@ path_max(const std::string& path)
 
 template<typename T>
 std::vector<T>
-split_into(string_view input, const char* separators)
+split_into(string_view string,
+           const char* separators,
+           util::Tokenizer::Mode mode)
 {
   std::vector<T> result;
-  for (const auto token : util::Tokenizer(input, separators)) {
+  for (const auto token : util::Tokenizer(string, separators, mode)) {
     result.emplace_back(token);
   }
   return result;
@@ -1335,15 +1336,19 @@ setenv(const std::string& name, const std::string& value)
 }
 
 std::vector<string_view>
-split_into_views(string_view input, const char* separators)
+split_into_views(string_view string,
+                 const char* separators,
+                 util::Tokenizer::Mode mode)
 {
-  return split_into<string_view>(input, separators);
+  return split_into<string_view>(string, separators, mode);
 }
 
 std::vector<std::string>
-split_into_strings(string_view input, const char* separators)
+split_into_strings(string_view string,
+                   const char* separators,
+                   util::Tokenizer::Mode mode)
 {
-  return split_into<std::string>(input, separators);
+  return split_into<std::string>(string, separators, mode);
 }
 
 std::string
diff --git a/src/Util.hpp b/src/Util.hpp
index 0d2504670..d31657c79 100644
--- a/src/Util.hpp
+++ b/src/Util.hpp
@@ -22,6 +22,8 @@
 
 #include "CacheFile.hpp"
 
+#include <util/Tokenizer.hpp>
+
 #include "third_party/nonstd/optional.hpp"
 #include "third_party/nonstd/string_view.hpp"
 
@@ -418,15 +420,19 @@ size_change_kibibyte(const Stat& old_stat, const Stat& new_stat)
          / 1024;
 }
 
-// Split `input` into words at any of the characters listed in `separators`.
-// These words are a view into `input`; empty words are omitted. `separators`
-// must neither be the empty string nor a nullptr.
-std::vector<nonstd::string_view> split_into_views(nonstd::string_view input,
-                                                  const char* separators);
-
-// Same as `split_into_views` but the words are copied from `input`.
-std::vector<std::string> split_into_strings(nonstd::string_view input,
-                                            const char* separators);
+// Split `string` into tokens (views into `string`) at any of the characters in
+// `separators`, which must be neither empty nor a nullptr. `mode` controls how
+// empty tokens are handled (skipped by default).
+std::vector<nonstd::string_view> split_into_views(
+  nonstd::string_view string,
+  const char* separators,
+  util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
+
+// Same as `split_into_views` but the tokens are copied from `string`.
+std::vector<std::string> split_into_strings(
+  nonstd::string_view string,
+  const char* separators,
+  util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
 
 // Return true if `prefix` is a prefix of `string`.
 inline bool
diff --git a/src/util/Tokenizer.cpp b/src/util/Tokenizer.cpp
index c20140473..9b27c8fb2 100644
--- a/src/util/Tokenizer.cpp
+++ b/src/util/Tokenizer.cpp
@@ -20,24 +20,34 @@
 
 namespace util {
 
-Tokenizer::Iterator
-Tokenizer::Iterator::operator++()
+void
+Tokenizer::Iterator::advance(bool initial)
 {
-  if (m_pos >= m_string.size()) {
-    return *this;
-  }
+  constexpr auto npos = nonstd::string_view::npos;
+  const auto string = m_tokenizer.m_string;
+  const auto delimiters = m_tokenizer.m_delimiters;
+  const auto mode = m_tokenizer.m_mode;
+
+  DEBUG_ASSERT(m_left <= m_right);
+  DEBUG_ASSERT(m_right <= string.length());
 
-  m_pos = m_string.find_first_not_of(m_delimiters, m_pos + m_count);
-  if (m_pos == nonstd::string_view::npos) {
-    m_pos = m_string.size();
-    m_count = 0;
-  } else {
-    m_count = m_string.substr(m_pos).find_first_of(m_delimiters);
-    if (m_count == nonstd::string_view::npos) {
-      m_count = m_string.size() - m_pos;
+  do {
+    if (initial) {
+      initial = false;
+    } else if (m_right == string.length()) {
+      m_left = npos;
+    } else {
+      m_left = m_right + 1;
     }
+    if (m_left != npos) {
+      const auto delim_pos = string.find_first_of(delimiters, m_left);
+      m_right = delim_pos == npos ? string.length() : delim_pos;
+    }
+  } while (mode == Mode::skip_empty && m_left == m_right);
+
+  if (mode == Mode::skip_last_empty && m_left == string.length()) {
+    m_left = npos;
   }
-  return *this;
 }
 
 } // namespace util
diff --git a/src/util/Tokenizer.hpp b/src/util/Tokenizer.hpp
index 3ee3512f0..90cb0c09c 100644
--- a/src/util/Tokenizer.hpp
+++ b/src/util/Tokenizer.hpp
@@ -18,85 +18,113 @@
 
 #pragma once
 
+#include <assertions.hpp>
+
+#include <third_party/fmt/core.h>
 #include <third_party/nonstd/optional.hpp>
 #include <third_party/nonstd/string_view.hpp>
 
 namespace util {
 
+// An instance of this class can be used in a range-based for loop to split a
+// string into tokens at any of the characters in a string of delimiters.
 class Tokenizer
 {
 public:
-  Tokenizer(nonstd::string_view string, const char* delimiters);
+  enum class Mode {
+    include_empty,   // Include empty tokens.
+    skip_empty,      // Skip empty tokens.
+    skip_last_empty, // Include empty tokens, except a trailing one.
+  };
+
+  // Split `string` into tokens at any of the characters in `delimiters` which
+  // must neither be the empty string nor a nullptr.
+  Tokenizer(nonstd::string_view string,
+            const char* delimiters,
+            Mode mode = Mode::skip_empty);
 
   class Iterator
   {
   public:
-    Iterator(nonstd::string_view string,
-             const char* delimiters,
-             size_t start_pos);
+    Iterator(const Tokenizer& tokenizer, size_t start_pos);
 
     Iterator operator++();
     bool operator!=(const Iterator& other) const;
     nonstd::string_view operator*() const;
 
   private:
-    const nonstd::string_view m_string;
-    const char* const m_delimiters;
-    size_t m_pos;
-    size_t m_count = 0;
+    const Tokenizer& m_tokenizer;
+    size_t m_left;
+    size_t m_right;
+
+    void advance(bool initial);
   };
 
   Iterator begin();
   Iterator end();
 
 private:
+  friend Iterator;
+
   const nonstd::string_view m_string;
   const char* const m_delimiters;
+  const Mode m_mode;
 };
 
 inline Tokenizer::Tokenizer(const nonstd::string_view string,
-                            const char* const delimiters)
+                            const char* const delimiters,
+                            const Tokenizer::Mode mode)
   : m_string(string),
-    m_delimiters(delimiters)
+    m_delimiters(delimiters),
+    m_mode(mode)
 {
-  assert(delimiters != nullptr && delimiters[0] != '\0');
+  DEBUG_ASSERT(delimiters != nullptr && delimiters[0] != '\0');
 }
 
-inline Tokenizer::Iterator::Iterator(const nonstd::string_view string,
-                                     const char* const delimiters,
+inline Tokenizer::Iterator::Iterator(const Tokenizer& tokenizer,
                                      const size_t start_pos)
-  : m_string(string),
-    m_delimiters(delimiters),
-    m_pos(start_pos)
+  : m_tokenizer(tokenizer),
+    m_left(start_pos),
+    m_right(start_pos)
+{
+  if (start_pos == 0) {
+    advance(true);
+  } else {
+    DEBUG_ASSERT(start_pos == nonstd::string_view::npos);
+  }
+}
+
+inline Tokenizer::Iterator
+Tokenizer::Iterator::operator++()
 {
-  ++*this;
+  advance(false);
+  return *this;
 }
 
 inline bool
 Tokenizer::Iterator::operator!=(const Iterator& other) const
 {
-  assert(m_string.data() == other.m_string.data());
-  assert(m_delimiters == other.m_delimiters);
-  return m_pos != other.m_pos || m_count != other.m_count;
+  return &m_tokenizer != &other.m_tokenizer || m_left != other.m_left;
 }
 
 inline nonstd::string_view
 Tokenizer::Iterator::operator*() const
 {
-  assert(m_pos < m_string.size());
-  return m_string.substr(m_pos, m_count);
+  DEBUG_ASSERT(m_left <= m_right);
+  DEBUG_ASSERT(m_right <= m_tokenizer.m_string.length());
+  return m_tokenizer.m_string.substr(m_left, m_right - m_left);
 }
 
 inline Tokenizer::Iterator
 Tokenizer::begin()
 {
-  return Iterator(m_string, m_delimiters, 0);
+  return Iterator(*this, 0);
 }
 
 inline Tokenizer::Iterator
 Tokenizer::end()
 {
-  return Iterator(m_string, m_delimiters, m_string.size());
+  return Iterator(*this, nonstd::string_view::npos);
 }
 
 } // namespace util
diff --git a/unittest/test_Util.cpp b/unittest/test_Util.cpp
index 07cdc2183..a96e64456 100644
--- a/unittest/test_Util.cpp
+++ b/unittest/test_Util.cpp
@@ -848,7 +848,7 @@ TEST_CASE("Util::same_program_name")
 }
 
 // Util::split_into_strings and Util::split_into_views are tested implicitly in
-// test_Tokenizer.cpp.
+// test_util_Tokenizer.cpp.
 
 TEST_CASE("Util::starts_with")
 {
diff --git a/unittest/test_util_Tokenizer.cpp b/unittest/test_util_Tokenizer.cpp
index 9a7801f0c..76c9e027c 100644
--- a/unittest/test_util_Tokenizer.cpp
+++ b/unittest/test_util_Tokenizer.cpp
@@ -22,38 +22,117 @@
 
 TEST_CASE("util::Tokenizer")
 {
-  CHECK(Util::split_into_views("", "/").empty());
-  CHECK(Util::split_into_views("///", "/").empty());
-  {
-    const auto s = Util::split_into_views("a/b", "/");
-    REQUIRE(s.size() == 2);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-  }
-  {
-    const auto s = Util::split_into_views("a/b", "x");
-    REQUIRE(s.size() == 1);
-    CHECK(s[0] == "a/b");
-  }
+  using Mode = util::Tokenizer::Mode;
+
+  SUBCASE("include empty tokens")
   {
-    const auto s = Util::split_into_views("a/b:c", "/:");
-    REQUIRE(s.size() == 3);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-    CHECK(s[2] == "c");
+    {
+      const auto s = Util::split_into_views("", "/", Mode::include_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "");
+    }
+    {
+      const auto s = Util::split_into_views("/", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "");
+    }
+    {
+      const auto s = Util::split_into_views("a/", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "");
+    }
+    {
+      const auto s = Util::split_into_views("/b", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("/a:", "/:", Mode::include_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "a");
+      CHECK(s[2] == "");
+    }
   }
+
+  SUBCASE("skip empty")
   {
-    const auto s = Util::split_into_views(":a//b..:.c/:/.", "/:.");
-    REQUIRE(s.size() == 3);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-    CHECK(s[2] == "c");
+    CHECK(Util::split_into_views("", "/", Mode::skip_empty).empty());
+    CHECK(Util::split_into_views("///", "/", Mode::skip_empty).empty());
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::skip_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "x", Mode::skip_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "a/b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b:c", "/:", Mode::skip_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+      CHECK(s[2] == "c");
+    }
+    {
+      const auto s =
+        Util::split_into_views(":a//b..:.c/:/.", "/:.", Mode::skip_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+      CHECK(s[2] == "c");
+    }
+    {
+      const auto s = Util::split_into_views(
+        ".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef", Mode::skip_empty);
+      REQUIRE(s.size() == 10);
+      CHECK(s[0] == "0");
+      CHECK(s[9] == "9");
+    }
   }
+
+  SUBCASE("skip last empty token")
   {
-    const auto s =
-      Util::split_into_views(".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef");
-    REQUIRE(s.size() == 10);
-    CHECK(s[0] == "0");
-    CHECK(s[9] == "9");
+    CHECK(Util::split_into_views("", "/", Mode::skip_last_empty).empty());
+    {
+      const auto s = Util::split_into_views("/", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "");
+    }
+    {
+      const auto s = Util::split_into_views("a/", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "a");
+    }
+    {
+      const auto s = Util::split_into_views("/b", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("/a:", "/:", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "a");
+    }
   }
 }