Add mode parameter to split_into_* and Tokenizer

author Joel Rosdahl <joel@rosdahl.net>

Mon, 5 Jul 2021 12:41:48 +0000 (14:41 +0200)

committer Joel Rosdahl <joel@rosdahl.net>

Mon, 5 Jul 2021 17:06:25 +0000 (19:06 +0200)
author Joel Rosdahl <joel@rosdahl.net>
Mon, 5 Jul 2021 12:41:48 +0000 (14:41 +0200)
committer Joel Rosdahl <joel@rosdahl.net>
Mon, 5 Jul 2021 17:06:25 +0000 (19:06 +0200)
diff --git a/src/Util.cpp b/src/Util.cpp

index e4ad446736bd2cb7d0a7e3d781e72f36ce39e6b7..81b0d86bb21b74e1288f850849a290efe41c7fd0 100644 (file)
--- a/src/Util.cpp
+++ b/src/Util.cpp
@@ -26,7 +26,6 @@
  #include "TemporaryFile.hpp"
  #include "fmtmacros.hpp"
  
-#include <util/Tokenizer.hpp>
  #include <util/path_utils.hpp>
  
  extern "C" {
@@ -139,10 +138,12 @@ path_max(const std::string& path)
  
  template<typename T>
  std::vector<T>
-split_into(string_view input, const char* separators)
+split_into(string_view string,
+           const char* separators,
+           util::Tokenizer::Mode mode)
  {
    std::vector<T> result;
-  for (const auto token : util::Tokenizer(input, separators)) {
+  for (const auto token : util::Tokenizer(string, separators, mode)) {
      result.emplace_back(token);
    }
    return result;
@@ -1335,15 +1336,19 @@ setenv(const std::string& name, const std::string& value)
  }
  
  std::vector<string_view>
-split_into_views(string_view input, const char* separators)
+split_into_views(string_view string,
+                 const char* separators,
+                 util::Tokenizer::Mode mode)
  {
-  return split_into<string_view>(input, separators);
+  return split_into<string_view>(string, separators, mode);
  }
  
  std::vector<std::string>
-split_into_strings(string_view input, const char* separators)
+split_into_strings(string_view string,
+                   const char* separators,
+                   util::Tokenizer::Mode mode)
  {
-  return split_into<std::string>(input, separators);
+  return split_into<std::string>(string, separators, mode);
  }
  
  std::string
diff --git a/src/Util.hpp b/src/Util.hpp

index 0d25046702e6c2add3fd4a55b6839af1309c5d91..d31657c796504d6177ad015801c0be6fe3642a1d 100644 (file)
--- a/src/Util.hpp
+++ b/src/Util.hpp
@@ -22,6 +22,8 @@
  
  #include "CacheFile.hpp"
  
+#include <util/Tokenizer.hpp>
+
  #include "third_party/nonstd/optional.hpp"
  #include "third_party/nonstd/string_view.hpp"
  
@@ -418,15 +420,19 @@ size_change_kibibyte(const Stat& old_stat, const Stat& new_stat)
           / 1024;
  }
  
-// Split `input` into words at any of the characters listed in `separators`.
-// These words are a view into `input`; empty words are omitted. `separators`
-// must neither be the empty string nor a nullptr.
-std::vector<nonstd::string_view> split_into_views(nonstd::string_view input,
-                                                  const char* separators);
-
-// Same as `split_into_views` but the words are copied from `input`.
-std::vector<std::string> split_into_strings(nonstd::string_view input,
-                                            const char* separators);
+// Split `string` into tokens (views into `string`) at any of the characters in
+// `separators`, which must be neither empty nor a nullptr. `mode` decides how
+// empty tokens are treated; by default they are skipped.
+std::vector<nonstd::string_view> split_into_views(
+  nonstd::string_view string,
+  const char* separators,
+  util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
+
+// Same as `split_into_views` but the tokens are copied from `string`.
+std::vector<std::string> split_into_strings(
+  nonstd::string_view string,
+  const char* separators,
+  util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
  
  // Return true if `prefix` is a prefix of `string`.
  inline bool
diff --git a/src/util/Tokenizer.cpp b/src/util/Tokenizer.cpp

index c20140473344b1cc2041837828db6484ecbd3779..9b27c8fb2643acbeba9c77f26ef073ddf72cb75b 100644 (file)
--- a/src/util/Tokenizer.cpp
+++ b/src/util/Tokenizer.cpp
@@ -20,24 +20,34 @@
  
  namespace util {
  
-Tokenizer::Iterator
-Tokenizer::Iterator::operator++()
+void
+Tokenizer::Iterator::advance(bool initial)
  {
-  if (m_pos >= m_string.size()) {
-    return *this;
-  }
+  constexpr auto npos = nonstd::string_view::npos;
+  const auto string = m_tokenizer.m_string;
+  const auto delimiters = m_tokenizer.m_delimiters;
+  const auto mode = m_tokenizer.m_mode;
+
+  DEBUG_ASSERT(m_left <= m_right);
+  DEBUG_ASSERT(m_right <= string.length());
  
-  m_pos = m_string.find_first_not_of(m_delimiters, m_pos + m_count);
-  if (m_pos == nonstd::string_view::npos) {
-    m_pos = m_string.size();
-    m_count = 0;
-  } else {
-    m_count = m_string.substr(m_pos).find_first_of(m_delimiters);
-    if (m_count == nonstd::string_view::npos) {
-      m_count = m_string.size() - m_pos;
+  do {
+    if (initial) {
+      initial = false;
+    } else if (m_right == string.length()) {
+      m_left = npos;
+    } else {
+      m_left = m_right + 1;
      }
+    if (m_left != npos) {
+      const auto delim_pos = string.find_first_of(delimiters, m_left);
+      m_right = delim_pos == npos ? string.length() : delim_pos;
+    }
+  } while (mode == Mode::skip_empty && m_left == m_right);
+
+  if (mode == Mode::skip_last_empty && m_left == string.length()) {
+    m_left = npos;
    }
-  return *this;
  }
  
  } // namespace util
diff --git a/src/util/Tokenizer.hpp b/src/util/Tokenizer.hpp

index 3ee3512f0013129a81314e65185bb2c6c1e6d169..90cb0c09cd2c767f35e125b82d83b596c6683698 100644 (file)
--- a/src/util/Tokenizer.hpp
+++ b/src/util/Tokenizer.hpp
@@ -18,85 +18,113 @@
  
  #pragma once
  
+#include <assertions.hpp>
+
+#include <third_party/fmt/core.h>
  #include <third_party/nonstd/optional.hpp>
  #include <third_party/nonstd/string_view.hpp>
  
  namespace util {
  
+// An instance of this class can be used in a range-based for loop to split a
+// string into tokens at any of the characters in a string of delimiters.
  class Tokenizer
  {
  public:
-  Tokenizer(nonstd::string_view string, const char* delimiters);
+  enum class Mode {
+    include_empty,   // Include empty tokens.
+    skip_empty,      // Skip empty tokens.
+    skip_last_empty, // Include empty tokens except a trailing one.
+  };
+
+  // Split `string` into tokens at any of the characters in `delimiters`,
+  // which must be neither the empty string nor a nullptr.
+  Tokenizer(nonstd::string_view string,
+            const char* delimiters,
+            Mode mode = Mode::skip_empty);
  
    class Iterator
    {
    public:
-    Iterator(nonstd::string_view string,
-             const char* delimiters,
-             size_t start_pos);
+    Iterator(const Tokenizer& tokenizer, size_t start_pos);
  
      Iterator operator++();
      bool operator!=(const Iterator& other) const;
      nonstd::string_view operator*() const;
  
    private:
-    const nonstd::string_view m_string;
-    const char* const m_delimiters;
-    size_t m_pos;
-    size_t m_count = 0;
+    const Tokenizer& m_tokenizer;
+    size_t m_left;
+    size_t m_right;
+
+    void advance(bool initial);
    };
  
    Iterator begin();
    Iterator end();
  
  private:
+  friend Iterator;
+
    const nonstd::string_view m_string;
    const char* const m_delimiters;
+  const Mode m_mode;
  };
  
  inline Tokenizer::Tokenizer(const nonstd::string_view string,
-                            const char* const delimiters)
+                            const char* const delimiters,
+                            const Tokenizer::Mode mode)
    : m_string(string),
-    m_delimiters(delimiters)
+    m_delimiters(delimiters),
+    m_mode(mode)
  {
-  assert(delimiters != nullptr && delimiters[0] != '\0');
+  DEBUG_ASSERT(delimiters != nullptr && delimiters[0] != '\0');
  }
  
-inline Tokenizer::Iterator::Iterator(const nonstd::string_view string,
-                                     const char* const delimiters,
+inline Tokenizer::Iterator::Iterator(const Tokenizer& tokenizer,
                                       const size_t start_pos)
-  : m_string(string),
-    m_delimiters(delimiters),
-    m_pos(start_pos)
+  : m_tokenizer(tokenizer),
+    m_left(start_pos),
+    m_right(start_pos)
+{
+  if (start_pos == 0) {
+    advance(true);
+  } else {
+    DEBUG_ASSERT(start_pos == nonstd::string_view::npos);
+  }
+}
+
+inline Tokenizer::Iterator
+Tokenizer::Iterator::operator++()
  {
-  ++*this;
+  advance(false);
+  return *this;
  }
  
  inline bool
  Tokenizer::Iterator::operator!=(const Iterator& other) const
  {
-  assert(m_string.data() == other.m_string.data());
-  assert(m_delimiters == other.m_delimiters);
-  return m_pos != other.m_pos || m_count != other.m_count;
+  return &m_tokenizer != &other.m_tokenizer || m_left != other.m_left;
  }
  
  inline nonstd::string_view
  Tokenizer::Iterator::operator*() const
  {
-  assert(m_pos < m_string.size());
-  return m_string.substr(m_pos, m_count);
+  DEBUG_ASSERT(m_left <= m_right);
+  DEBUG_ASSERT(m_right <= m_tokenizer.m_string.length());
+  return m_tokenizer.m_string.substr(m_left, m_right - m_left);
  }
  
  inline Tokenizer::Iterator
  Tokenizer::begin()
  {
-  return Iterator(m_string, m_delimiters, 0);
+  return Iterator(*this, 0);
  }
  
  inline Tokenizer::Iterator
  Tokenizer::end()
  {
-  return Iterator(m_string, m_delimiters, m_string.size());
+  return Iterator(*this, nonstd::string_view::npos);
  }
  
  } // namespace util
diff --git a/unittest/test_Util.cpp b/unittest/test_Util.cpp

index 07cdc21830f17af90002b18933498479c3aaf46b..a96e64456b90f45527c713e5667166f7369680c5 100644 (file)
--- a/unittest/test_Util.cpp
+++ b/unittest/test_Util.cpp
@@ -848,7 +848,7 @@ TEST_CASE("Util::same_program_name")
  }
  
  // Util::split_into_strings and Util::split_into_views are tested implicitly in
-// test_Tokenizer.cpp.
+// test_util_Tokenizer.cpp.
  
  TEST_CASE("Util::starts_with")
  {
diff --git a/unittest/test_util_Tokenizer.cpp b/unittest/test_util_Tokenizer.cpp

index 9a7801f0cec18903aa021f458685e634b8ad2d2f..76c9e027c6f74671033a0f8de774605fa5975a0b 100644 (file)
--- a/unittest/test_util_Tokenizer.cpp
+++ b/unittest/test_util_Tokenizer.cpp
@@ -22,38 +22,117 @@
  
  TEST_CASE("util::Tokenizer")
  {
-  CHECK(Util::split_into_views("", "/").empty());
-  CHECK(Util::split_into_views("///", "/").empty());
-  {
-    const auto s = Util::split_into_views("a/b", "/");
-    REQUIRE(s.size() == 2);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-  }
-  {
-    const auto s = Util::split_into_views("a/b", "x");
-    REQUIRE(s.size() == 1);
-    CHECK(s[0] == "a/b");
-  }
+  using Mode = util::Tokenizer::Mode;
+
+  SUBCASE("include empty tokens")
    {
-    const auto s = Util::split_into_views("a/b:c", "/:");
-    REQUIRE(s.size() == 3);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-    CHECK(s[2] == "c");
+    {
+      const auto s = Util::split_into_views("", "/", Mode::include_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "");
+    }
+    {
+      const auto s = Util::split_into_views("/", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "");
+    }
+    {
+      const auto s = Util::split_into_views("a/", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "");
+    }
+    {
+      const auto s = Util::split_into_views("/b", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::include_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("/a:", "/:", Mode::include_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "a");
+      CHECK(s[2] == "");
+    }
    }
+
+  SUBCASE("skip empty")
    {
-    const auto s = Util::split_into_views(":a//b..:.c/:/.", "/:.");
-    REQUIRE(s.size() == 3);
-    CHECK(s[0] == "a");
-    CHECK(s[1] == "b");
-    CHECK(s[2] == "c");
+    CHECK(Util::split_into_views("", "/", Mode::skip_empty).empty());
+    CHECK(Util::split_into_views("///", "/", Mode::skip_empty).empty());
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::skip_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "x", Mode::skip_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "a/b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b:c", "/:", Mode::skip_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+      CHECK(s[2] == "c");
+    }
+    {
+      const auto s =
+        Util::split_into_views(":a//b..:.c/:/.", "/:.", Mode::skip_empty);
+      REQUIRE(s.size() == 3);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+      CHECK(s[2] == "c");
+    }
+    {
+      const auto s = Util::split_into_views(
+        ".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef", Mode::skip_empty);
+      REQUIRE(s.size() == 10);
+      CHECK(s[0] == "0");
+      CHECK(s[9] == "9");
+    }
    }
+
+  SUBCASE("skip last empty token")
    {
-    const auto s =
-      Util::split_into_views(".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef");
-    REQUIRE(s.size() == 10);
-    CHECK(s[0] == "0");
-    CHECK(s[9] == "9");
+    CHECK(Util::split_into_views("", "/", Mode::skip_last_empty).empty());
+    {
+      const auto s = Util::split_into_views("/", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "");
+    }
+    {
+      const auto s = Util::split_into_views("a/", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 1);
+      CHECK(s[0] == "a");
+    }
+    {
+      const auto s = Util::split_into_views("/b", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("a/b", "/", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "a");
+      CHECK(s[1] == "b");
+    }
+    {
+      const auto s = Util::split_into_views("/a:", "/:", Mode::skip_last_empty);
+      REQUIRE(s.size() == 2);
+      CHECK(s[0] == "");
+      CHECK(s[1] == "a");
+    }
    }
  }
author	Joel Rosdahl <joel@rosdahl.net>
	Mon, 5 Jul 2021 12:41:48 +0000 (14:41 +0200)
committer	Joel Rosdahl <joel@rosdahl.net>
	Mon, 5 Jul 2021 17:06:25 +0000 (19:06 +0200)
src/Util.cpp		patch \| blob \| blame \| history
src/Util.hpp		patch \| blob \| blame \| history
src/util/Tokenizer.cpp		patch \| blob \| blame \| history
src/util/Tokenizer.hpp		patch \| blob \| blame \| history
unittest/test_Util.cpp		patch \| blob \| blame \| history
unittest/test_util_Tokenizer.cpp		patch \| blob \| blame \| history