From: Joel Rosdahl Date: Mon, 5 Jul 2021 12:41:48 +0000 (+0200) Subject: Add mode parameter to split_into_* and Tokenizer X-Git-Tag: v4.4~158 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=443a570e6d70220406fa5fcf15a296dabb71239e;p=thirdparty%2Fccache.git Add mode parameter to split_into_* and Tokenizer --- diff --git a/src/Util.cpp b/src/Util.cpp index e4ad44673..81b0d86bb 100644 --- a/src/Util.cpp +++ b/src/Util.cpp @@ -26,7 +26,6 @@ #include "TemporaryFile.hpp" #include "fmtmacros.hpp" -#include #include extern "C" { @@ -139,10 +138,12 @@ path_max(const std::string& path) template std::vector -split_into(string_view input, const char* separators) +split_into(string_view string, + const char* separators, + util::Tokenizer::Mode mode) { std::vector result; - for (const auto token : util::Tokenizer(input, separators)) { + for (const auto token : util::Tokenizer(string, separators, mode)) { result.emplace_back(token); } return result; @@ -1335,15 +1336,19 @@ setenv(const std::string& name, const std::string& value) } std::vector -split_into_views(string_view input, const char* separators) +split_into_views(string_view string, + const char* separators, + util::Tokenizer::Mode mode) { - return split_into(input, separators); + return split_into(string, separators, mode); } std::vector -split_into_strings(string_view input, const char* separators) +split_into_strings(string_view string, + const char* separators, + util::Tokenizer::Mode mode) { - return split_into(input, separators); + return split_into(string, separators, mode); } std::string diff --git a/src/Util.hpp b/src/Util.hpp index 0d2504670..d31657c79 100644 --- a/src/Util.hpp +++ b/src/Util.hpp @@ -22,6 +22,8 @@ #include "CacheFile.hpp" +#include + #include "third_party/nonstd/optional.hpp" #include "third_party/nonstd/string_view.hpp" @@ -418,15 +420,19 @@ size_change_kibibyte(const Stat& old_stat, const Stat& new_stat) / 1024; } -// Split `input` into words at any of the characters listed in `separators`. -// These words are a view into `input`; empty words are omitted. `separators` -// must neither be the empty string nor a nullptr. -std::vector split_into_views(nonstd::string_view input, - const char* separators); - -// Same as `split_into_views` but the words are copied from `input`. -std::vector split_into_strings(nonstd::string_view input, - const char* separators); +// Split `string` into tokens at any of the characters in `separators`. These +// tokens are views into `string`. `separators` must neither be the empty string +// nor a nullptr. +std::vector split_into_views( + nonstd::string_view string, + const char* separators, + util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty); + +// Same as `split_into_views` but the tokens are copied from `string`. +std::vector split_into_strings( + nonstd::string_view string, + const char* separators, + util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty); // Return true if `prefix` is a prefix of `string`. inline bool diff --git a/src/util/Tokenizer.cpp b/src/util/Tokenizer.cpp index c20140473..9b27c8fb2 100644 --- a/src/util/Tokenizer.cpp +++ b/src/util/Tokenizer.cpp @@ -20,24 +20,34 @@ namespace util { -Tokenizer::Iterator -Tokenizer::Iterator::operator++() +void +Tokenizer::Iterator::advance(bool initial) { - if (m_pos >= m_string.size()) { - return *this; - } + constexpr auto npos = nonstd::string_view::npos; + const auto string = m_tokenizer.m_string; + const auto delimiters = m_tokenizer.m_delimiters; + const auto mode = m_tokenizer.m_mode; + + DEBUG_ASSERT(m_left <= m_right); + DEBUG_ASSERT(m_right <= string.length()); - m_pos = m_string.find_first_not_of(m_delimiters, m_pos + m_count); - if (m_pos == nonstd::string_view::npos) { - m_pos = m_string.size(); - m_count = 0; - } else { - m_count = m_string.substr(m_pos).find_first_of(m_delimiters); - if (m_count == nonstd::string_view::npos) { - m_count = m_string.size() - m_pos; + do { + if (initial) { + initial = false; + } else if (m_right == string.length()) { + m_left = npos; + } else { + m_left = m_right + 1; } + if (m_left != npos) { + const auto delim_pos = string.find_first_of(delimiters, m_left); + m_right = delim_pos == npos ? string.length() : delim_pos; + } + } while (mode == Mode::skip_empty && m_left == m_right); + + if (mode == Mode::skip_last_empty && m_left == string.length()) { + m_left = npos; } - return *this; } } // namespace util diff --git a/src/util/Tokenizer.hpp b/src/util/Tokenizer.hpp index 3ee3512f0..90cb0c09c 100644 --- a/src/util/Tokenizer.hpp +++ b/src/util/Tokenizer.hpp @@ -18,85 +18,113 @@ #pragma once +#include + +#include #include #include namespace util { +// An instance of this class can be used in a range-based for loop to split a +// string into tokens at any of the characters in a string of delimiters. class Tokenizer { public: - Tokenizer(nonstd::string_view string, const char* delimiters); + enum class Mode { + include_empty, // Include empty tokens. + skip_empty, // Skip empty tokens. + skip_last_empty, // Include empty tokens except the last one. + }; + + // Split `string` into tokens at any of the characters in `separators` which + // must neither be the empty string nor a nullptr. + Tokenizer(nonstd::string_view string, + const char* delimiters, + Mode mode = Mode::skip_empty); class Iterator { public: - Iterator(nonstd::string_view string, - const char* delimiters, - size_t start_pos); + Iterator(const Tokenizer& tokenizer, size_t start_pos); Iterator operator++(); bool operator!=(const Iterator& other) const; nonstd::string_view operator*() const; private: - const nonstd::string_view m_string; - const char* const m_delimiters; - size_t m_pos; - size_t m_count = 0; + const Tokenizer& m_tokenizer; + size_t m_left; + size_t m_right; + + void advance(bool initial); }; Iterator begin(); Iterator end(); private: + friend Iterator; + const nonstd::string_view m_string; const char* const m_delimiters; + const Mode m_mode; }; inline Tokenizer::Tokenizer(const nonstd::string_view string, - const char* const delimiters) + const char* const delimiters, + const Tokenizer::Mode mode) : m_string(string), - m_delimiters(delimiters) + m_delimiters(delimiters), + m_mode(mode) { - assert(delimiters != nullptr && delimiters[0] != '\0'); + DEBUG_ASSERT(delimiters != nullptr && delimiters[0] != '\0'); } -inline Tokenizer::Iterator::Iterator(const nonstd::string_view string, - const char* const delimiters, +inline Tokenizer::Iterator::Iterator(const Tokenizer& tokenizer, const size_t start_pos) - : m_string(string), - m_delimiters(delimiters), - m_pos(start_pos) + : m_tokenizer(tokenizer), + m_left(start_pos), + m_right(start_pos) +{ + if (start_pos == 0) { + advance(true); + } else { + DEBUG_ASSERT(start_pos == nonstd::string_view::npos); + } +} + +inline Tokenizer::Iterator +Tokenizer::Iterator::operator++() { - ++*this; + advance(false); + return *this; } inline bool Tokenizer::Iterator::operator!=(const Iterator& other) const { - assert(m_string.data() == other.m_string.data()); - assert(m_delimiters == other.m_delimiters); - return m_pos != other.m_pos || m_count != other.m_count; + return &m_tokenizer != &other.m_tokenizer || m_left != other.m_left; } inline nonstd::string_view Tokenizer::Iterator::operator*() const { - assert(m_pos < m_string.size()); - return m_string.substr(m_pos, m_count); + DEBUG_ASSERT(m_left <= m_right); + DEBUG_ASSERT(m_right <= m_tokenizer.m_string.length()); + return m_tokenizer.m_string.substr(m_left, m_right - m_left); } inline Tokenizer::Iterator Tokenizer::begin() { - return Iterator(m_string, m_delimiters, 0); + return Iterator(*this, 0); } inline Tokenizer::Iterator Tokenizer::end() { - return Iterator(m_string, m_delimiters, m_string.size()); + return Iterator(*this, nonstd::string_view::npos); } } // namespace util diff --git a/unittest/test_Util.cpp b/unittest/test_Util.cpp index 07cdc2183..a96e64456 100644 --- a/unittest/test_Util.cpp +++ b/unittest/test_Util.cpp @@ -848,7 +848,7 @@ TEST_CASE("Util::same_program_name") } // Util::split_into_strings and Util::split_into_views are tested implicitly in -// test_Tokenizer.cpp. +// test_util_Tokenizer.cpp. TEST_CASE("Util::starts_with") { diff --git a/unittest/test_util_Tokenizer.cpp b/unittest/test_util_Tokenizer.cpp index 9a7801f0c..76c9e027c 100644 --- a/unittest/test_util_Tokenizer.cpp +++ b/unittest/test_util_Tokenizer.cpp @@ -22,38 +22,117 @@ TEST_CASE("util::Tokenizer") { - CHECK(Util::split_into_views("", "/").empty()); - CHECK(Util::split_into_views("///", "/").empty()); - { - const auto s = Util::split_into_views("a/b", "/"); - REQUIRE(s.size() == 2); - CHECK(s[0] == "a"); - CHECK(s[1] == "b"); - } - { - const auto s = Util::split_into_views("a/b", "x"); - REQUIRE(s.size() == 1); - CHECK(s[0] == "a/b"); - } + using Mode = util::Tokenizer::Mode; + + SUBCASE("include empty tokens") { - const auto s = Util::split_into_views("a/b:c", "/:"); - REQUIRE(s.size() == 3); - CHECK(s[0] == "a"); - CHECK(s[1] == "b"); - CHECK(s[2] == "c"); + { + const auto s = Util::split_into_views("", "/", Mode::include_empty); + REQUIRE(s.size() == 1); + CHECK(s[0] == ""); + } + { + const auto s = Util::split_into_views("/", "/", Mode::include_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == ""); + CHECK(s[1] == ""); + } + { + const auto s = Util::split_into_views("a/", "/", Mode::include_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == "a"); + CHECK(s[1] == ""); + } + { + const auto s = Util::split_into_views("/b", "/", Mode::include_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == ""); + CHECK(s[1] == "b"); + } + { + const auto s = Util::split_into_views("a/b", "/", Mode::include_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == "a"); + CHECK(s[1] == "b"); + } + { + const auto s = Util::split_into_views("/a:", "/:", Mode::include_empty); + REQUIRE(s.size() == 3); + CHECK(s[0] == ""); + CHECK(s[1] == "a"); + CHECK(s[2] == ""); + } } + + SUBCASE("skip empty") { - const auto s = Util::split_into_views(":a//b..:.c/:/.", "/:."); - REQUIRE(s.size() == 3); - CHECK(s[0] == "a"); - CHECK(s[1] == "b"); - CHECK(s[2] == "c"); + CHECK(Util::split_into_views("", "/", Mode::skip_empty).empty()); + CHECK(Util::split_into_views("///", "/", Mode::skip_empty).empty()); + { + const auto s = Util::split_into_views("a/b", "/", Mode::skip_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == "a"); + CHECK(s[1] == "b"); + } + { + const auto s = Util::split_into_views("a/b", "x", Mode::skip_empty); + REQUIRE(s.size() == 1); + CHECK(s[0] == "a/b"); + } + { + const auto s = Util::split_into_views("a/b:c", "/:", Mode::skip_empty); + REQUIRE(s.size() == 3); + CHECK(s[0] == "a"); + CHECK(s[1] == "b"); + CHECK(s[2] == "c"); + } + { + const auto s = + Util::split_into_views(":a//b..:.c/:/.", "/:.", Mode::skip_empty); + REQUIRE(s.size() == 3); + CHECK(s[0] == "a"); + CHECK(s[1] == "b"); + CHECK(s[2] == "c"); + } + { + const auto s = Util::split_into_views( + ".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef", Mode::skip_empty); + REQUIRE(s.size() == 10); + CHECK(s[0] == "0"); + CHECK(s[9] == "9"); + } } + + SUBCASE("skip last empty token") { - const auto s = - Util::split_into_views(".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef"); - REQUIRE(s.size() == 10); - CHECK(s[0] == "0"); - CHECK(s[9] == "9"); + CHECK(Util::split_into_views("", "/", Mode::skip_last_empty).empty()); + { + const auto s = Util::split_into_views("/", "/", Mode::skip_last_empty); + REQUIRE(s.size() == 1); + CHECK(s[0] == ""); + } + { + const auto s = Util::split_into_views("a/", "/", Mode::skip_last_empty); + REQUIRE(s.size() == 1); + CHECK(s[0] == "a"); + } + { + const auto s = Util::split_into_views("/b", "/", Mode::skip_last_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == ""); + CHECK(s[1] == "b"); + } + { + const auto s = Util::split_into_views("a/b", "/", Mode::skip_last_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == "a"); + CHECK(s[1] == "b"); + } + { + const auto s = Util::split_into_views("/a:", "/:", Mode::skip_last_empty); + REQUIRE(s.size() == 2); + CHECK(s[0] == ""); + CHECK(s[1] == "a"); + } } }