#include "TemporaryFile.hpp"
#include "fmtmacros.hpp"
-#include <util/Tokenizer.hpp>
#include <util/path_utils.hpp>
extern "C" {
// Tokenize `string` with util::Tokenizer (using the given separators and mode)
// and return a vector holding one `T` constructed from each token.
template<typename T>
std::vector<T>
-split_into(string_view input, const char* separators)
+split_into(string_view string,
+ const char* separators,
+ util::Tokenizer::Mode mode)
{
std::vector<T> result;
- for (const auto token : util::Tokenizer(input, separators)) {
+ for (const auto token : util::Tokenizer(string, separators, mode)) {
result.emplace_back(token);
}
return result;
}
// Forwards to split_into<string_view>, so each returned element is built from
// a token without copying the characters.
std::vector<string_view>
-split_into_views(string_view input, const char* separators)
+split_into_views(string_view string,
+ const char* separators,
+ util::Tokenizer::Mode mode)
{
- return split_into<string_view>(input, separators);
+ return split_into<string_view>(string, separators, mode);
}
// Forwards to split_into<std::string>, so each returned element is a
// std::string constructed from a token.
std::vector<std::string>
-split_into_strings(string_view input, const char* separators)
+split_into_strings(string_view string,
+ const char* separators,
+ util::Tokenizer::Mode mode)
{
- return split_into<std::string>(input, separators);
+ return split_into<std::string>(string, separators, mode);
}
std::string
#include "CacheFile.hpp"
+#include <util/Tokenizer.hpp>
+
#include "third_party/nonstd/optional.hpp"
#include "third_party/nonstd/string_view.hpp"
/ 1024;
}
-// Split `input` into words at any of the characters listed in `separators`.
-// These words are a view into `input`; empty words are omitted. `separators`
-// must neither be the empty string nor a nullptr.
-std::vector<nonstd::string_view> split_into_views(nonstd::string_view input,
- const char* separators);
-
-// Same as `split_into_views` but the words are copied from `input`.
-std::vector<std::string> split_into_strings(nonstd::string_view input,
- const char* separators);
+// Split `string` into tokens at any of the characters in `separators` (which
+// must be neither empty nor nullptr). The tokens are views into `string`;
+// `mode` selects how empty tokens are treated (skipped by default).
+std::vector<nonstd::string_view> split_into_views(
+ nonstd::string_view string,
+ const char* separators,
+ util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
+
+// Same as `split_into_views` but the tokens are copied from `string`.
+std::vector<std::string> split_into_strings(
+ nonstd::string_view string,
+ const char* separators,
+ util::Tokenizer::Mode mode = util::Tokenizer::Mode::skip_empty);
// Return true if `prefix` is a prefix of `string`.
inline bool
namespace util {
-Tokenizer::Iterator
-Tokenizer::Iterator::operator++()
+void
+Tokenizer::Iterator::advance(bool initial)
{
// Move [m_left, m_right) to the next token, or set m_left to npos when there
// are no more tokens. `initial` is true for the call made by the constructor:
// the first pass then searches from the current m_left instead of stepping past
// a delimiter.
- if (m_pos >= m_string.size()) {
- return *this;
- }
+ constexpr auto npos = nonstd::string_view::npos;
+ const auto string = m_tokenizer.m_string;
+ const auto delimiters = m_tokenizer.m_delimiters;
+ const auto mode = m_tokenizer.m_mode;
+
+ DEBUG_ASSERT(m_left <= m_right);
+ DEBUG_ASSERT(m_right <= string.length());
- m_pos = m_string.find_first_not_of(m_delimiters, m_pos + m_count);
- if (m_pos == nonstd::string_view::npos) {
- m_pos = m_string.size();
- m_count = 0;
- } else {
- m_count = m_string.substr(m_pos).find_first_of(m_delimiters);
- if (m_count == nonstd::string_view::npos) {
- m_count = m_string.size() - m_pos;
+ do {
+ if (initial) {
+ initial = false;
+ } else if (m_right == string.length()) {
+ m_left = npos; // Previous token reached the end of the string: done.
+ } else {
+ m_left = m_right + 1; // Step past the delimiter found at m_right.
}
+ if (m_left != npos) { // Token runs to the next delimiter or end of string.
+ const auto delim_pos = string.find_first_of(delimiters, m_left);
+ m_right = delim_pos == npos ? string.length() : delim_pos;
+ }
+ } while (mode == Mode::skip_empty && m_left == m_right); // Retry if empty.
+
// In skip_last_empty mode a token that starts at the very end of the string
// (necessarily empty) is dropped by turning the iterator into the end state.
+ if (mode == Mode::skip_last_empty && m_left == string.length()) {
+ m_left = npos;
}
- return *this;
}
} // namespace util
#pragma once
+#include <assertions.hpp>
+
+#include <third_party/fmt/core.h>
#include <third_party/nonstd/optional.hpp>
#include <third_party/nonstd/string_view.hpp>
namespace util {
+// An instance of this class can be used in a range-based for loop to split a
+// string into tokens at any of the characters in a string of delimiters.
class Tokenizer
{
public:
- Tokenizer(nonstd::string_view string, const char* delimiters);
+ enum class Mode {
+ include_empty, // Include empty tokens.
+ skip_empty, // Skip empty tokens.
+ skip_last_empty, // Include empty tokens except the last one.
+ };
+
+ // Split `string` into tokens at any of the characters in `delimiters` which
+ // must neither be the empty string nor a nullptr.
+ Tokenizer(nonstd::string_view string,
+ const char* delimiters,
+ Mode mode = Mode::skip_empty);
class Iterator
{
public:
- Iterator(nonstd::string_view string,
- const char* delimiters,
- size_t start_pos);
+ Iterator(const Tokenizer& tokenizer, size_t start_pos); // 0 or npos.
Iterator operator++(); // NOTE(review): returns a copy, not Iterator&.
bool operator!=(const Iterator& other) const;
nonstd::string_view operator*() const;
private:
- const nonstd::string_view m_string;
- const char* const m_delimiters;
- size_t m_pos;
- size_t m_count = 0;
+ const Tokenizer& m_tokenizer; // Referenced, so it must outlive the iterator.
+ size_t m_left; // Start of the current token; npos once iteration has ended.
+ size_t m_right; // Position just past the end of the current token.
+
+ void advance(bool initial);
};
Iterator begin();
Iterator end();
private:
+ friend Iterator; // Iterator reads m_string, m_delimiters and m_mode.
+
const nonstd::string_view m_string;
const char* const m_delimiters;
+ const Mode m_mode;
};
// Stores the string view, the delimiter pointer and the mode. Nothing is
// tokenized here; that happens when an Iterator is created and advanced.
inline Tokenizer::Tokenizer(const nonstd::string_view string,
- const char* const delimiters)
+ const char* const delimiters,
+ const Tokenizer::Mode mode)
: m_string(string),
- m_delimiters(delimiters)
+ m_delimiters(delimiters),
+ m_mode(mode)
{
- assert(delimiters != nullptr && delimiters[0] != '\0');
+ DEBUG_ASSERT(delimiters != nullptr && delimiters[0] != '\0'); // Precondition.
}
// Iterator construction and prefix increment. `start_pos` is expected to be
// either 0 (begin iterator) or npos (end sentinel).
-inline Tokenizer::Iterator::Iterator(const nonstd::string_view string,
- const char* const delimiters,
+inline Tokenizer::Iterator::Iterator(const Tokenizer& tokenizer,
const size_t start_pos)
- : m_string(string),
- m_delimiters(delimiters),
- m_pos(start_pos)
+ : m_tokenizer(tokenizer),
+ m_left(start_pos),
+ m_right(start_pos)
+{
+ if (start_pos == 0) {
+ advance(true); // Begin iterator: locate the first token right away.
+ } else {
+ DEBUG_ASSERT(start_pos == nonstd::string_view::npos); // End sentinel.
+ }
+}
+
+inline Tokenizer::Iterator
+Tokenizer::Iterator::operator++()
{
- ++*this;
+ advance(false);
+ return *this; // Returns a copy of the advanced iterator.
}
// Two iterators compare equal when they refer to the same Tokenizer object and
// have the same m_left; m_right does not take part in the comparison.
inline bool
Tokenizer::Iterator::operator!=(const Iterator& other) const
{
- assert(m_string.data() == other.m_string.data());
- assert(m_delimiters == other.m_delimiters);
- return m_pos != other.m_pos || m_count != other.m_count;
+ return &m_tokenizer != &other.m_tokenizer || m_left != other.m_left;
}
// Return the current token: the half-open range [m_left, m_right) of the
// tokenizer's string, as a view.
inline nonstd::string_view
Tokenizer::Iterator::operator*() const
{
- assert(m_pos < m_string.size());
- return m_string.substr(m_pos, m_count);
+ DEBUG_ASSERT(m_left <= m_right);
+ DEBUG_ASSERT(m_right <= m_tokenizer.m_string.length());
+ return m_tokenizer.m_string.substr(m_left, m_right - m_left);
}
// Iterator positioned on the first token (start position 0 makes the Iterator
// constructor search for it).
inline Tokenizer::Iterator
Tokenizer::begin()
{
- return Iterator(m_string, m_delimiters, 0);
+ return Iterator(*this, 0);
}
// End sentinel: an iterator whose position is npos.
inline Tokenizer::Iterator
Tokenizer::end()
{
- return Iterator(m_string, m_delimiters, m_string.size());
+ return Iterator(*this, nonstd::string_view::npos);
}
} // namespace util
}
// Util::split_into_strings and Util::split_into_views are tested implicitly in
-// test_Tokenizer.cpp.
+// test_util_Tokenizer.cpp.
TEST_CASE("Util::starts_with")
{
TEST_CASE("util::Tokenizer")
{
// The tokenizer is exercised through Util::split_into_views, once per Mode.
- CHECK(Util::split_into_views("", "/").empty());
- CHECK(Util::split_into_views("///", "/").empty());
- {
- const auto s = Util::split_into_views("a/b", "/");
- REQUIRE(s.size() == 2);
- CHECK(s[0] == "a");
- CHECK(s[1] == "b");
- }
- {
- const auto s = Util::split_into_views("a/b", "x");
- REQUIRE(s.size() == 1);
- CHECK(s[0] == "a/b");
- }
+ using Mode = util::Tokenizer::Mode;
+
+ SUBCASE("include empty tokens")
{
// Every separator splits, so N separators are expected to yield N + 1 tokens.
- const auto s = Util::split_into_views("a/b:c", "/:");
- REQUIRE(s.size() == 3);
- CHECK(s[0] == "a");
- CHECK(s[1] == "b");
- CHECK(s[2] == "c");
+ {
+ const auto s = Util::split_into_views("", "/", Mode::include_empty);
+ REQUIRE(s.size() == 1);
+ CHECK(s[0] == "");
+ }
+ {
+ const auto s = Util::split_into_views("/", "/", Mode::include_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "");
+ CHECK(s[1] == "");
+ }
+ {
+ const auto s = Util::split_into_views("a/", "/", Mode::include_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "");
+ }
+ {
+ const auto s = Util::split_into_views("/b", "/", Mode::include_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "");
+ CHECK(s[1] == "b");
+ }
+ {
+ const auto s = Util::split_into_views("a/b", "/", Mode::include_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "b");
+ }
+ {
+ const auto s = Util::split_into_views("/a:", "/:", Mode::include_empty);
+ REQUIRE(s.size() == 3);
+ CHECK(s[0] == "");
+ CHECK(s[1] == "a");
+ CHECK(s[2] == "");
+ }
}
+
+ SUBCASE("skip empty")
{
// Same inputs and expectations as the removed mode-less checks, with the mode
// now passed explicitly.
- const auto s = Util::split_into_views(":a//b..:.c/:/.", "/:.");
- REQUIRE(s.size() == 3);
- CHECK(s[0] == "a");
- CHECK(s[1] == "b");
- CHECK(s[2] == "c");
+ CHECK(Util::split_into_views("", "/", Mode::skip_empty).empty());
+ CHECK(Util::split_into_views("///", "/", Mode::skip_empty).empty());
+ {
+ const auto s = Util::split_into_views("a/b", "/", Mode::skip_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "b");
+ }
+ {
+ const auto s = Util::split_into_views("a/b", "x", Mode::skip_empty);
+ REQUIRE(s.size() == 1);
+ CHECK(s[0] == "a/b");
+ }
+ {
+ const auto s = Util::split_into_views("a/b:c", "/:", Mode::skip_empty);
+ REQUIRE(s.size() == 3);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "b");
+ CHECK(s[2] == "c");
+ }
+ {
+ const auto s =
+ Util::split_into_views(":a//b..:.c/:/.", "/:.", Mode::skip_empty);
+ REQUIRE(s.size() == 3);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "b");
+ CHECK(s[2] == "c");
+ }
+ {
+ const auto s = Util::split_into_views(
+ ".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef", Mode::skip_empty);
+ REQUIRE(s.size() == 10);
+ CHECK(s[0] == "0");
+ CHECK(s[9] == "9");
+ }
}
+
+ SUBCASE("skip last empty token")
{
// Only a trailing empty token is dropped; leading empty tokens are kept.
- const auto s =
- Util::split_into_views(".0.1.2.3.4.5.6.7.8.9.", "/:.+_abcdef");
- REQUIRE(s.size() == 10);
- CHECK(s[0] == "0");
- CHECK(s[9] == "9");
+ CHECK(Util::split_into_views("", "/", Mode::skip_last_empty).empty());
+ {
+ const auto s = Util::split_into_views("/", "/", Mode::skip_last_empty);
+ REQUIRE(s.size() == 1);
+ CHECK(s[0] == "");
+ }
+ {
+ const auto s = Util::split_into_views("a/", "/", Mode::skip_last_empty);
+ REQUIRE(s.size() == 1);
+ CHECK(s[0] == "a");
+ }
+ {
+ const auto s = Util::split_into_views("/b", "/", Mode::skip_last_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "");
+ CHECK(s[1] == "b");
+ }
+ {
+ const auto s = Util::split_into_views("a/b", "/", Mode::skip_last_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "a");
+ CHECK(s[1] == "b");
+ }
+ {
+ const auto s = Util::split_into_views("/a:", "/:", Mode::skip_last_empty);
+ REQUIRE(s.size() == 2);
+ CHECK(s[0] == "");
+ CHECK(s[1] == "a");
+ }
}
}