From a5c921682417885b81c496be1b1fb85f9a9b0789 Mon Sep 17 00:00:00 2001 From: ado Date: Mon, 25 Jan 2021 00:16:55 +0100 Subject: [PATCH] refator splitter, add resplit functionality, write some unit tests --- include/ss/converter.hpp | 17 ++- include/ss/splitter.hpp | 280 ++++++++++++++++++++++++++++----------- test/test_splitter.cpp | 96 +++++++++++++- 3 files changed, 306 insertions(+), 87 deletions(-) diff --git a/include/ss/converter.hpp b/include/ss/converter.hpp index f01a6d3..950bfeb 100644 --- a/include/ss/converter.hpp +++ b/include/ss/converter.hpp @@ -103,9 +103,6 @@ struct tied_class { template constexpr bool tied_class_v = tied_class::value; -// the error can be set inside a string, or a bool -enum class error_mode { error_string, error_bool }; - //////////////// // converter //////////////// @@ -128,6 +125,12 @@ public: no_void_validator_tup_t convert( char* line, const std::string& delim = default_delimiter) { input_ = split(line, delim); + /* TODO + if (!splitter_.valid()) { + // set error + return {}; + } + */ return convert(input_); } @@ -178,9 +181,13 @@ public: : bool_error_ == false; } - const std::string& error_msg() const { return string_error_; } + const std::string& error_msg() const { + return string_error_; + } - void set_error_mode(error_mode mode) { error_mode_ = mode; } + void set_error_mode(error_mode mode) { + error_mode_ = mode; + } // 'splits' string by given delimiter, returns vector of pairs which // contain the beginnings and the ends of each column of the string diff --git a/include/ss/splitter.hpp b/include/ss/splitter.hpp index 9308dd9..9ad3b1b 100644 --- a/include/ss/splitter.hpp +++ b/include/ss/splitter.hpp @@ -5,6 +5,9 @@ #include #include +// TODO remove +#include + namespace ss { template struct matcher { @@ -31,8 +34,8 @@ public: static bool match(char c) = delete; }; -template -struct quote : matcher {}; +template +struct quote : matcher {}; template struct trim : matcher {}; @@ -79,24 +82,131 @@ struct setup { template struct setup> : setup {}; -enum class state { begin, reading, quoting, finished }; -using range = std::pair; - using string_range = std::pair; using split_input = std::vector; +// the error can be set inside a string, or a bool +enum class error_mode { error_string, error_bool }; + template class splitter { +private: + enum class state { begin, reading, quoting, finished }; + constexpr static auto default_delimiter = ","; + using quote = typename setup::quote; using trim = typename setup::trim; using escape = typename setup::escape; - bool match(const char* end_i, char delim) { - return *end_i == delim; + constexpr static auto is_const_line = !quote::enabled && !escape::enabled; + using line_ptr_type = + typename ternary::type; + +public: + bool valid() const { + return (error_mode_ == error_mode::error_string) ? string_error_.empty() + : bool_error_ == false; + } + + bool unterminated_quote() { + return unterminated_quote_; + } + + const std::string& error_msg() const { + return string_error_; + } + + void set_error_mode(error_mode mode) { + error_mode_ = mode; + } + + split_input& split(line_ptr_type new_line, + const std::string& delimiter = default_delimiter) { + output_.clear(); + return resplit(new_line, -1, delimiter); + } + + void adjust_ranges(const char* old_line) { + for (auto& [begin, end] : output_) { + begin = begin - old_line + line_; + end = end - old_line + line_; + } + } + + split_input& resplit(line_ptr_type new_line, ssize_t new_size, + const std::string& delimiter = default_delimiter) { + clear_error(); + line_ = new_line; + + // resplitting, continue from last slice + if (!output_.empty()) { + const auto& last = std::prev(output_.end()); + const auto [old_line, old_begin] = *last; + size_t begin = old_begin - old_line - 1; + output_.pop_back(); + adjust_ranges(old_line); + + // safety measure + if (new_size != -1 && static_cast(new_size) < begin) { + set_error_invalid_resplit(); + return output_; + } + + line_ += begin; + } + + return split_impl_select_delim(delimiter); + } + +private: + //////////////// + // error + //////////////// + + void clear_error() { + string_error_.clear(); + bool_error_ = false; + unterminated_quote_ = false; + } + + void set_error_empty_delimiter() { + if (error_mode_ == error_mode::error_string) { + string_error_.clear(); + string_error_.append("empty delimiter"); + } else { + bool_error_ = true; + } + } + + void set_error_unterminated_quote() { + unterminated_quote_ = true; + if (error_mode_ == error_mode::error_string) { + string_error_.clear(); + string_error_.append("unterminated quote"); + } else { + bool_error_ = true; + } + } + + void set_error_invalid_resplit() { + if (error_mode_ == error_mode::error_string) { + string_error_.clear(); + string_error_.append("invalid_resplit"); + } else { + bool_error_ = true; + } + } + + //////////////// + // matching + //////////////// + + bool match(const char* const curr, char delim) { + return *curr == delim; }; - bool match(const char* end_i, const std::string& delim) { - return strncmp(end_i, delim.c_str(), delim.size()) == 0; + bool match(const char* const curr, const std::string& delim) { + return strncmp(curr, delim.c_str(), delim.size()) == 0; }; size_t delimiter_size(char) { @@ -107,7 +217,7 @@ class splitter { return delim.size(); } - void trim_if_enabled(char*& curr) { + void trim_if_enabled(line_ptr_type& curr) { if constexpr (trim::enabled) { while (trim::match(*curr)) { ++curr; @@ -115,89 +225,88 @@ class splitter { } } - void shift_if_escaped(char*& curr_i) { + void shift_if_escaped(line_ptr_type& curr) { if constexpr (escape::enabled) { - if (escape::match(*curr_i)) { - *curr = end[1]; - ++end; + if (escape::match(*curr)) { + *curr_ = end_[1]; + ++end_; } } } - void shift() { - if constexpr (escape::enabled || quote::enabled) { - *curr = *end; - } - ++end; - ++curr; - } - - void shift(size_t n) { - if constexpr (escape::enabled || quote::enabled) { - memcpy(curr, end, n); - } - end += n; - curr += n; - } - template - std::tuple match_delimiter(char* begin, const Delim& delim) { - char* end_i = begin; + std::tuple match_delimiter(line_ptr_type begin, + const Delim& delim) { + line_ptr_type end = begin; - trim_if_enabled(end_i); + trim_if_enabled(end); // just spacing - if (*end_i == '\0') { + if (*end == '\0') { return {0, false}; } // not a delimiter - if (!match(end_i, delim)) { - shift_if_escaped(end_i); - return {1 + end_i - begin, false}; + if (!match(end, delim)) { + shift_if_escaped(end); + return {1 + end - begin, false}; } - end_i += delimiter_size(delim); - trim_if_enabled(end_i); + end += delimiter_size(delim); + trim_if_enabled(end); // delimiter - return {end_i - begin, true}; + return {end - begin, true}; + } + + //////////////// + // matching + //////////////// + + void shift() { + if constexpr (!is_const_line) { + *curr_ = *end_; + } + ++end_; + ++curr_; + } + + void shift(size_t n) { + if constexpr (!is_const_line) { + memcpy(curr_, end_, n); + } + end_ += n; + curr_ += n; } void push_and_start_next(size_t n) { - output_.emplace_back(begin, curr); - begin = end + n; + output_.emplace_back(begin_, curr_); + begin_ = end_ + n; state_ = state::begin; } -public: - bool valid() { - return error_.empty(); - } - - split_input& split(char* new_line, const std::string& d = ",") { - line = new_line; - output_.clear(); - switch (d.size()) { + split_input& split_impl_select_delim( + const std::string& delimiter = default_delimiter) { + switch (delimiter.size()) { case 0: - // set error + set_error_empty_delimiter(); return output_; case 1: - return split_impl(d[0]); + return split_impl(delimiter[0]); default: - return split_impl(d); + return split_impl(delimiter); } } template - std::vector& split_impl(const Delim& delim) { + split_input& split_impl(const Delim& delim) { state_ = state::begin; - begin = line; + begin_ = line_; - trim_if_enabled(begin); + trim_if_enabled(begin_); while (state_ != state::finished) { - curr = end = begin; + curr_ = end_ = begin_; switch (state_) { case (state::begin): state_begin(); @@ -216,10 +325,14 @@ public: return output_; } + //////////////// + // states + //////////////// + void state_begin() { if constexpr (quote::enabled) { - if (quote::match(*begin)) { - ++begin; + if (quote::match(*begin_)) { + ++begin_; state_ = state::quoting; return; } @@ -230,13 +343,13 @@ public: template void state_reading(const Delim& delim) { while (true) { - auto [width, valid] = match_delimiter(end, delim); + auto [width, valid] = match_delimiter(end_, delim); // not a delimiter if (!valid) { if (width == 0) { // eol - output_.emplace_back(begin, curr); + output_.emplace_back(begin_, curr_); state_ = state::finished; break; } else { @@ -255,16 +368,16 @@ public: void state_quoting(const Delim& delim) { if constexpr (quote::enabled) { while (true) { - if (quote::match(*end)) { + if (quote::match(*end_)) { // double quote // eg: ...,"hel""lo,... -> hel"lo - if (quote::match(end[1])) { - ++end; + if (quote::match(end_[1])) { + ++end_; shift(); continue; } - auto [width, valid] = match_delimiter(end + 1, delim); + auto [width, valid] = match_delimiter(end_ + 1, delim); // not a delimiter if (!valid) { @@ -272,10 +385,11 @@ public: // eol // eg: ...,"hello" \0 -> hello // eg no trim: ...,"hello"\0 -> hello - output_.emplace_back(begin, curr); + output_.emplace_back(begin_, curr_); } else { // missmatched quote // eg: ...,"hel"lo,... -> error + // or not } state_ = state::finished; break; @@ -287,8 +401,8 @@ public: } if constexpr (escape::enabled) { - if (escape::match(*end)) { - ++end; + if (escape::match(*end_)) { + ++end_; shift(); continue; } @@ -296,27 +410,31 @@ public: // unterminated error // eg: ..."hell\0 -> quote not terminated - if (*end == '\0') { - *curr = '\0'; + if (*end_ == '\0') { + set_error_unterminated_quote(); + output_.emplace_back(line_, begin_); state_ = state::finished; break; } shift(); } - } else { - // set error impossible scenario - state_ = state::finished; } } -private: - std::vector output_; - std::string error_ = ""; + //////////////// + // members + //////////////// + + std::vector output_; + std::string string_error_; + bool bool_error_; + bool unterminated_quote_; + enum error_mode error_mode_ { error_mode::error_bool }; + line_ptr_type begin_; + line_ptr_type curr_; + line_ptr_type end_; + line_ptr_type line_; state state_; - char* curr; - char* end; - char* begin; - char* line; }; } /* ss */ diff --git a/test/test_splitter.cpp b/test/test_splitter.cpp index cff9852..3411714 100644 --- a/test/test_splitter.cpp +++ b/test/test_splitter.cpp @@ -416,7 +416,7 @@ TEST_CASE("testing splitter escape and trim") { } TEST_CASE("testing splitter quote and escape and trim") { - auto guard = set_combinations_size(4); + auto guard = set_combinations_size(3); case_type case1 = spaced({R"("\"")", R"(\")", R"("""")"}, " "); case_type case2 = spaced({R"("x\"x")", R"(x\"x)", R"(x"x)", R"("x""x")"}, " "); @@ -467,3 +467,97 @@ TEST_CASE("testing splitter quote and escape and trim") { ss::trim<' ', '\t'>>(p, {","}); } } + +TEST_CASE("testing splitter constnes if quoting and escaping are disabled") { + // to compile is enough + return; + const char* const line{}; + ss::splitter s1; + ss::splitter> s2; + s1.split(line); + s2.split(line); +} + +TEST_CASE("testing error mode") { + + { + // empty delimiter + ss::splitter s; + s.split(buff("just,some,strings"), ""); + CHECK(!s.valid()); + CHECK(!s.unterminated_quote()); + CHECK(s.error_msg().empty()); + + s.set_error_mode(ss::error_mode::error_string); + s.split(buff("just,some,strings"), ""); + CHECK(!s.valid()); + CHECK(!s.unterminated_quote()); + CHECK(!s.error_msg().empty()); + } + + { + // unterminated quote + ss::splitter> s; + s.split(buff("\"just")); + CHECK(!s.valid()); + CHECK(s.unterminated_quote()); + CHECK(s.error_msg().empty()); + + s.set_error_mode(ss::error_mode::error_string); + s.split(buff("\"just")); + CHECK(!s.valid()); + CHECK(s.unterminated_quote()); + CHECK(!s.error_msg().empty()); + } +} + +template +auto expect_unterminated_quote(Splitter& s, const std::string& line) { + auto vec = s.split(buff(line.c_str())); + CHECK(!s.valid()); + CHECK(s.unterminated_quote()); + return vec; +} + +TEST_CASE("testing unterminated quote") { + { + ss::splitter> s; + auto vec = expect_unterminated_quote(s, "\"just"); + CHECK(vec.size() == 1); + + char new_line[] = R"("just",strings)"; + vec = s.resplit(new_line, strlen(new_line)); + CHECK(s.valid()); + CHECK(!s.unterminated_quote()); + std::vector expected{"just", "strings"}; + CHECK(words(vec) == expected); + } + + { + ss::splitter> s; + auto vec = expect_unterminated_quote(s, "just,some,\"random"); + std::vector expected{"just", "some", "just,some,\""}; + CHECK(words(vec) == expected); + + char new_line[] = R"(just,some,"random",strings)"; + vec = s.resplit(new_line, strlen(new_line)); + CHECK(s.valid()); + CHECK(!s.unterminated_quote()); + expected = {"just", "some", "random", "strings"}; + CHECK(words(vec) == expected); + } + + { + ss::splitter> s; + auto vec = expect_unterminated_quote(s, R"("just","some","ran)"); + std::vector expected{"just", "some", R"("just","some",")"}; + CHECK(words(vec) == expected); + + char new_line[] = R"("just","some","ran,dom","strings")"; + vec = s.resplit(new_line, strlen(new_line)); + CHECK(s.valid()); + CHECK(!s.unterminated_quote()); + expected = {"just", "some", "ran,dom", "strings"}; + CHECK(words(vec) == expected); + } +}