refactor splitter, add resplit functionality, write some unit tests

This commit is contained in:
ado 2021-01-25 00:16:55 +01:00
parent 9302a25256
commit a5c9216824
3 changed files with 306 additions and 87 deletions

View File

@ -103,9 +103,6 @@ struct tied_class {
template <typename... Ts> template <typename... Ts>
constexpr bool tied_class_v = tied_class<Ts...>::value; constexpr bool tied_class_v = tied_class<Ts...>::value;
// the error can be set inside a string, or a bool
enum class error_mode { error_string, error_bool };
//////////////// ////////////////
// converter // converter
//////////////// ////////////////
@ -128,6 +125,12 @@ public:
no_void_validator_tup_t<Ts...> convert( no_void_validator_tup_t<Ts...> convert(
char* line, const std::string& delim = default_delimiter) { char* line, const std::string& delim = default_delimiter) {
input_ = split(line, delim); input_ = split(line, delim);
/* TODO
if (!splitter_.valid()) {
// set error
return {};
}
*/
return convert<Ts...>(input_); return convert<Ts...>(input_);
} }
@ -178,9 +181,13 @@ public:
: bool_error_ == false; : bool_error_ == false;
} }
const std::string& error_msg() const { return string_error_; } const std::string& error_msg() const {
return string_error_;
}
void set_error_mode(error_mode mode) { error_mode_ = mode; } void set_error_mode(error_mode mode) {
error_mode_ = mode;
}
// 'splits' string by given delimiter, returns vector of pairs which // 'splits' string by given delimiter, returns vector of pairs which
// contain the beginnings and the ends of each column of the string // contain the beginnings and the ends of each column of the string

View File

@ -5,6 +5,9 @@
#include <string> #include <string>
#include <vector> #include <vector>
// TODO remove
#include <iostream>
namespace ss { namespace ss {
template <char... Cs> template <char... Cs>
struct matcher { struct matcher {
@ -31,8 +34,8 @@ public:
static bool match(char c) = delete; static bool match(char c) = delete;
}; };
template <char... Cs> template <char C>
struct quote : matcher<Cs...> {}; struct quote : matcher<C> {};
template <char... Cs> template <char... Cs>
struct trim : matcher<Cs...> {}; struct trim : matcher<Cs...> {};
@ -79,24 +82,131 @@ struct setup {
template <typename... Ts> template <typename... Ts>
struct setup<setup<Ts...>> : setup<Ts...> {}; struct setup<setup<Ts...>> : setup<Ts...> {};
enum class state { begin, reading, quoting, finished };
using range = std::pair<const char*, const char*>;
using string_range = std::pair<const char*, const char*>; using string_range = std::pair<const char*, const char*>;
using split_input = std::vector<string_range>; using split_input = std::vector<string_range>;
// the error can be set inside a string, or a bool
enum class error_mode { error_string, error_bool };
template <typename... Ts> template <typename... Ts>
class splitter { class splitter {
private:
enum class state { begin, reading, quoting, finished };
constexpr static auto default_delimiter = ",";
using quote = typename setup<Ts...>::quote; using quote = typename setup<Ts...>::quote;
using trim = typename setup<Ts...>::trim; using trim = typename setup<Ts...>::trim;
using escape = typename setup<Ts...>::escape; using escape = typename setup<Ts...>::escape;
bool match(const char* end_i, char delim) { constexpr static auto is_const_line = !quote::enabled && !escape::enabled;
return *end_i == delim; using line_ptr_type =
typename ternary<is_const_line, const char*, char*>::type;
public:
// Reports whether the most recent operation finished without an error.
// Which storage is consulted depends on the active error mode: the error
// message in string mode, the boolean flag otherwise.
bool valid() const {
    if (error_mode_ == error_mode::error_string) {
        return string_error_.empty();
    }
    return !bool_error_;
}
bool unterminated_quote() {
return unterminated_quote_;
}
// Returns the stored error message. It is only written by the set_error_*
// helpers when the error mode is error_string, and is emptied by clear_error().
const std::string& error_msg() const {
return string_error_;
}
// Selects how subsequent errors are recorded: as a message (error_string)
// or as a plain flag (error_bool). Already recorded errors are not converted.
void set_error_mode(error_mode mode) {
error_mode_ = mode;
}
// Splits `new_line` from scratch: drops any ranges left over from a previous
// call, then delegates to resplit() with a size of -1, the value resplit()
// exempts from its bounds check.
// Returns a reference to the splitter's internal vector of column ranges.
split_input& split(line_ptr_type new_line,
const std::string& delimiter = default_delimiter) {
output_.clear();
return resplit(new_line, -1, delimiter);
}
// Rebases every stored column range from the buffer that started at
// `old_line` onto the current line_, keeping each offset unchanged.
void adjust_ranges(const char* old_line) {
    for (auto& slice : output_) {
        slice.first = line_ + (slice.first - old_line);
        slice.second = line_ + (slice.second - old_line);
    }
}
// Continues a split on `new_line` after a previous attempt left an unfinished
// last slice. Columns already produced are kept and re-pointed at the new
// buffer; splitting restarts one character before the recorded start of the
// unfinished slice.
//   new_line  - buffer to split; becomes the current line
//   new_size  - length of new_line, or -1 to skip the bounds check
//   delimiter - column delimiter
// Returns the internal range vector. If new_size is smaller than the restart
// offset, the invalid-resplit error is set and the kept columns are returned.
split_input& resplit(line_ptr_type new_line, ssize_t new_size,
                     const std::string& delimiter = default_delimiter) {
    clear_error();
    line_ = new_line;

    // nothing to continue from: behave like a fresh split
    if (output_.empty()) {
        return split_impl_select_delim(delimiter);
    }

    // the last slice holds {start of old line, start of unfinished column}
    const string_range unfinished = output_.back();
    const char* const old_line = unfinished.first;
    const char* const old_begin = unfinished.second;
    const size_t begin = old_begin - old_line - 1;

    output_.pop_back();
    adjust_ranges(old_line);

    // safety measure
    const bool size_known = new_size != -1;
    if (size_known && static_cast<size_t>(new_size) < begin) {
        set_error_invalid_resplit();
        return output_;
    }

    line_ += begin;
    return split_impl_select_delim(delimiter);
}
private:
////////////////
// error
////////////////
// Resets every piece of error state (flag, message, unterminated-quote
// marker) so the next split starts clean.
void clear_error() {
    unterminated_quote_ = false;
    bool_error_ = false;
    string_error_.clear();
}
// Records the "empty delimiter" error in whichever form the current error
// mode asks for: a message in string mode, the boolean flag otherwise.
void set_error_empty_delimiter() {
    if (error_mode_ != error_mode::error_string) {
        bool_error_ = true;
        return;
    }
    string_error_.clear();
    string_error_.append("empty delimiter");
}
// Records that a quoted column was never closed. The dedicated marker is
// always raised; the generic error is stored as a message in string mode
// and as the boolean flag otherwise.
void set_error_unterminated_quote() {
    unterminated_quote_ = true;
    if (error_mode_ != error_mode::error_string) {
        bool_error_ = true;
        return;
    }
    string_error_.clear();
    string_error_.append("unterminated quote");
}
// Records that resplit() was given a line shorter than the position it has
// to continue from. Stored as a message in string mode and as the boolean
// flag otherwise. The message is worded with a space, matching the other
// human-readable messages ("empty delimiter", "unterminated quote").
void set_error_invalid_resplit() {
    if (error_mode_ == error_mode::error_string) {
        string_error_.clear();
        string_error_.append("invalid resplit");
    } else {
        bool_error_ = true;
    }
}
////////////////
// matching
////////////////
bool match(const char* const curr, char delim) {
return *curr == delim;
}; };
bool match(const char* end_i, const std::string& delim) { bool match(const char* const curr, const std::string& delim) {
return strncmp(end_i, delim.c_str(), delim.size()) == 0; return strncmp(curr, delim.c_str(), delim.size()) == 0;
}; };
size_t delimiter_size(char) { size_t delimiter_size(char) {
@ -107,7 +217,7 @@ class splitter {
return delim.size(); return delim.size();
} }
void trim_if_enabled(char*& curr) { void trim_if_enabled(line_ptr_type& curr) {
if constexpr (trim::enabled) { if constexpr (trim::enabled) {
while (trim::match(*curr)) { while (trim::match(*curr)) {
++curr; ++curr;
@ -115,89 +225,88 @@ class splitter {
} }
} }
void shift_if_escaped(char*& curr_i) { void shift_if_escaped(line_ptr_type& curr) {
if constexpr (escape::enabled) { if constexpr (escape::enabled) {
if (escape::match(*curr_i)) { if (escape::match(*curr)) {
*curr = end[1]; *curr_ = end_[1];
++end; ++end_;
} }
} }
} }
void shift() {
if constexpr (escape::enabled || quote::enabled) {
*curr = *end;
}
++end;
++curr;
}
void shift(size_t n) {
if constexpr (escape::enabled || quote::enabled) {
memcpy(curr, end, n);
}
end += n;
curr += n;
}
template <typename Delim> template <typename Delim>
std::tuple<size_t, bool> match_delimiter(char* begin, const Delim& delim) { std::tuple<size_t, bool> match_delimiter(line_ptr_type begin,
char* end_i = begin; const Delim& delim) {
line_ptr_type end = begin;
trim_if_enabled(end_i); trim_if_enabled(end);
// just spacing // just spacing
if (*end_i == '\0') { if (*end == '\0') {
return {0, false}; return {0, false};
} }
// not a delimiter // not a delimiter
if (!match(end_i, delim)) { if (!match(end, delim)) {
shift_if_escaped(end_i); shift_if_escaped(end);
return {1 + end_i - begin, false}; return {1 + end - begin, false};
} }
end_i += delimiter_size(delim); end += delimiter_size(delim);
trim_if_enabled(end_i); trim_if_enabled(end);
// delimiter // delimiter
return {end_i - begin, true}; return {end - begin, true};
}
////////////////
// shifting
////////////////
// Advances both cursors by one character. When the line is writable
// (quoting or escaping is enabled) the character under end_ is first copied
// to curr_, compacting the column in place.
void shift() {
    if constexpr (!is_const_line) {
        *curr_ = *end_;
    }
    ++curr_;
    ++end_;
}
void shift(size_t n) {
if constexpr (!is_const_line) {
memcpy(curr_, end_, n);
}
end_ += n;
curr_ += n;
} }
void push_and_start_next(size_t n) { void push_and_start_next(size_t n) {
output_.emplace_back(begin, curr); output_.emplace_back(begin_, curr_);
begin = end + n; begin_ = end_ + n;
state_ = state::begin; state_ = state::begin;
} }
public: split_input& split_impl_select_delim(
bool valid() { const std::string& delimiter = default_delimiter) {
return error_.empty(); switch (delimiter.size()) {
}
split_input& split(char* new_line, const std::string& d = ",") {
line = new_line;
output_.clear();
switch (d.size()) {
case 0: case 0:
// set error set_error_empty_delimiter();
return output_; return output_;
case 1: case 1:
return split_impl(d[0]); return split_impl(delimiter[0]);
default: default:
return split_impl(d); return split_impl(delimiter);
} }
} }
template <typename Delim> template <typename Delim>
std::vector<range>& split_impl(const Delim& delim) { split_input& split_impl(const Delim& delim) {
state_ = state::begin; state_ = state::begin;
begin = line; begin_ = line_;
trim_if_enabled(begin); trim_if_enabled(begin_);
while (state_ != state::finished) { while (state_ != state::finished) {
curr = end = begin; curr_ = end_ = begin_;
switch (state_) { switch (state_) {
case (state::begin): case (state::begin):
state_begin(); state_begin();
@ -216,10 +325,14 @@ public:
return output_; return output_;
} }
////////////////
// states
////////////////
void state_begin() { void state_begin() {
if constexpr (quote::enabled) { if constexpr (quote::enabled) {
if (quote::match(*begin)) { if (quote::match(*begin_)) {
++begin; ++begin_;
state_ = state::quoting; state_ = state::quoting;
return; return;
} }
@ -230,13 +343,13 @@ public:
template <typename Delim> template <typename Delim>
void state_reading(const Delim& delim) { void state_reading(const Delim& delim) {
while (true) { while (true) {
auto [width, valid] = match_delimiter(end, delim); auto [width, valid] = match_delimiter(end_, delim);
// not a delimiter // not a delimiter
if (!valid) { if (!valid) {
if (width == 0) { if (width == 0) {
// eol // eol
output_.emplace_back(begin, curr); output_.emplace_back(begin_, curr_);
state_ = state::finished; state_ = state::finished;
break; break;
} else { } else {
@ -255,16 +368,16 @@ public:
void state_quoting(const Delim& delim) { void state_quoting(const Delim& delim) {
if constexpr (quote::enabled) { if constexpr (quote::enabled) {
while (true) { while (true) {
if (quote::match(*end)) { if (quote::match(*end_)) {
// double quote // double quote
// eg: ...,"hel""lo,... -> hel"lo // eg: ...,"hel""lo,... -> hel"lo
if (quote::match(end[1])) { if (quote::match(end_[1])) {
++end; ++end_;
shift(); shift();
continue; continue;
} }
auto [width, valid] = match_delimiter(end + 1, delim); auto [width, valid] = match_delimiter(end_ + 1, delim);
// not a delimiter // not a delimiter
if (!valid) { if (!valid) {
@ -272,10 +385,11 @@ public:
// eol // eol
// eg: ...,"hello" \0 -> hello // eg: ...,"hello" \0 -> hello
// eg no trim: ...,"hello"\0 -> hello // eg no trim: ...,"hello"\0 -> hello
output_.emplace_back(begin, curr); output_.emplace_back(begin_, curr_);
} else { } else {
// missmatched quote // missmatched quote
// eg: ...,"hel"lo,... -> error // eg: ...,"hel"lo,... -> error
// or not
} }
state_ = state::finished; state_ = state::finished;
break; break;
@ -287,8 +401,8 @@ public:
} }
if constexpr (escape::enabled) { if constexpr (escape::enabled) {
if (escape::match(*end)) { if (escape::match(*end_)) {
++end; ++end_;
shift(); shift();
continue; continue;
} }
@ -296,27 +410,31 @@ public:
// unterminated error // unterminated error
// eg: ..."hell\0 -> quote not terminated // eg: ..."hell\0 -> quote not terminated
if (*end == '\0') { if (*end_ == '\0') {
*curr = '\0'; set_error_unterminated_quote();
output_.emplace_back(line_, begin_);
state_ = state::finished; state_ = state::finished;
break; break;
} }
shift(); shift();
} }
} else {
// set error impossible scenario
state_ = state::finished;
} }
} }
private: ////////////////
std::vector<range> output_; // members
std::string error_ = ""; ////////////////
// Column ranges produced by the last split/resplit.
std::vector<string_range> output_;
// Error message, used only in error_string mode.
std::string string_error_;
// Both flags are initialised so that valid() and unterminated_quote() are
// well defined even before the first split()/resplit() has run clear_error().
bool bool_error_{false};
bool unterminated_quote_{false};
enum error_mode error_mode_ { error_mode::error_bool };
// Cursors into the line being split: begin_ marks the start of the current
// column, curr_ is the write position, end_ is the read position and line_
// is the start of the line. Null until a line is supplied.
line_ptr_type begin_{nullptr};
line_ptr_type curr_{nullptr};
line_ptr_type end_{nullptr};
line_ptr_type line_{nullptr};
state state_; state state_;
char* curr;
char* end;
char* begin;
char* line;
}; };
} /* ss */ } /* ss */

View File

@ -416,7 +416,7 @@ TEST_CASE("testing splitter escape and trim") {
} }
TEST_CASE("testing splitter quote and escape and trim") { TEST_CASE("testing splitter quote and escape and trim") {
auto guard = set_combinations_size(4); auto guard = set_combinations_size(3);
case_type case1 = spaced({R"("\"")", R"(\")", R"("""")"}, " "); case_type case1 = spaced({R"("\"")", R"(\")", R"("""")"}, " ");
case_type case2 = case_type case2 =
spaced({R"("x\"x")", R"(x\"x)", R"(x"x)", R"("x""x")"}, " "); spaced({R"("x\"x")", R"(x\"x)", R"(x"x)", R"("x""x")"}, " ");
@ -467,3 +467,97 @@ TEST_CASE("testing splitter quote and escape and trim") {
ss::trim<' ', '\t'>>(p, {","}); ss::trim<' ', '\t'>>(p, {","});
} }
} }
// Compile-time check only: with neither quoting nor escaping enabled,
// split() is expected to accept a `const char*` line (with and without
// trimming). Nothing below the early return ever executes, which matters
// because `line` is a null pointer.
TEST_CASE("testing splitter constnes if quoting and escaping are disabled") {
// to compile is enough
return;
const char* const line{};
ss::splitter s1;
ss::splitter<ss::trim<' '>> s2;
s1.split(line);
s2.split(line);
}
// Checks both error-reporting modes for the two error kinds exercised here.
// A splitter starts in error_bool mode, where a failure makes valid() false
// but leaves error_msg() empty; after switching to error_string mode the same
// failure must also produce a non-empty message.
TEST_CASE("testing error mode") {
{
// empty delimiter
ss::splitter s;
// default (bool) mode: failure is flagged, no message is stored
s.split(buff("just,some,strings"), "");
CHECK(!s.valid());
CHECK(!s.unterminated_quote());
CHECK(s.error_msg().empty());
// string mode: the same failure now carries a message
s.set_error_mode(ss::error_mode::error_string);
s.split(buff("just,some,strings"), "");
CHECK(!s.valid());
CHECK(!s.unterminated_quote());
CHECK(!s.error_msg().empty());
}
{
// unterminated quote
ss::splitter<ss::quote<'"'>> s;
// default (bool) mode: the unterminated-quote marker is set, no message
s.split(buff("\"just"));
CHECK(!s.valid());
CHECK(s.unterminated_quote());
CHECK(s.error_msg().empty());
// string mode: marker is set and a message is stored
s.set_error_mode(ss::error_mode::error_string);
s.split(buff("\"just"));
CHECK(!s.valid());
CHECK(s.unterminated_quote());
CHECK(!s.error_msg().empty());
}
}
// Test helper: splits `line` with `s`, checks that the split failed with an
// unterminated quote, and returns a copy of the resulting ranges (`auto`
// drops the reference returned by split()).
// NOTE(review): `buff` is defined outside this view; presumably it yields a
// writable char buffer holding the text — confirm.
template <typename Splitter>
auto expect_unterminated_quote(Splitter& s, const std::string& line) {
auto vec = s.split(buff(line.c_str()));
CHECK(!s.valid());
CHECK(s.unterminated_quote());
return vec;
}
// Exercises the split -> resplit flow: a first split ends inside an open
// quote, then resplit() is fed the completed line and must yield all columns.
// On an unterminated quote the splitter appends one extra range that runs
// from the start of the line to just past the opening quote, which is why the
// last expected "word" after the failed split is the line prefix up to and
// including that quote.
TEST_CASE("testing unterminated quote") {
{
// the open quote is the very first column
ss::splitter<ss::quote<'"'>> s;
auto vec = expect_unterminated_quote(s, "\"just");
// only the marker range is present
CHECK(vec.size() == 1);
char new_line[] = R"("just",strings)";
vec = s.resplit(new_line, strlen(new_line));
CHECK(s.valid());
CHECK(!s.unterminated_quote());
std::vector<std::string> expected{"just", "strings"};
CHECK(words(vec) == expected);
}
{
// unquoted columns precede the open quote
ss::splitter<ss::quote<'"'>> s;
auto vec = expect_unterminated_quote(s, "just,some,\"random");
// two finished columns followed by the marker range
std::vector<std::string> expected{"just", "some", "just,some,\""};
CHECK(words(vec) == expected);
char new_line[] = R"(just,some,"random",strings)";
vec = s.resplit(new_line, strlen(new_line));
CHECK(s.valid());
CHECK(!s.unterminated_quote());
expected = {"just", "some", "random", "strings"};
CHECK(words(vec) == expected);
}
{
// quoted columns precede the open quote; the completed column contains
// the delimiter inside the quotes
ss::splitter<ss::quote<'"'>> s;
auto vec = expect_unterminated_quote(s, R"("just","some","ran)");
std::vector<std::string> expected{"just", "some", R"("just","some",")"};
CHECK(words(vec) == expected);
char new_line[] = R"("just","some","ran,dom","strings")";
vec = s.resplit(new_line, strlen(new_line));
CHECK(s.valid());
CHECK(!s.unterminated_quote());
expected = {"just", "some", "ran,dom", "strings"};
CHECK(words(vec) == expected);
}
}