From 035e27c5ab6d8c62062721c087c29eb5c1e33f6a Mon Sep 17 00:00:00 2001 From: ado Date: Sun, 31 Jan 2021 23:08:46 +0100 Subject: [PATCH] add converter tests with quote trim and escape, enable resplit on converter, make parser handle multi-line csv, add unit tests --- include/ss/converter.hpp | 18 ++++- include/ss/parser.hpp | 137 ++++++++++++++++++++++++++++++++------- include/ss/splitter.hpp | 12 ++-- test/test_converter.cpp | 50 +++++++++++++- test/test_parser.cpp | 69 ++++++++++++++++++++ 5 files changed, 249 insertions(+), 37 deletions(-) diff --git a/include/ss/converter.hpp b/include/ss/converter.hpp index 950bfeb..2e95ac9 100644 --- a/include/ss/converter.hpp +++ b/include/ss/converter.hpp @@ -110,12 +110,14 @@ constexpr bool tied_class_v = tied_class::value; template class converter { constexpr static auto default_delimiter = ","; + using line_ptr_type = typename splitter::line_ptr_type; public: // parses line with given delimiter, returns a 'T' object created with // extracted values of type 'Ts' template - T convert_object(char* line, const std::string& delim = default_delimiter) { + T convert_object(line_ptr_type line, + const std::string& delim = default_delimiter) { return to_object(convert(line, delim)); } @@ -123,7 +125,7 @@ public: // extracted values of type 'Ts' template no_void_validator_tup_t convert( - char* line, const std::string& delim = default_delimiter) { + line_ptr_type line, const std::string& delim = default_delimiter) { input_ = split(line, delim); /* TODO if (!splitter_.valid()) { @@ -181,6 +183,10 @@ public: : bool_error_ == false; } + bool unterminated_quote() const { + return splitter_.unterminated_quote(); + } + const std::string& error_msg() const { return string_error_; } @@ -191,7 +197,7 @@ public: // 'splits' string by given delimiter, returns vector of pairs which // contain the beginnings and the ends of each column of the string - const split_input& split(char* line, + const split_input& split(line_ptr_type line, const 
std::string& delim = default_delimiter) { input_.clear(); if (line[0] == '\0') { @@ -202,6 +208,12 @@ public: return input_; } + const split_input& resplit(line_ptr_type new_line, ssize_t new_size, + const std::string& delim = default_delimiter) { + input_ = splitter_.resplit(new_line, new_size, delim); + return input_; + } + private: //////////////// // error diff --git a/include/ss/parser.hpp b/include/ss/parser.hpp index 4aafc22..4f7dcd6 100644 --- a/include/ss/parser.hpp +++ b/include/ss/parser.hpp @@ -9,6 +9,9 @@ #include #include +// TODO remove +#include + namespace ss { template @@ -40,7 +43,7 @@ public: void set_error_mode(error_mode mode) { error_mode_ = mode; - buff_.set_error_mode(mode); + reader_.set_error_mode(mode); } const std::string& error_msg() const { @@ -52,7 +55,7 @@ public: } bool ignore_next() { - return buff_.read(file_); + return reader_.read(file_); } template @@ -62,16 +65,16 @@ public: template no_void_validator_tup_t get_next() { - buff_.update(); + reader_.update(); clear_error(); if (eof_) { set_error_eof_reached(); return {}; } - auto value = buff_.get_converter().template convert(); + auto value = reader_.get_converter().template convert(); - if (!buff_.get_converter().valid()) { + if (!reader_.get_converter().valid()) { set_error_invalid_conversion(); } @@ -160,8 +163,8 @@ public: no_void_validator_tup_t try_same() { parser_.clear_error(); auto value = - parser_.buff_.get_converter().template convert(); - if (!parser_.buff_.get_converter().valid()) { + parser_.reader_.get_converter().template convert(); + if (!parser_.reader_.get_converter().valid()) { parser_.set_error_invalid_conversion(); } return value; @@ -244,40 +247,124 @@ private: // line reading //////////////// - class buffer { + class reader { char* buffer_{nullptr}; char* next_line_buffer_{nullptr}; + char* helper_buffer_{nullptr}; converter converter_; converter next_line_converter_; size_t size_{0}; + size_t helper_size_{0}; const std::string& delim_; - public: - 
buffer(const std::string& delimiter) : delim_{delimiter} { + bool crlf; + + bool escaped_eol(size_t size) { + // escaped new line + if constexpr (setup::escape::enabled) { + const char* curr; + for (curr = next_line_buffer_ + size - 1; + curr >= next_line_buffer_ && + setup::escape::match(*curr); + --curr) { + } + return (next_line_buffer_ - curr + size) % 2 == 0; + } + + return false; } - ~buffer() { - free(buffer_); - free(next_line_buffer_); + bool unterminated_quote() { + // unterminated quote + if constexpr (ss::setup::quote::enabled) { + if (next_line_converter_.unterminated_quote()) { + return true; + } + } + return false; } - bool read(FILE* file) { - ssize_t size = getline(&next_line_buffer_, &size_, file); - size_t string_end = size - 1; + void undo_remove_eol(size_t& string_end) { + if (crlf) { + memcpy(next_line_buffer_ + string_end, "\r\n\0", 3); + string_end += 2; + } else { + memcpy(next_line_buffer_ + string_end, "\n\0", 2); + string_end += 1; + } + } - if (size == -1) { + size_t remove_eol(char*& buffer, size_t size) { + size_t new_size = size - 1; + if (size >= 2 && buffer[size - 2] == '\r') { + crlf = true; + new_size--; + } else { + crlf = false; + } + + buffer[new_size] = '\0'; + return new_size; + } + + void realloc_concat(char*& first, size_t& first_size, + const char* const second, size_t second_size) { + first = static_cast(realloc(static_cast(first), + first_size + second_size + 2)); + + memcpy(first + first_size, second, second_size + 1); + first_size += second_size; + } + + bool append_line(FILE* file, char*& dst_buffer, size_t& dst_size) { + undo_remove_eol(dst_size); + + ssize_t ssize = getline(&helper_buffer_, &helper_size_, file); + if (ssize == -1) { return false; } - if (size >= 2 && next_line_buffer_[size - 2] == '\r') { - string_end--; + size_t size = remove_eol(helper_buffer_, ssize); + realloc_concat(dst_buffer, dst_size, helper_buffer_, size); + return true; + } + + public: + reader(const std::string& delimiter) :
delim_{delimiter} { + } + + ~reader() { + free(buffer_); + free(next_line_buffer_); + free(helper_buffer_); + } + + bool read(FILE* file) { + ssize_t ssize = getline(&next_line_buffer_, &size_, file); + + if (ssize == -1) { + return false; + } + + size_t size = remove_eol(next_line_buffer_, ssize); + + while (escaped_eol(size)) { + if (!append_line(file, next_line_buffer_, size)) { + return false; + } } - next_line_buffer_[string_end] = '\0'; next_line_converter_.split(next_line_buffer_, delim_); + while (unterminated_quote()) { + if (!append_line(file, next_line_buffer_, size)) { + return false; + } + next_line_converter_.resplit(next_line_buffer_, size); + } + return true; } @@ -290,7 +377,7 @@ private: return converter_; } - const char* get() const { + const char* get_buffer() const { return buffer_; } @@ -301,7 +388,7 @@ private: }; void read_line() { - eof_ = !buff_.read(file_); + eof_ = !reader_.read(file_); ++line_number_; } @@ -341,9 +428,9 @@ private: .append(" ") .append(std::to_string(line_number_)) .append(": ") - .append(buff_.get_converter().error_msg()) + .append(reader_.get_converter().error_msg()) .append(": \"") - .append(buff_.get()) + .append(reader_.get_buffer()) .append("\""); } else { bool_error_ = true; @@ -360,7 +447,7 @@ private: bool bool_error_{false}; error_mode error_mode_{error_mode::error_bool}; FILE* file_{nullptr}; - buffer buff_{delim_}; + reader reader_{delim_}; size_t line_number_{0}; bool eof_{false}; }; diff --git a/include/ss/splitter.hpp b/include/ss/splitter.hpp index 917bdcb..e73b825 100644 --- a/include/ss/splitter.hpp +++ b/include/ss/splitter.hpp @@ -5,9 +5,6 @@ #include #include -// TODO remove -#include - namespace ss { template struct matcher { @@ -99,16 +96,17 @@ private: using escape = typename setup::escape; constexpr static auto is_const_line = !quote::enabled && !escape::enabled; + +public: using line_ptr_type = typename ternary::type; -public: bool valid() const { return (error_mode_ == 
error_mode::error_string) ? string_error_.empty() : bool_error_ == false; } - bool unterminated_quote() { + bool unterminated_quote() const { return unterminated_quote_; } @@ -120,7 +118,7 @@ public: error_mode_ = mode; } - split_input& split(line_ptr_type new_line, + const split_input& split(line_ptr_type new_line, const std::string& delimiter = default_delimiter) { output_.clear(); return resplit(new_line, -1, delimiter); @@ -133,7 +131,7 @@ public: } } - split_input& resplit(line_ptr_type new_line, ssize_t new_size, + const split_input& resplit(line_ptr_type new_line, ssize_t new_size, const std::string& delimiter = default_delimiter) { line_ = new_line; diff --git a/test/test_converter.cpp b/test/test_converter.cpp index 5bcab32..fa2fab7 100644 --- a/test/test_converter.cpp +++ b/test/test_converter.cpp @@ -46,8 +46,7 @@ TEST_CASE("testing valid conversions") { CHECK(tup == 5); } { - // TODO make \t -> ' ' - auto tup = c.convert(buff("junk\t5\tjunk"), "\t"); + auto tup = c.convert(buff("junk 5 junk"), " "); REQUIRE(c.valid()); CHECK(tup == 5); } @@ -398,3 +397,50 @@ TEST_CASE("testing error mode") { CHECK(!c.valid()); CHECK(!c.error_msg().empty()); } + +TEST_CASE("testing converter with quotes spacing and escaping") { + { + ss::converter c; + + auto tup = c.convert( + R"("just","some","strings")"); + REQUIRE(c.valid()); + CHECK(tup == std::make_tuple("\"just\"", "\"some\"", "\"strings\"")); + } + + { + ss::converter> c; + + auto tup = c.convert( + buff(R"("just",some,"12.3","a")")); + REQUIRE(c.valid()); + CHECK(tup == std::make_tuple("just", "some", 12.3, 'a')); + } + + { + ss::converter> c; + + auto tup = c.convert( + R"( just , some , 12.3 ,a )"); + REQUIRE(c.valid()); + CHECK(tup == std::make_tuple("just", "some", 12.3, 'a')); + } + + { + ss::converter> c; + + auto tup = + c.convert(buff(R"(ju\,st,strings)")); + REQUIRE(c.valid()); + CHECK(tup == std::make_tuple("ju,st", "strings")); + } + + { + ss::converter, ss::trim<' '>, ss::quote<'"'>> c; + + auto tup 
= c.convert( + buff(R"( ju\,st , "so,me" , 12.34 , "str""ings")")); + REQUIRE(c.valid()); + CHECK(tup == std::make_tuple("ju,st", "so,me", 12.34, "str\"ings")); + } +} diff --git a/test/test_parser.cpp b/test/test_parser.cpp index f9518d0..905f1a2 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -514,3 +514,72 @@ TEST_CASE("testing error mode") { CHECK(!p.valid()); CHECK(!p.error_msg().empty()); } + +std::string no_quote(const std::string& s) { + if (!s.empty() && s[0] == '"') { + return {std::next(begin(s)), std::prev(end(s))}; + } + return s; +} + +TEST_CASE("testing csv on multiple lines with quotes") { + unique_file_name f; + std::vector data = {{1, 2, "\"x\nx\nx\""}, {3, 4, "\"y\ny\ny\""}, + {5, 6, "\"z\nz\""}, {7, 8, "\"u\"\"\""}, + {9, 10, "v"}, {11, 12, "\"w\n\""}}; + make_and_write(f.name, data); + for (auto& [_, __, s] : data) { + s = no_quote(s); + if (s[0] == 'u') { + s = "u\""; + } + } + + ss::parser> p{f.name, ","}; + p.set_error_mode(ss::error_mode::error_string); + std::vector i; + + while (!p.eof()) { + auto a = p.get_next(); + auto [x, y, z] = a; + std::cout << "=====================" << std::endl; + std::cout << x << ' ' << y << ' ' << z << std::endl; + i.emplace_back(ss::to_object(a)); + } + + CHECK(std::equal(i.begin(), i.end(), data.begin())); +} + +std::string no_escape(std::string& s) { + s.erase(std::remove(begin(s), end(s), '\\'), end(s)); + return s; +} + +TEST_CASE("testing csv on multiple lines with escapes") { + unique_file_name f; + std::vector data = {{1, 2, "x\\\nx\\\nx"}, {3, 4, "y\\\ny\\\ny"}, + {5, 6, "z\\\nz"}, {7, 8, "u"}, + {9, 10, "v\\\\"}, {11, 12, "w\\\n"}}; + + make_and_write(f.name, data); + for (auto& [_, __, s] : data) { + s = no_escape(s); + if (s == "v") { + s = "v\\"; + } + } + + ss::parser> p{f.name, ","}; + p.set_error_mode(ss::error_mode::error_string); + std::vector i; + + while (!p.eof()) { + auto a = p.get_next(); + auto [x, y, z] = a; + std::cout << "=====================" << std::endl; + 
std::cout << x << ' ' << y << ' ' << z << std::endl; + i.emplace_back(ss::to_object(a)); + } + + CHECK(std::equal(i.begin(), i.end(), data.begin())); +}