From 31cbd20f8f1d904830aa8a2b60748a11081942ec Mon Sep 17 00:00:00 2001 From: ado Date: Fri, 28 Jul 2023 00:17:22 +0200 Subject: [PATCH] WIP, Continue writing additional parser tests --- include/ss/parser.hpp | 6 +- include/ss/setup.hpp | 2 +- test/test_helpers.hpp | 23 +++ test/test_parser.cpp | 357 ++++++++++++++++++++++++++++++----------- test/test_splitter.cpp | 20 --- 5 files changed, 289 insertions(+), 119 deletions(-) diff --git a/include/ss/parser.hpp b/include/ss/parser.hpp index 5f4cae7..b47c433 100644 --- a/include/ss/parser.hpp +++ b/include/ss/parser.hpp @@ -776,9 +776,9 @@ private: std::vector get_header() { std::vector header; std::string header_buffer = next_line_buffer_; - converter_.split(header_buffer.data(), delim_); - auto& header_row_raw = converter_.splitter_.split_data_; - for (const auto& [begin, end] : header_row_raw) { + ss::splitter splitter; + splitter.split(header_buffer.data(), delim_); + for (const auto& [begin, end] : splitter.split_data_) { header.emplace_back(begin, end); } return header; diff --git a/include/ss/setup.hpp b/include/ss/setup.hpp index ae2fc7f..6c5fdc9 100644 --- a/include/ss/setup.hpp +++ b/include/ss/setup.hpp @@ -269,7 +269,7 @@ private: static_assert( !multiline::enabled || (multiline::enabled && (quote::enabled || escape::enabled)), - "to enable multiline either quote or escape need to be enabled"); + "to enable multiline either quote or escape needs to be enabled"); static_assert(!(trim_all::enabled && trim_left_only::enabled) && !(trim_all::enabled && trim_right_only::enabled), diff --git a/test/test_helpers.hpp b/test/test_helpers.hpp index 022076f..273699c 100644 --- a/test/test_helpers.hpp +++ b/test/test_helpers.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include #ifdef CMAKE_GITHUB_CI #include @@ -87,3 +88,25 @@ struct buffer { } catch (ss::exception & e) { \ CHECK_FALSE(std::string{e.what()}.empty()); \ } + +template +std::vector> vector_combinations( + const std::vector& v, size_t n) { + std::vector> ret; + if (n <= 1) { + for (const auto& i : v) { + ret.push_back({i}); + } + return ret; + } + + auto inner_combinations = vector_combinations(v, n - 1); + for (const auto& i : v) { + for (auto j : inner_combinations) { + j.insert(j.begin(), i); + ret.push_back(move(j)); + } + } + return ret; +} + diff --git a/test/test_parser.cpp b/test/test_parser.cpp index 8f47481..f0b0992 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -27,6 +27,7 @@ struct unique_file_name { } ~unique_file_name() { + // TODO uncomment // std::filesystem::remove(name); } }; @@ -692,7 +693,6 @@ std::string no_quote(const std::string& s) { } TEST_CASE("parser test csv on multiple lines with quotes") { - // TODO test with "_""_""_",... unique_file_name f; std::vector data = {{1, 2, "\"x\r\nx\nx\""}, {3, 4, "\"y\ny\r\ny\""}, @@ -1174,7 +1174,7 @@ struct random_number_generator { } bool rand_bool() { - return rand() % 4 == 0; + return (rand() % 100) > 50; } template @@ -1233,11 +1233,11 @@ column make_column(const std::string& input_header, } if (!setup::escape::enabled && !setup::quote::enabled) { - if (!setup::trim_left::enabled && el.has_spaces_left) { + if (setup::trim_left::enabled && el.has_spaces_left) { continue; } - if (!setup::trim_right::enabled && el.has_spaces_right) { + if (setup::trim_right::enabled && el.has_spaces_right) { continue; } } @@ -1245,18 +1245,22 @@ column make_column(const std::string& input_header, filtered_fields.push_back(el); } - return column{.header = input_header, .fields = filtered_fields}; + column c; + c.header = input_header; + c.fields = filtered_fields; + return c; } void replace_all2(std::string& s, const std::string& old_value, const std::string& new_value) { - while (true) { + for (size_t i = 0; i < 999; ++i) { size_t pos = s.find(old_value); if (pos == std::string::npos) { return; } s.replace(pos, old_value.size(), new_value); } + FAIL("bad replace"); } template @@ -1266,9 +1270,11 @@ std::vector generate_csv_data(const std::vector& data, using setup = ss::setup; constexpr static auto escape = '\\'; constexpr static auto quote = '"'; + constexpr static auto space = ' '; + constexpr static auto new_line = '\n'; constexpr static auto helper0 = '#'; constexpr static auto helper1 = '$'; - constexpr static auto new_line = '\n'; + // constexpr static auto helper3 = '&'; std::vector output; @@ -1279,6 +1285,8 @@ std::vector generate_csv_data(const std::vector& data, replace_all2(value, {escape, quote}, {helper1}); bool quote_newline = rng.rand_bool(); + bool quote_spacings = rng.rand_bool(); + bool has_spaces = el.has_spaces_right || el.has_spaces_left; // handle escape replace_all2(value, {escape}, {helper0}); @@ -1296,7 +1304,8 @@ std::vector generate_csv_data(const std::vector& data, replace_all2(value, {escape, quote}, {helper1}); - if (rng.rand_bool() || quote_newline) { + if (rng.rand_bool() || quote_newline || + (quote_spacings && has_spaces)) { replace_all2(value, {quote}, {helper0}); if (rng.rand_bool()) { replace_all2(value, {helper0}, {escape, quote}); @@ -1308,25 +1317,43 @@ std::vector generate_csv_data(const std::vector& data, replace_all2(value, {helper1}, {escape, quote}); + if (!quote_spacings && has_spaces) { + replace_all2(value, {escape, space}, {helper0}); + replace_all2(value, {space}, {helper0}); + replace_all2(value, {helper0}, {escape, space}); + } + output.push_back(value); } } else if (setup::escape::enabled) { for (const auto& el : data) { auto value = el.value; + replace_all2(value, {escape}, {helper0}); rng.rand_insert_n(value, escape, 3); replace_all2(value, {new_line}, {helper1}); replace_all2(value, {helper1}, {escape, new_line}); + replace_all2(value, {escape, escape}, {escape}); replace_all2(value, {escape, helper0}, {helper0}); + replace_all2(value, {helper0, escape}, {helper0}); replace_all2(value, {helper0}, {escape, escape}); + + if (setup::trim_right::enabled || setup::trim_left::enabled) { + // escape space + replace_all2(value, {escape, space}, {helper0}); + replace_all2(value, {space}, {helper0}); + replace_all2(value, {helper0}, {escape, space}); + } + output.push_back(value); } } else if (setup::quote::enabled) { for (const auto& el : data) { auto value = el.value; - if (rng.rand_bool() || el.has_new_line) { + if (rng.rand_bool() || el.has_new_line || el.has_spaces_left || + el.has_spaces_right) { replace_all2(value, {quote}, {helper0}); replace_all2(value, {helper0}, {quote, quote}); value = std::string{quote} + value + std::string{quote}; @@ -1375,13 +1402,31 @@ void write_to_file(const std::vector& data, template void test_combinations(const std::vector& input_data, - const std::string& delim) { + const std::string& delim, bool include_header) { // TODO test without string_error using setup = ss::setup; unique_file_name f; std::vector> expected_data; - size_t n = rng.rand() % 10; + std::vector header; + std::vector field_header; + + for (const auto& el : input_data) { + header.push_back(el.header); + field_header.push_back(field{el.header}); + } + + if (include_header) { + auto header_data = generate_csv_data(field_header, delim); + write_to_file(header_data, delim, f.name); + } + + std::vector layout; + size_t n = 1 + rng.rand() % 10; + + for (size_t i = 0; i < input_data.size(); ++i) { + layout.push_back(i); + } for (size_t i = 0; i < n; ++i) { std::vector raw_data; @@ -1398,84 +1443,153 @@ void test_combinations(const std::vector& input_data, auto data = generate_csv_data(raw_data, delim); write_to_file(data, delim, f.name); - // TODO remove + /* std::cout << "[."; for (const auto& el : data) { std::cout << el << '.'; } std::cout << "]" << std::endl; + */ } - std::cout << delim << std::endl; - ss::parser p{f.name, delim}; + auto layout_combinations = vector_combinations(layout, layout.size()); - auto check_error = [&p] { - CHECK(p.valid()); - if (!p.valid()) { - std::cout << p.error_msg() << std::endl; + auto remove_duplicates = [](const auto& vec) { + std::vector unique_vec; + std::unordered_set vec_set; + for (const auto& el : vec) { + if (vec_set.find(el) == vec_set.end()) { + vec_set.insert(el); + unique_vec.push_back(el); + } } + + return unique_vec; }; - for (size_t i = 0; i < n; ++i) { - switch (expected_data[i].size()) { - case 0: - // TODO handle; - break; - case 1: { - auto s0 = p.template get_next(); - check_error(); - std::cout << s0 << std::endl; - CHECK(s0 == expected_data[i][0].value); - break; + std::vector> unique_layout_combinations; + for (const auto& layout : layout_combinations) { + unique_layout_combinations.push_back(remove_duplicates(layout)); + } + + if (!include_header) { + unique_layout_combinations.clear(); + unique_layout_combinations.push_back(layout); + } + + for (const auto& layout : unique_layout_combinations) { + ss::parser p{f.name, delim}; + + if (include_header) { + std::vector fields; + for (const auto& index : layout) { + fields.push_back(header[index]); + } + + p.use_fields(fields); + + if (!p.valid()) { + std::cout << p.error_msg() << std::endl; + } + + REQUIRE(p.valid()); } - case 2: { - auto [s0, s1] = p.template get_next(); - check_error(); - std::cout << s0 << ' ' << s1 << std::endl; - CHECK(s0 == expected_data[i][0].value); - CHECK(s1 == expected_data[i][1].value); - break; - } - case 3: { - auto [s0, s1, s2] = - p.template get_next(); - check_error(); - std::cout << s0 << ' ' << s1 << ' ' << s2 << std::endl; - CHECK(s0 == expected_data[i][0].value); - CHECK(s1 == expected_data[i][1].value); - CHECK(s2 == expected_data[i][2].value); - break; - } - case 4: { - auto [s0, s1, s2, s3] = - p.template get_next(); - check_error(); - std::cout << s0 << ' ' << s1 << ' ' << s2 << ' ' << s3 << std::endl; - CHECK(s0 == expected_data[i][0].value); - CHECK(s1 == expected_data[i][1].value); - CHECK(s2 == expected_data[i][2].value); - CHECK(s3 == expected_data[i][3].value); - break; - } - case 5: { - auto [s0, s1, s2, s3, s4] = - p.template get_next(); - check_error(); - std::cout << s0 << ' ' << s1 << ' ' << s2 << ' ' << s3 << ' ' << s4 - << std::endl; - CHECK(s0 == expected_data[i][0].value); - CHECK(s1 == expected_data[i][1].value); - CHECK(s2 == expected_data[i][2].value); - CHECK(s3 == expected_data[i][3].value); - CHECK(s4 == expected_data[i][4].value); - break; - } - // ... - default: - // TODO handle - break; + + auto check_error = [&p] { + CHECK(p.valid()); + if (!p.valid()) { + std::cout << p.error_msg() << std::endl; + } + }; + + int num_columns = layout.size(); + for (size_t i = 0; i < n + 1; ++i) { + switch (num_columns) { + case 1: { + auto s0 = p.template get_next(); + if (i < n) { + check_error(); + // std::cout << s0 << std::endl; + CHECK(s0 == expected_data[i][layout[0]].value); + } else { + CHECK(p.eof()); + CHECK(!p.valid()); + } + break; + } + case 2: { + auto [s0, s1] = p.template get_next(); + if (i < n) { + check_error(); + // std::cout << s0 << ' ' << s1 << std::endl; + CHECK(s0 == expected_data[i][layout[0]].value); + CHECK(s1 == expected_data[i][layout[1]].value); + } else { + CHECK(p.eof()); + CHECK(!p.valid()); + } + break; + } + case 3: { + auto [s0, s1, s2] = + p.template get_next(); + if (i < n) { + check_error(); + // std::cout << s0 << ' ' << s1 << ' ' << s2 << std::endl; + CHECK(s0 == expected_data[i][layout[0]].value); + CHECK(s1 == expected_data[i][layout[1]].value); + CHECK(s2 == expected_data[i][layout[2]].value); + } else { + CHECK(p.eof()); + CHECK(!p.valid()); + } + break; + } + case 4: { + auto [s0, s1, s2, s3] = + p.template get_next(); + if (i < n) { + check_error(); + /* + std::cout << s0 << ' ' << s1 << ' ' << s2 << ' ' << s3 + << std::endl; + */ + CHECK(s0 == expected_data[i][layout[0]].value); + CHECK(s1 == expected_data[i][layout[1]].value); + CHECK(s2 == expected_data[i][layout[2]].value); + CHECK(s3 == expected_data[i][layout[3]].value); + } else { + CHECK(p.eof()); + CHECK(!p.valid()); + } + break; + } + case 5: { + auto [s0, s1, s2, s3, s4] = + p.template get_next(); + if (i < n) { + check_error(); + //std::cout << s0 << ' ' << s1 << ' ' << s2 << ' ' << s3 + // << ' ' << s4 << std::endl; + CHECK(s0 == expected_data[i][layout[0]].value); + CHECK(s1 == expected_data[i][layout[1]].value); + CHECK(s2 == expected_data[i][layout[2]].value); + CHECK(s3 == expected_data[i][layout[3]].value); + CHECK(s4 == expected_data[i][layout[4]].value); + } else { + CHECK(p.eof()); + CHECK(!p.valid()); + } + break; + } + default: + FAIL(("Invalid number of columns: " + + std::to_string(num_columns))); + break; + } } } } @@ -1483,30 +1597,56 @@ void test_combinations(const std::vector& input_data, // TODO rename template void test_combinations_impl() { - column data0 = - make_column("data0", {field{111}, field{11}, field{1}}); + column ints0 = + make_column("ints0", {field{123}, field{45}, field{6}}); + column ints1 = + make_column("ints1", {field{123}, field{45}, field{6}}); + column ints2 = + make_column("ints2", {field{123}, field{45}, field{6}}); - column data1 = make_column("data1", {field{"hel\\lo"}, field{"h\ni"}, - field{"new\nline"}}); + column floats0 = + make_column("floats0", {field{1.23}, field{456.7}, field{0.8}, + field{910}, field{123456789.987654321}}); + column floats1 = + make_column("floats1", {field{1.23}, field{456.7}, field{0.8}, + field{910}, field{123456789.987654321}}); + column floats2 = + make_column("floats2", {field{1.23}, field{456.7}, field{0.8}, + field{910}, field{123456789.987654321}}); - column data2 = - make_column("data2", {field{222}, field{22}, field{12345}}); + column strings0 = + make_column("strings0", {field{"just"}, field{"some"}, + field{"random"}, field{"string"}}); - column data3 = - make_column("data3", {field{"h\"mm"}, field{"::::::::"}}); + column strings1 = + make_column("strings1", {field{"st\"rings"}, field{"w\"\"ith"}, + field{"qu\"otes\\"}, field{"\\a\\n\\d"}, + field{"escapes\""}}); - column data4 = - make_column("data4", {field{"h\"\"e\\llloooo"}, field{":D"}}); + column strings2 = + make_column("strings2", + {field{" with "}, field{" spaces"}, + field{"and "}, field{"\nnew"}, field{" \nlines"}, + field{" a\n\nn\n\nd "}, field{" \nso\n "}, + field{"on"}}); - auto columns0 = std::vector{data0, data1, data2, data3, data4}; - auto columns1 = std::vector{data4, data3, data2, data1, data0}; - auto columns2 = std::vector{data2, data3, data0, data4, data1}; + auto columns0 = std::vector{ints0, strings0, floats0, strings1, strings2}; + auto columns1 = std::vector{strings2, strings1, floats0, strings0, ints0}; + auto columns2 = std::vector{floats0, strings1, ints0, strings2, strings0}; + auto columns3 = std::vector{ints0, ints1, ints2}; + auto columns4 = std::vector{floats0, floats1, floats2}; + auto columns5 = std::vector{strings1, strings2}; + auto columns6 = std::vector{strings1}; + auto columns7 = std::vector{strings2}; - for (size_t i = 0; i < 2; ++i) { - for (const auto& delimiter: {",", "-", "--"}) { - test_combinations(columns0, delimiter); - test_combinations(columns1, delimiter); - test_combinations(columns2, delimiter); + for (size_t i = 0; i < 3; ++i) { + for (const auto& delimiter : {",", "-", "--"}) { + for (const auto& columns : + {columns0, columns1, columns2, columns3, columns4, columns5, + columns6, columns7}) { + test_combinations(columns, delimiter, false); + test_combinations(columns, delimiter, true); + } } } } @@ -1520,10 +1660,37 @@ TEST_CASE("parser test various cases version 2") { using multiline = ss::multiline; test_combinations_impl<>(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); + test_combinations_impl(); } diff --git a/test/test_splitter.cpp b/test/test_splitter.cpp index d6d4369..7b69d65 100644 --- a/test/test_splitter.cpp +++ b/test/test_splitter.cpp @@ -127,26 +127,6 @@ std::vector combinations(const std::vector& v, return ret; } -std::vector> vector_combinations( - const std::vector& v, size_t n) { - std::vector> ret; - if (n <= 1) { - for (const auto& i : v) { - ret.push_back({i}); - } - return ret; - } - - auto inner_combinations = vector_combinations(v, n - 1); - for (const auto& i : v) { - for (auto j : inner_combinations) { - j.insert(j.begin(), i); - ret.push_back(move(j)); - } - } - return ret; -} - std::pair, std::vector>> make_combinations(const std::vector& input, const std::vector& output,