From d978e986de5dc278753319f357a6154cd0c85571 Mon Sep 17 00:00:00 2001 From: ado Date: Mon, 14 Aug 2023 23:19:02 +0200 Subject: [PATCH] Enable ignore_empty functionality --- include/ss/parser.hpp | 182 +++++++++++++++++++++++++++++++++++------- test/test_parser.cpp | 53 ++++++++++-- 2 files changed, 202 insertions(+), 33 deletions(-) diff --git a/include/ss/parser.hpp b/include/ss/parser.hpp index a976b55..28d5f4b 100644 --- a/include/ss/parser.hpp +++ b/include/ss/parser.hpp @@ -1,5 +1,7 @@ #pragma once +#define BUFF_TAIL 2 + // TODO remove or rename #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) @@ -699,26 +701,31 @@ private: } // TODO check buff_ - buff_ = static_cast(std::malloc(buff_size_)); + buff_ = static_cast(std::malloc(buff_size_ + BUFF_TAIL)); // TODO check buff_filled buff_filled_ = std::fread(buff_, 1, buff_size_, file_); if (buff_filled_ != buff_size_) { last_read_ = true; - } - // TODO handle differently - if (buff_filled_ == 0 || (buff_filled_ == 1 && buff_[0] == '\n') || - (buff_filled_ == 2 && buff_[0] == '\r' && buff_[1] == '\n')) { - fclose(file_); - file_ = nullptr; + // TODO handle differently + if (buff_filled_ == 0 || + (buff_filled_ == 1 && buff_[0] == '\n') || + (buff_filled_ == 2 && buff_[0] == '\r' && + buff_[1] == '\n')) { + eof_ = true; + } } begin_ = buff_; curr_ = buff_; shifted_curr_ = buff_; end_ = buff_ + buff_filled_; + + if constexpr (ignore_empty) { + ignore_all_empty_lines(); + } } ~reader() { @@ -744,7 +751,8 @@ private: buff_size_ *= 8; // TODO handle NULL - buff_ = static_cast(std::realloc(buff_, buff_size_)); + buff_ = + static_cast(std::realloc(buff_, buff_size_ + BUFF_TAIL)); // fill the rest of the buffer buff_filled_ += fread(buff_ + buff_filled_, 1, @@ -871,6 +879,9 @@ private: void shift_if_escaped(line_ptr_type& curr) { if constexpr (escape::enabled) { if (escape::match(*curr)) { + if (curr_ + 1 >= end_) { + throw "out of range"; + } // TODO handle differently if (curr[1] == '\0') { if constexpr (!multiline::enabled) { @@ -897,6 +908,10 @@ private: } } + bool reached_end() { + return curr_ >= end_ && last_read_; + } + void shift_and_set_shifted_current() { if constexpr (!is_const_line) { if (escaped_ > 0) { @@ -930,7 +945,9 @@ private: if constexpr (!is_const_line) { ++escaped_; } + ++curr_; + check_buff_end(); } void shift_push_and_start_next(size_t n) { @@ -993,9 +1010,37 @@ private: } } + void handle_error_unterminated_escape() { + constexpr static auto error_msg = + "unterminated escape at the end of the line"; + + if constexpr (string_error) { + error_.clear(); + error_.append(error_msg); + } else if constexpr (throw_on_error) { + throw ss::exception{error_msg}; + } else { + error_ = true; + } + } + + void handle_error_unterminated_quote() { + constexpr static auto error_msg = "unterminated quote"; + + if constexpr (string_error) { + error_.clear(); + error_.append(error_msg); + } else if constexpr (throw_on_error) { + throw ss::exception{error_msg}; + } else { + error_ = true; + } + } + void go_to_next_line() { while (*curr_ != '\n') { ++curr_; + check_buff_end(); } } @@ -1024,19 +1069,43 @@ private: } // TODO handle - if (*curr_ == '\n') { - ++line_number_; - check_buff_end(); - handle_error_empty_line(); - return; - } + while (true) { + if (*curr_ == '\n') { + ++line_number_; + if constexpr (!ignore_empty) { + check_buff_end(); + handle_error_empty_line(); + return; + } else { + ++curr_; + if (reached_end()) { + eof_ = true; + return; + } + check_buff_end(); + continue; + } + } - if (*curr_ == '\r' && *(curr_ + 1) == '\n') { - ++line_number_; - ++curr_; - check_buff_end(); - handle_error_empty_line(); - return; + if (*curr_ == '\r' && *(curr_ + 1) == '\n') { + ++line_number_; + if constexpr (!ignore_empty) { + ++curr_; + check_buff_end(); + handle_error_empty_line(); + return; + } else { + curr_ += 2; + if (reached_end()) { + eof_ = true; + return; + } + check_buff_end(); + continue; + } + } + + break; } while (true) { @@ -1098,14 +1167,25 @@ private: // not eol shift_and_jump_escape(); - check_buff_end(); ++curr_; + + if (reached_end()) { + handle_error_unterminated_quote(); + return; + } + + check_buff_end(); continue; } } ++curr_; + if (reached_end()) { + handle_error_unterminated_quote(); + return; + } + check_buff_end(); continue; } @@ -1139,6 +1219,7 @@ private: // eg no trim: ...,"hello"\n -> hello shift_and_push(); ++curr_; + check_buff_end(); // TODO handle differently if (curr_[0] == '\r') { ++curr_; @@ -1179,10 +1260,16 @@ private: // TODO handle differently if (*curr_ == '\r') { ++curr_; + check_buff_end(); } return; } else { curr_ += width; + // TODO check + if (reached_end()) { + handle_error_unterminated_escape(); + return; + } check_buff_end(); continue; } @@ -1197,9 +1284,10 @@ private: } } - // read next line each time in order to set eof_ - void read_next() { + // TODO rename + void check_and_cycle_buffer() { // TODO update division value + buff_processed_ = curr_ - buff_; if (buff_processed_ > buff_filled_ / 2) { if (!last_read_) { shift_read_next(); @@ -1212,6 +1300,11 @@ private: end_ = buff_ + buff_filled_; } } + } + + // read next line each time in order to set eof_ + void read_next() { + check_and_cycle_buffer(); split_data_.clear(); begin_ = curr_; @@ -1232,10 +1325,45 @@ private: } ++curr_; - buff_processed_ = curr_ - buff_; + if (last_read_) { + if (curr_ >= end_) { + eof_ = true; + return; + } + } - if (last_read_ && curr_ >= end_) { - eof_ = true; + if constexpr (ignore_empty) { + ignore_all_empty_lines(); + } + } + + void ignore_all_empty_lines() { + if constexpr (ignore_empty) { + while (curr_ < end_) { + check_and_cycle_buffer(); + if (*curr_ == '\n') { + ++line_number_; + ++curr_; + if (reached_end()) { + eof_ = true; + break; + } + check_buff_end(); + continue; + } + if (curr_[0] == '\r' && curr_[1] == '\n') { + ++line_number_; + curr_ += 2; + if (reached_end()) { + eof_ = true; + break; + } + check_buff_end(); + continue; + } + eof_ = false; + break; + } } } @@ -1247,7 +1375,7 @@ private: size_t line_number_{0}; // TODO set initial buffer size - size_t buff_size_{1}; + size_t buff_size_{8}; size_t buff_filled_{0}; size_t buff_processed_{0}; diff --git a/test/test_parser.cpp b/test/test_parser.cpp index 38700e0..9fe7248 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -1641,6 +1641,7 @@ void test_ignore_empty(const std::vector& data) { test_ignore_empty_impl(data); } +// TODO test with different initial buffer sizes TEST_CASE("parser test various cases with empty lines") { test_ignore_empty({{1, 2, "x"}, {3, 4, "y"}, {9, 10, "v"}, {11, 12, "w"}}); @@ -1670,13 +1671,53 @@ TEST_CASE("parser test various cases with empty lines") { {9, 10, X::empty}, {11, 12, X::empty}}); - test_ignore_empty( - {{1, 2, "x"}, {3, 4, X::empty}, {9, 10, X::empty}, {11, 12, X::empty}}); + test_ignore_empty({{1, 2, X::empty}, + {3, 4, X::empty}, + {5, 6, X::empty}, + {7, 8, X::empty}, + {9, 10, X::empty}, + {11, 12, X::empty}, + {13, 14, X::empty}, + {15, 16, X::empty}, + {17, 18, X::empty}}); - test_ignore_empty( - {{1, 2, X::empty}, {3, 4, X::empty}, {9, 10, X::empty}, {11, 12, "w"}}); + test_ignore_empty({{1, 2, X::empty}, + {3, 4, X::empty}, + {5, 6, X::empty}, + {7, 8, X::empty}, + {9, 10, X::empty}, + {11, 12, X::empty}, + {13, 14, X::empty}, + {15, 16, X::empty}, + {17, 18, "x"}}); - test_ignore_empty({{11, 12, X::empty}}); + test_ignore_empty({{1, 2, "x"}, + {3, 4, X::empty}, + {9, 10, X::empty}, + {11, 12, X::empty}}); - test_ignore_empty({}); + test_ignore_empty({{1, 2, "x"}, + {3, 4, X::empty}, + {3, 4, X::empty}, + {5, 6, X::empty}, + {7, 8, X::empty}, + {9, 10, X::empty}, + {11, 12, X::empty}}); + + test_ignore_empty({{1, 2, "x"}, + {3, 4, X::empty}, + {3, 4, X::empty}, + {5, 6, X::empty}, + {7, 8, X::empty}, + {9, 10, X::empty}, + {11, 12, "y"}}); + + test_ignore_empty({{1, 2, X::empty}, + {3, 4, X::empty}, + {9, 10, X::empty}, + {11, 12, "w"}}); + + test_ignore_empty({{11, 12, X::empty}}); + + test_ignore_empty({}); }