Enable ignore_empty functionality

This commit is contained in:
ado 2023-08-14 23:19:02 +02:00
parent 99d445bafe
commit d978e986de
2 changed files with 202 additions and 33 deletions

View File

@ -1,5 +1,7 @@
#pragma once #pragma once
#define BUFF_TAIL 2
// TODO remove or rename // TODO remove or rename
#define likely(x) __builtin_expect(!!(x), 1) #define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0) #define unlikely(x) __builtin_expect(!!(x), 0)
@ -699,26 +701,31 @@ private:
} }
// TODO check buff_ // TODO check buff_
buff_ = static_cast<char*>(std::malloc(buff_size_)); buff_ = static_cast<char*>(std::malloc(buff_size_ + BUFF_TAIL));
// TODO check buff_filled // TODO check buff_filled
buff_filled_ = std::fread(buff_, 1, buff_size_, file_); buff_filled_ = std::fread(buff_, 1, buff_size_, file_);
if (buff_filled_ != buff_size_) { if (buff_filled_ != buff_size_) {
last_read_ = true; last_read_ = true;
}
// TODO handle differently // TODO handle differently
if (buff_filled_ == 0 || (buff_filled_ == 1 && buff_[0] == '\n') || if (buff_filled_ == 0 ||
(buff_filled_ == 2 && buff_[0] == '\r' && buff_[1] == '\n')) { (buff_filled_ == 1 && buff_[0] == '\n') ||
fclose(file_); (buff_filled_ == 2 && buff_[0] == '\r' &&
file_ = nullptr; buff_[1] == '\n')) {
eof_ = true;
}
} }
begin_ = buff_; begin_ = buff_;
curr_ = buff_; curr_ = buff_;
shifted_curr_ = buff_; shifted_curr_ = buff_;
end_ = buff_ + buff_filled_; end_ = buff_ + buff_filled_;
if constexpr (ignore_empty) {
ignore_all_empty_lines();
}
} }
~reader() { ~reader() {
@ -744,7 +751,8 @@ private:
buff_size_ *= 8; buff_size_ *= 8;
// TODO handle NULL // TODO handle NULL
buff_ = static_cast<char*>(std::realloc(buff_, buff_size_)); buff_ =
static_cast<char*>(std::realloc(buff_, buff_size_ + BUFF_TAIL));
// fill the rest of the buffer // fill the rest of the buffer
buff_filled_ += fread(buff_ + buff_filled_, 1, buff_filled_ += fread(buff_ + buff_filled_, 1,
@ -871,6 +879,9 @@ private:
void shift_if_escaped(line_ptr_type& curr) { void shift_if_escaped(line_ptr_type& curr) {
if constexpr (escape::enabled) { if constexpr (escape::enabled) {
if (escape::match(*curr)) { if (escape::match(*curr)) {
if (curr_ + 1 >= end_) {
throw "out of range";
}
// TODO handle differently // TODO handle differently
if (curr[1] == '\0') { if (curr[1] == '\0') {
if constexpr (!multiline::enabled) { if constexpr (!multiline::enabled) {
@ -897,6 +908,10 @@ private:
} }
} }
// Tells whether the parser has run out of input: true only when last_read_
// is set and curr_ is at or beyond end_.
bool reached_end() {
    if (!last_read_) {
        // more data may still be read into the buffer
        return false;
    }
    return curr_ >= end_;
}
void shift_and_set_shifted_current() { void shift_and_set_shifted_current() {
if constexpr (!is_const_line) { if constexpr (!is_const_line) {
if (escaped_ > 0) { if (escaped_ > 0) {
@ -930,7 +945,9 @@ private:
if constexpr (!is_const_line) { if constexpr (!is_const_line) {
++escaped_; ++escaped_;
} }
++curr_; ++curr_;
check_buff_end();
} }
void shift_push_and_start_next(size_t n) { void shift_push_and_start_next(size_t n) {
@ -993,9 +1010,37 @@ private:
} }
} }
void handle_error_unterminated_escape() {
constexpr static auto error_msg =
"unterminated escape at the end of the line";
if constexpr (string_error) {
error_.clear();
error_.append(error_msg);
} else if constexpr (throw_on_error) {
throw ss::exception{error_msg};
} else {
error_ = true;
}
}
void handle_error_unterminated_quote() {
constexpr static auto error_msg = "unterminated quote";
if constexpr (string_error) {
error_.clear();
error_.append(error_msg);
} else if constexpr (throw_on_error) {
throw ss::exception{error_msg};
} else {
error_ = true;
}
}
// Advances curr_ one character at a time until it points at a '\n'.
// NOTE(review): the loop has no explicit end-of-input test of its own; it
// presumably relies on check_buff_end() (defined elsewhere) — confirm.
void go_to_next_line() { void go_to_next_line() {
while (*curr_ != '\n') { while (*curr_ != '\n') {
++curr_; ++curr_;
// invoked after every single-character advance
check_buff_end();
} }
} }
@ -1024,19 +1069,43 @@ private:
} }
// TODO handle // TODO handle
if (*curr_ == '\n') { while (true) {
++line_number_; if (*curr_ == '\n') {
check_buff_end(); ++line_number_;
handle_error_empty_line(); if constexpr (!ignore_empty) {
return; check_buff_end();
} handle_error_empty_line();
return;
} else {
++curr_;
if (reached_end()) {
eof_ = true;
return;
}
check_buff_end();
continue;
}
}
if (*curr_ == '\r' && *(curr_ + 1) == '\n') { if (*curr_ == '\r' && *(curr_ + 1) == '\n') {
++line_number_; ++line_number_;
++curr_; if constexpr (!ignore_empty) {
check_buff_end(); ++curr_;
handle_error_empty_line(); check_buff_end();
return; handle_error_empty_line();
return;
} else {
curr_ += 2;
if (reached_end()) {
eof_ = true;
return;
}
check_buff_end();
continue;
}
}
break;
} }
while (true) { while (true) {
@ -1098,14 +1167,25 @@ private:
// not eol // not eol
shift_and_jump_escape(); shift_and_jump_escape();
check_buff_end();
++curr_; ++curr_;
if (reached_end()) {
handle_error_unterminated_quote();
return;
}
check_buff_end();
continue; continue;
} }
} }
++curr_; ++curr_;
if (reached_end()) {
handle_error_unterminated_quote();
return;
}
check_buff_end(); check_buff_end();
continue; continue;
} }
@ -1139,6 +1219,7 @@ private:
// eg no trim: ...,"hello"\n -> hello // eg no trim: ...,"hello"\n -> hello
shift_and_push(); shift_and_push();
++curr_; ++curr_;
check_buff_end();
// TODO handle differently // TODO handle differently
if (curr_[0] == '\r') { if (curr_[0] == '\r') {
++curr_; ++curr_;
@ -1179,10 +1260,16 @@ private:
// TODO handle differently // TODO handle differently
if (*curr_ == '\r') { if (*curr_ == '\r') {
++curr_; ++curr_;
check_buff_end();
} }
return; return;
} else { } else {
curr_ += width; curr_ += width;
// TODO check
if (reached_end()) {
handle_error_unterminated_escape();
return;
}
check_buff_end(); check_buff_end();
continue; continue;
} }
@ -1197,9 +1284,10 @@ private:
} }
} }
// read next line each time in order to set eof_ // TODO rename
void read_next() { void check_and_cycle_buffer() {
// TODO update division value // TODO update division value
buff_processed_ = curr_ - buff_;
if (buff_processed_ > buff_filled_ / 2) { if (buff_processed_ > buff_filled_ / 2) {
if (!last_read_) { if (!last_read_) {
shift_read_next(); shift_read_next();
@ -1212,6 +1300,11 @@ private:
end_ = buff_ + buff_filled_; end_ = buff_ + buff_filled_;
} }
} }
}
// read next line each time in order to set eof_
void read_next() {
check_and_cycle_buffer();
split_data_.clear(); split_data_.clear();
begin_ = curr_; begin_ = curr_;
@ -1232,10 +1325,45 @@ private:
} }
++curr_; ++curr_;
buff_processed_ = curr_ - buff_; if (last_read_) {
if (curr_ >= end_) {
eof_ = true;
return;
}
}
if (last_read_ && curr_ >= end_) { if constexpr (ignore_empty) {
eof_ = true; ignore_all_empty_lines();
}
}
// Skips every consecutive empty line ("\n" or "\r\n") starting at curr_,
// incrementing line_number_ once per skipped line. The whole body is compiled
// only when ignore_empty is enabled.
// eof_ is set to true when reached_end() reports true right after a skipped
// line, and set to false as soon as a position that does not start an empty
// line is found. If the loop ends because curr_ reached end_, eof_ is left
// unchanged.
void ignore_all_empty_lines() {
if constexpr (ignore_empty) {
while (curr_ < end_) {
// runs before each inspection of *curr_
check_and_cycle_buffer();
// empty line terminated by a bare '\n'
if (*curr_ == '\n') {
++line_number_;
++curr_;
if (reached_end()) {
eof_ = true;
break;
}
check_buff_end();
continue;
}
// empty line terminated by "\r\n"
// NOTE(review): curr_[1] can index the byte at end_ when '\r' is the last
// filled byte; presumably covered by the BUFF_TAIL bytes added to the buffer
// allocation — confirm those tail bytes hold a defined value.
if (curr_[0] == '\r' && curr_[1] == '\n') {
++line_number_;
curr_ += 2;
if (reached_end()) {
eof_ = true;
break;
}
check_buff_end();
continue;
}
// curr_ is at something other than an empty line: stop skipping
eof_ = false;
break;
}
} }
} }
@ -1247,7 +1375,7 @@ private:
size_t line_number_{0}; size_t line_number_{0};
// TODO set initial buffer size // TODO set initial buffer size
size_t buff_size_{1}; size_t buff_size_{8};
size_t buff_filled_{0}; size_t buff_filled_{0};
size_t buff_processed_{0}; size_t buff_processed_{0};

View File

@ -1641,6 +1641,7 @@ void test_ignore_empty(const std::vector<X>& data) {
test_ignore_empty_impl<ss::throw_on_error>(data); test_ignore_empty_impl<ss::throw_on_error>(data);
} }
// TODO test with different initial buffer sizes
TEST_CASE("parser test various cases with empty lines") { TEST_CASE("parser test various cases with empty lines") {
test_ignore_empty({{1, 2, "x"}, {3, 4, "y"}, {9, 10, "v"}, {11, 12, "w"}}); test_ignore_empty({{1, 2, "x"}, {3, 4, "y"}, {9, 10, "v"}, {11, 12, "w"}});
@ -1670,13 +1671,53 @@ TEST_CASE("parser test various cases with empty lines") {
{9, 10, X::empty}, {9, 10, X::empty},
{11, 12, X::empty}}); {11, 12, X::empty}});
test_ignore_empty( test_ignore_empty({{1, 2, X::empty},
{{1, 2, "x"}, {3, 4, X::empty}, {9, 10, X::empty}, {11, 12, X::empty}}); {3, 4, X::empty},
{5, 6, X::empty},
{7, 8, X::empty},
{9, 10, X::empty},
{11, 12, X::empty},
{13, 14, X::empty},
{15, 16, X::empty},
{17, 18, X::empty}});
test_ignore_empty( test_ignore_empty({{1, 2, X::empty},
{{1, 2, X::empty}, {3, 4, X::empty}, {9, 10, X::empty}, {11, 12, "w"}}); {3, 4, X::empty},
{5, 6, X::empty},
{7, 8, X::empty},
{9, 10, X::empty},
{11, 12, X::empty},
{13, 14, X::empty},
{15, 16, X::empty},
{17, 18, "x"}});
test_ignore_empty({{11, 12, X::empty}}); test_ignore_empty({{1, 2, "x"},
{3, 4, X::empty},
{9, 10, X::empty},
{11, 12, X::empty}});
test_ignore_empty({}); test_ignore_empty({{1, 2, "x"},
{3, 4, X::empty},
{3, 4, X::empty},
{5, 6, X::empty},
{7, 8, X::empty},
{9, 10, X::empty},
{11, 12, X::empty}});
test_ignore_empty({{1, 2, "x"},
{3, 4, X::empty},
{3, 4, X::empty},
{5, 6, X::empty},
{7, 8, X::empty},
{9, 10, X::empty},
{11, 12, "y"}});
test_ignore_empty({{1, 2, X::empty},
{3, 4, X::empty},
{9, 10, X::empty},
{11, 12, "w"}});
test_ignore_empty({{11, 12, X::empty}});
test_ignore_empty({});
} }