ssp/include/ss/parser.hpp

1070 lines
34 KiB
C++
Raw Normal View History

#pragma once
2021-02-21 22:22:18 +01:00
#include "common.hpp"
#include "converter.hpp"
#include "exception.hpp"
#include "extract.hpp"
#include "restrictions.hpp"
2021-01-10 23:51:20 +01:00
#include <cstdlib>
#include <cstring>
2020-12-12 23:32:06 +01:00
#include <optional>
#include <string>
#include <vector>
namespace ss {
template <typename... Options>
class parser {
// Compile-time configuration, all derived from the Options pack via setup<>.
// string_error / throw_on_error select how errors are surfaced (see the
// handle_error_* members, which branch on these two flags).
constexpr static auto string_error = setup<Options...>::string_error;
constexpr static auto throw_on_error = setup<Options...>::throw_on_error;
using multiline = typename setup<Options...>::multiline;
// error_ holds a message when string_error is set, otherwise a boolean flag.
using error_type = std::conditional_t<string_error, std::string, bool>;
// Multiline joining through escapes/quotes is only compiled in when both
// multiline and the respective escape/quote option are enabled.
constexpr static bool escaped_multiline_enabled =
multiline::enabled && setup<Options...>::escape::enabled;
constexpr static bool quoted_multiline_enabled =
multiline::enabled && setup<Options...>::quote::enabled;
// ignore_header: the constructors skip the first line instead of storing it.
constexpr static bool ignore_header = setup<Options...>::ignore_header;
// ignore_empty: reader::read_next keeps reading while the line is empty.
constexpr static bool ignore_empty = setup<Options...>::ignore_empty;
2022-03-28 19:11:41 +02:00
using header_splitter = ss::splitter<
ss::filter_not_t<ss::is_instance_of_multiline, Options...>>;
public:
parser(std::string file_name, std::string delim = ss::default_delimiter)
: file_name_{std::move(file_name)}, reader_{file_name_, delim} {
if (reader_.file_) {
read_line();
if constexpr (ignore_header) {
ignore_next();
} else {
2023-08-05 13:30:14 +02:00
raw_header_ = reader_.get_buffer();
}
} else {
2023-08-05 11:45:31 +02:00
handle_error_file_not_open();
eof_ = true;
}
}
parser(const char* const csv_data_buffer, size_t csv_data_size,
const std::string& delim = ss::default_delimiter)
: file_name_{"CSV data buffer"},
reader_{csv_data_buffer, csv_data_size, delim} {
if (csv_data_buffer) {
read_line();
if constexpr (ignore_header) {
ignore_next();
} else {
raw_header_ = reader_.get_buffer();
}
} else {
handle_error_null_buffer();
eof_ = true;
}
}
// The parser is move-only: moves are defaulted, while default construction
// and copying are disabled.
parser(parser&& other) noexcept = default;
parser& operator=(parser&& other) noexcept = default;
~parser() = default;
parser() = delete;
parser(const parser& other) = delete;
parser& operator=(const parser& other) = delete;
// Tells whether the parser currently holds no error.
// - string_error: true when the stored message is empty
// - throw_on_error (without string_error): always true
// - otherwise: true when the boolean error flag is not set
[[nodiscard]] bool valid() const {
    if constexpr (!string_error && throw_on_error) {
        return true;
    } else if constexpr (string_error) {
        return error_.empty();
    } else {
        return !error_;
    }
}
// Returns the stored error message by reference. Guarded by
// assert_string_error_defined, so it is meant for the string_error setup.
[[nodiscard]] const std::string& error_msg() const {
    assert_string_error_defined<string_error>();
    const std::string& message = error_;
    return message;
}
[[nodiscard]] bool eof() const {
2022-02-27 19:40:23 +01:00
return eof_;
}
2022-02-27 19:40:23 +01:00
// Advances the reader by one line without converting it and forwards the
// result of reader::read_next (false once the input is exhausted).
bool ignore_next() {
    const bool advanced = reader_.read_next();
    return advanced;
}
template <typename T, typename... Ts>
[[nodiscard]] T get_object() {
return to_object<T>(get_next<Ts...>());
}
[[nodiscard]] size_t line() const {
2024-02-25 02:57:46 +01:00
return reader_.line_number_ > 0 ? reader_.line_number_ - 1
2023-08-06 19:56:28 +02:00
: reader_.line_number_;
2022-02-27 19:40:23 +01:00
}
// Returns the reader's chars_read_ counter, which the reader sets to the
// value of curr_char_ right before each line is fetched.
[[nodiscard]] size_t position() const {
    const size_t offset = reader_.chars_read_;
    return offset;
}
// Converts the buffered line into the requested types and buffers the line
// after it.
//
// Steps:
//  1. unless at eof, split the buffered "next" line (reader::parse)
//  2. swap next-line and current state (reader::update)
//  3. report split errors / eof, returning a value-initialized result
//  4. convert the current line, then read the following line ahead
//
// With throw_on_error, ss::exception coming from parsing or conversion is
// rethrown decorated with file name and line number, after the next line has
// been read.
template <typename T, typename... Ts>
[[nodiscard]] no_void_validator_tup_t<T, Ts...> get_next() {
    if constexpr (throw_on_error) {
        if (!eof_) {
            try {
                reader_.parse();
            } catch (const ss::exception& e) {
                read_line();
                decorate_rethrow(e);
            }
        }
    } else {
        if (!eof_) {
            reader_.parse();
        }
    }

    reader_.update();

    // the split itself may have failed (e.g. multiline limit reached)
    if (!reader_.converter_.valid()) {
        handle_error_invalid_conversion();
        read_line();
        return {};
    }

    clear_error();

    if (eof_) {
        handle_error_eof_reached();
        return {};
    }

    if constexpr (throw_on_error) {
        try {
            auto row = reader_.converter_.template convert<T, Ts...>();
            read_line();
            return row;
        } catch (const ss::exception& e) {
            read_line();
            decorate_rethrow(e);
        }
    }

    auto row = reader_.converter_.template convert<T, Ts...>();
    if (!reader_.converter_.valid()) {
        handle_error_invalid_conversion();
    }

    read_line();
    return row;
}
[[nodiscard]] std::string raw_header() const {
assert_ignore_header_not_defined();
return raw_header_;
}
[[nodiscard]] std::vector<std::string> header() {
assert_ignore_header_not_defined();
clear_error();
header_splitter splitter;
std::string raw_header_copy = raw_header_;
if (!strict_split(splitter, raw_header_copy)) {
return {};
}
std::vector<std::string> split_header;
for (const auto& [begin, end] : splitter.get_split_data()) {
split_header.emplace_back(begin, end);
}
return split_header;
}
[[nodiscard]] bool field_exists(const std::string& field) {
assert_ignore_header_not_defined();
clear_error();
2023-08-05 13:30:14 +02:00
if (header_.empty()) {
split_header_data();
}
if (!valid()) {
return false;
}
return header_index(field).has_value();
2022-02-27 19:40:23 +01:00
}
template <typename... Ts>
void use_fields(const Ts&... fields_args) {
assert_ignore_header_not_defined();
clear_error();
if (header_.empty() && !eof()) {
2023-08-05 13:30:14 +02:00
split_header_data();
}
2022-02-27 19:40:23 +01:00
if (!valid()) {
return;
}
auto fields = std::vector<std::string>{fields_args...};
2023-08-04 21:22:23 +02:00
if (fields.empty()) {
handle_error_invalid_use_fields_argument();
2023-08-04 21:22:23 +02:00
return;
}
2022-02-27 19:40:23 +01:00
std::vector<size_t> column_mappings;
for (const auto& field : fields) {
if (std::count(fields.begin(), fields.end(), field) != 1) {
2023-08-05 11:45:31 +02:00
handle_error_field_used_multiple_times(field);
2022-02-27 19:40:23 +01:00
return;
}
auto index = header_index(field);
if (!index) {
2023-08-05 11:45:31 +02:00
handle_error_invalid_field(field);
2022-02-27 19:40:23 +01:00
return;
}
column_mappings.push_back(*index);
}
reader_.converter_.set_column_mapping(column_mappings, header_.size());
reader_.next_line_converter_.set_column_mapping(column_mappings,
header_.size());
2023-08-04 21:22:23 +02:00
2024-02-25 03:54:33 +01:00
if (line() == 0) {
2022-02-27 19:40:23 +01:00
ignore_next();
}
}
////////////////
// iterator
////////////////
// Range adaptor over a parser: begin() yields an iterator that already holds
// the first converted line, end() yields a detached iterator. With
// get_object set the lines are converted with get_object<T, Ts...>,
// otherwise with get_next<T, Ts...>.
template <bool get_object, typename T, typename... Ts>
struct iterable {
    struct iterator {
        using value = std::conditional_t<get_object, T,
                                         no_void_validator_tup_t<T, Ts...>>;

        // detached iterator, used as the end sentinel
        iterator() : parser_{nullptr}, value_{} {
        }

        iterator(parser<Options...>* parser) : parser_{parser}, value_{} {
        }

        iterator(const iterator& other) = default;
        iterator(iterator&& other) noexcept = default;
        ~iterator() = default;

        iterator& operator=(const iterator& other) = delete;
        iterator& operator=(iterator&& other) noexcept = delete;

        [[nodiscard]] value& operator*() {
            return value_;
        }

        [[nodiscard]] value* operator->() {
            return &value_;
        }

        // Fetches the next line, or detaches from the parser once it is
        // missing or reports eof.
        iterator& operator++() {
            if (parser_ && !parser_->eof()) {
                if constexpr (get_object) {
                    value_ =
                        std::move(parser_->template get_object<T, Ts...>());
                } else {
                    value_ =
                        std::move(parser_->template get_next<T, Ts...>());
                }
            } else {
                parser_ = nullptr;
            }
            return *this;
        }

        iterator operator++(int) {
            auto previous = *this;
            ++*this;
            return previous;
        }

        // Two detached iterators are equal; otherwise iterators are equal
        // only if they share the parser and are the very same object.
        [[nodiscard]] friend bool operator==(const iterator& lhs,
                                             const iterator& rhs) {
            if (lhs.parser_ == nullptr && rhs.parser_ == nullptr) {
                return true;
            }
            return lhs.parser_ == rhs.parser_ && &lhs.value_ == &rhs.value_;
        }

        [[nodiscard]] friend bool operator!=(const iterator& lhs,
                                             const iterator& rhs) {
            return !(lhs == rhs);
        }

    private:
        parser<Options...>* parser_;
        value value_;
    };

    iterable(parser<Options...>* parser) : parser_{parser} {
    }

    [[nodiscard]] iterator begin() {
        return ++iterator{parser_};
    }

    [[nodiscard]] iterator end() {
        return iterator{};
    }

private:
    parser<Options...>* parser_;
};
template <typename... Ts>
[[nodiscard]] auto iterate() {
return iterable<false, Ts...>{this};
}
template <typename... Ts>
[[nodiscard]] auto iterate_object() {
return iterable<true, Ts...>{this};
}
////////////////
// composite conversion
////////////////
template <typename... Ts>
class composite {
public:
composite(std::tuple<Ts...>&& values, parser& parser)
2022-02-27 19:40:23 +01:00
: values_{std::move(values)}, parser_{parser} {
}
// tries to convert the same line with a different output type
// only if the previous conversion was not successful,
// returns composite containing itself and the new output
// as optional, additionally, if a parameter is passed, and
// that parameter can be invoked using the converted value,
// than it will be invoked in the case of a valid conversion
2020-12-26 00:56:39 +01:00
template <typename... Us, typename Fun = none>
composite<Ts..., std::optional<no_void_validator_tup_t<Us...>>> or_else(
2020-12-26 00:56:39 +01:00
Fun&& fun = none{}) {
using Value = no_void_validator_tup_t<Us...>;
std::optional<Value> value;
try_convert_and_invoke<Value, Us...>(value, std::forward<Fun>(fun));
return composite_with(std::move(value));
}
// same as or_else, but saves the result into a 'U' object
// instead of a tuple
2020-12-26 00:56:39 +01:00
template <typename U, typename... Us, typename Fun = none>
composite<Ts..., std::optional<U>> or_object(Fun&& fun = none{}) {
std::optional<U> value;
try_convert_and_invoke<U, Us...>(value, std::forward<Fun>(fun));
return composite_with(std::move(value));
}
[[nodiscard]] std::tuple<Ts...> values() {
2022-02-27 19:40:23 +01:00
return values_;
}
template <typename Fun>
auto on_error(Fun&& fun) {
assert_throw_on_error_not_defined<throw_on_error>();
if (!parser_.valid()) {
if constexpr (std::is_invocable_v<Fun>) {
fun();
} else {
static_assert(string_error,
"to enable error messages within the "
"on_error method "
"callback string_error needs to be enabled");
std::invoke(std::forward<Fun>(fun), parser_.error_msg());
2020-12-12 23:32:06 +01:00
}
}
return *this;
}
2020-12-12 23:32:06 +01:00
private:
template <typename T>
[[nodiscard]] composite<Ts..., T> composite_with(T&& new_value) {
auto merged_values =
std::tuple_cat(std::move(values_),
2021-01-10 23:51:20 +01:00
std::tuple<T>{parser_.valid()
2021-01-17 21:46:36 +01:00
? std::forward<T>(new_value)
: std::nullopt});
return {std::move(merged_values), parser_};
}
2020-12-26 00:56:39 +01:00
template <typename U, typename... Us, typename Fun = none>
void try_convert_and_invoke(std::optional<U>& value, Fun&& fun) {
2024-02-21 01:25:53 +01:00
if (parser_.valid()) {
return;
}
2024-02-21 01:25:53 +01:00
auto tuple_output = try_same<Us...>();
if (!parser_.valid()) {
return;
}
2024-02-21 01:25:53 +01:00
if constexpr (!std::is_same_v<U, decltype(tuple_output)>) {
value = to_object<U>(std::move(tuple_output));
} else {
value = std::move(tuple_output);
}
2024-02-21 01:25:53 +01:00
parser_.try_invoke(*value, std::forward<Fun>(fun));
}
template <typename U, typename... Us>
[[nodiscard]] no_void_validator_tup_t<U, Us...> try_same() {
parser_.clear_error();
auto value =
parser_.reader_.converter_.template convert<U, Us...>();
if (!parser_.reader_.converter_.valid()) {
2023-08-05 11:45:31 +02:00
parser_.handle_error_invalid_conversion();
}
return value;
}
////////////////
// members
////////////////
std::tuple<Ts...> values_;
parser& parser_;
};
// tries to convert a line and returns a composite which is
// able to try additional conversions in case of failure
2020-12-26 00:56:39 +01:00
template <typename... Ts, typename Fun = none>
[[nodiscard]] composite<std::optional<no_void_validator_tup_t<Ts...>>>
try_next(Fun&& fun = none{}) {
assert_throw_on_error_not_defined<throw_on_error>();
using Ret = no_void_validator_tup_t<Ts...>;
return try_invoke_and_make_composite<
std::optional<Ret>>(get_next<Ts...>(), std::forward<Fun>(fun));
2021-01-19 20:26:36 +01:00
}
// Counterpart of try_next that converts with get_object<T, Ts...>(), so the
// composite carries an optional 'T' object rather than a tuple.
// Not available with throw_on_error.
template <typename T, typename... Ts, typename Fun = none>
[[nodiscard]] composite<std::optional<T>> try_object(Fun&& fun = none{}) {
    assert_throw_on_error_not_defined<throw_on_error>();
    using Result = std::optional<T>;
    return try_invoke_and_make_composite<Result>(get_object<T, Ts...>(),
                                                 std::forward<Fun>(fun));
}
private:
// tries to invoke the given function (see below), if the function
// returns a value which can be used as a conditional, and it returns
// false, the function sets an error, and allows the invoke of the
// next possible conversion as if the validation of the current one
// failed
2020-12-26 00:56:39 +01:00
template <typename Arg, typename Fun = none>
void try_invoke(Arg&& arg, Fun&& fun) {
2020-12-26 00:56:39 +01:00
constexpr bool is_none = std::is_same_v<std::decay_t<Fun>, none>;
if constexpr (!is_none) {
using Ret = decltype(try_invoke_impl(arg, std::forward<Fun>(fun)));
constexpr bool returns_void = std::is_same_v<Ret, void>;
if constexpr (!returns_void) {
if (!try_invoke_impl(std::forward<Arg>(arg),
std::forward<Fun>(fun))) {
2023-08-05 11:45:31 +02:00
handle_error_failed_check();
}
} else {
try_invoke_impl(arg, std::forward<Fun>(fun));
}
}
}
2020-12-26 00:56:39 +01:00
// tries to invoke the function if not none
// it first tries to invoke the function without arguments,
// than with one argument if the function accepts the whole tuple
// as an argument, and finally tries to invoke it with the tuple
// laid out as a parameter pack
2020-12-26 00:56:39 +01:00
template <typename Arg, typename Fun = none>
auto try_invoke_impl(Arg&& arg, Fun&& fun) {
2020-12-26 00:56:39 +01:00
constexpr bool is_none = std::is_same_v<std::decay_t<Fun>, none>;
if constexpr (!is_none) {
if constexpr (std::is_invocable_v<Fun>) {
return fun();
} else if constexpr (std::is_invocable_v<Fun, Arg>) {
return std::invoke(std::forward<Fun>(fun),
std::forward<Arg>(arg));
} else {
return std::apply(std::forward<Fun>(fun),
std::forward<Arg>(arg));
}
}
}
// Runs `fun` on the converted value if the parser is valid, then creates the
// initial composite: it holds the value if the parser is (still) valid and
// nullopt otherwise.
template <typename T, typename Fun = none>
[[nodiscard]] composite<T> try_invoke_and_make_composite(T&& value,
                                                         Fun&& fun) {
    const bool converted = valid();
    if (converted) {
        try_invoke(*value, std::forward<Fun>(fun));
    }

    // valid() is queried again since the callback may have failed the check
    return {valid() ? std::forward<T>(value) : std::nullopt, *this};
}
2022-02-27 19:40:23 +01:00
////////////////
// header
////////////////
void assert_ignore_header_not_defined() const {
static_assert(!ignore_header,
"cannot use this method when 'ignore_header' is defined");
}
// Splits `header` in place with `splitter` using the reader's delimiter.
// Returns false (after reporting the error) if the split is invalid. With
// throw_on_error an ss::exception from the splitter is rethrown decorated
// with the file name.
[[nodiscard]] bool strict_split(header_splitter& splitter,
                                std::string& header) {
    if constexpr (throw_on_error) {
        try {
            splitter.split(header.data(), reader_.delim_);
        } catch (const ss::exception& e) {
            decorate_rethrow_invalid_header_split(e);
        }
        return true;
    } else {
        splitter.split(header.data(), reader_.delim_);
        if (splitter.valid()) {
            return true;
        }
        handle_error_invalid_header_split(splitter);
        return false;
    }
}
2023-08-05 13:30:14 +02:00
void split_header_data() {
header_splitter splitter;
2023-08-05 13:30:14 +02:00
std::string raw_header_copy = raw_header_;
if (!strict_split(splitter, raw_header_copy)) {
return;
}
for (const auto& [begin, end] : splitter.get_split_data()) {
std::string field{begin, end};
if (field.empty()) {
handle_error_duplicate_header_field(field);
header_.clear();
return;
}
if (std::find(header_.begin(), header_.end(), field) !=
header_.end()) {
handle_error_duplicate_header_field(field);
header_.clear();
return;
}
header_.push_back(std::move(field));
2023-08-05 13:30:14 +02:00
}
}
// Returns the position of `field` within the split header, or nullopt if
// the header does not contain it.
[[nodiscard]] std::optional<size_t> header_index(const std::string& field) {
    const auto match = std::find(header_.begin(), header_.end(), field);
    if (match != header_.end()) {
        return static_cast<size_t>(std::distance(header_.begin(), match));
    }
    return std::nullopt;
}
////////////////
// error
////////////////
2020-12-11 18:14:06 +01:00
void clear_error() {
if constexpr (string_error) {
error_.clear();
} else {
error_ = false;
}
}
2021-01-10 23:51:20 +01:00
2023-08-05 11:45:31 +02:00
// Reports that a user supplied check returned false. Depending on the
// setup the message "<file>: failed check" is stored, thrown as
// ss::exception, or only the error flag is set.
void handle_error_failed_check() {
    constexpr static auto reason = ": failed check";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason};
    } else {
        error_ = true;
    }
}
// Reports that the parser was constructed with a null data buffer.
// Depending on the setup the message is stored, thrown as ss::exception,
// or only the error flag is set.
void handle_error_null_buffer() {
    constexpr static auto reason = ": received null data buffer";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason};
    } else {
        error_ = true;
    }
}
2023-08-05 11:45:31 +02:00
// Reports that the input file could not be opened. Depending on the setup
// the message is stored, thrown as ss::exception, or only the error flag is
// set.
void handle_error_file_not_open() {
    constexpr static auto reason = ": could not be opened";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason};
    } else {
        error_ = true;
    }
}
2023-08-05 11:45:31 +02:00
// Reports a read attempt after the end of the input. Depending on the setup
// the message is stored, thrown as ss::exception, or only the error flag is
// set.
void handle_error_eof_reached() {
    constexpr static auto reason = ": read on end of file";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason};
    } else {
        error_ = true;
    }
}
2023-08-05 11:45:31 +02:00
// Reports a failed conversion of the current line. With string_error the
// message is "<file> <line_number_>: <converter message>"; without
// string_error and without throw_on_error the error flag is set. Nothing is
// done here for throw_on_error.
void handle_error_invalid_conversion() {
    if constexpr (string_error) {
        error_ = file_name_;
        error_ += " ";
        error_ += std::to_string(reader_.line_number_);
        error_ += ": ";
        error_ += reader_.converter_.error_msg();
    } else if constexpr (!throw_on_error) {
        error_ = true;
    }
}
2023-08-05 11:45:31 +02:00
void handle_error_invalid_field(const std::string& field) {
constexpr static auto error_msg =
": header does not contain given field: ";
2022-02-27 19:40:23 +01:00
if constexpr (string_error) {
2023-08-05 11:45:31 +02:00
error_.clear();
error_.append(file_name_).append(error_msg).append(field);
} else if constexpr (throw_on_error) {
throw ss::exception{file_name_ + error_msg + field};
2022-02-27 19:40:23 +01:00
} else {
error_ = true;
}
}
2023-08-05 11:45:31 +02:00
void handle_error_field_used_multiple_times(const std::string& field) {
constexpr static auto error_msg = ": given field used multiple times: ";
2022-02-27 19:40:23 +01:00
if constexpr (string_error) {
2023-08-05 11:45:31 +02:00
error_.clear();
error_.append(file_name_).append(error_msg).append(field);
} else if constexpr (throw_on_error) {
throw ss::exception{file_name_ + error_msg + field};
2022-02-27 19:40:23 +01:00
} else {
error_ = true;
}
}
void handle_error_invalid_use_fields_argument() {
constexpr static auto error_msg =
"received invalid argument for 'use_fields'";
2023-08-04 21:22:23 +02:00
if constexpr (string_error) {
error_.clear();
error_.append(error_msg);
} else if constexpr (throw_on_error) {
throw ss::exception{error_msg};
} else {
error_ = true;
}
}
// Reports that the header contains an empty field. Depending on the setup
// the message is stored, thrown as ss::exception, or only the error flag is
// set.
void handle_error_invalid_header_field() {
    constexpr static auto reason = ": header contains empty field";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason};
    } else {
        error_ = true;
    }
}
// Reports that the header contains `field` more than once. Depending on the
// setup the message (ending with the field name) is stored, thrown as
// ss::exception, or only the error flag is set.
void handle_error_duplicate_header_field(const std::string& field) {
    constexpr static auto reason = ": header contains duplicate: ";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
        error_ += field;
    } else if constexpr (throw_on_error) {
        throw ss::exception{file_name_ + reason + field};
    } else {
        error_ = true;
    }
}
// Reports that the header could not be split. With string_error the
// splitter's own message is appended; otherwise the error flag is set.
// There is no throwing variant here: strict_split handles throw_on_error
// through decorate_rethrow_invalid_header_split.
void handle_error_invalid_header_split(const header_splitter& splitter) {
    constexpr static auto reason = ": failed header parsing: ";

    if constexpr (string_error) {
        error_ = file_name_;
        error_ += reason;
        error_ += splitter.error_msg();
    } else {
        error_ = true;
    }
}
// Throws a new ss::exception whose message is
// "<file>: failed header parsing: <original message>".
// Only usable with throw_on_error.
void decorate_rethrow_invalid_header_split(const ss::exception& e) const {
    static_assert(throw_on_error,
                  "throw_on_error needs to be enabled to use this method");

    std::string decorated{file_name_};
    decorated += ": failed header parsing: ";
    decorated += e.what();
    throw ss::exception{decorated};
}
// Throws a new ss::exception whose message is
// "<file> <line()>: <original message>". Only usable with throw_on_error.
void decorate_rethrow(const ss::exception& e) const {
    static_assert(throw_on_error,
                  "throw_on_error needs to be enabled to use this method");

    std::string decorated{file_name_};
    decorated += " ";
    decorated += std::to_string(line());
    decorated += ": ";
    decorated += e.what();
    throw ss::exception{decorated};
}
////////////////
// line reading
////////////////
2022-02-27 19:40:23 +01:00
// Buffers the next line and records eof when the reader has none left.
void read_line() {
    const bool has_line = reader_.read_next();
    eof_ = !has_line;
}
2021-02-07 21:29:12 +01:00
struct reader {
// File backed reader: opens the file in binary read mode. file_ stays null
// if the file could not be opened, which the parser constructor checks.
reader(const std::string& file_name_, std::string delim)
    : delim_{std::move(delim)},
      file_{std::fopen(file_name_.c_str(), "rb")} {
}
// Buffer backed reader: stores the pointer and size of the caller's CSV
// data. The destructor does not free csv_data_buffer_.
reader(const char* const buffer, size_t csv_data_size, std::string delim)
    : delim_{std::move(delim)},
      csv_data_buffer_{buffer},
      csv_data_size_{csv_data_size} {
}
// Move constructor: takes over the line buffers, converters, file handle
// and bookkeeping of `other`.
//
// `other` is left empty: besides the owning pointers, the buffer sizes and
// the data-buffer view are reset as well. Otherwise `other` would still
// claim non-zero buffer sizes while its buffers are null, and read_next()
// writes to next_line_buffer_[0] whenever next_line_buffer_size_ > 0.
reader(reader&& other) noexcept
    : buffer_{other.buffer_},
      next_line_buffer_{other.next_line_buffer_},
      helper_buffer_{other.helper_buffer_},
      converter_{std::move(other.converter_)},
      next_line_converter_{std::move(other.next_line_converter_)},
      buffer_size_{other.buffer_size_},
      next_line_buffer_size_{other.next_line_buffer_size_},
      helper_buffer_size{other.helper_buffer_size},
      delim_{std::move(other.delim_)}, file_{other.file_},
      csv_data_buffer_{other.csv_data_buffer_},
      csv_data_size_{other.csv_data_size_},
      curr_char_{other.curr_char_}, crlf_{other.crlf_},
      line_number_{other.line_number_}, chars_read_{other.chars_read_},
      next_line_size_{other.next_line_size_} {
    // ownership has been transferred, `other` must not free/close anything
    other.buffer_ = nullptr;
    other.next_line_buffer_ = nullptr;
    other.helper_buffer_ = nullptr;
    other.file_ = nullptr;
    other.csv_data_buffer_ = nullptr;

    // keep the sizes consistent with the (now null) buffers
    other.buffer_size_ = 0;
    other.next_line_buffer_size_ = 0;
    other.helper_buffer_size = 0;
    other.csv_data_size_ = 0;
    other.next_line_size_ = 0;
}
// Move assignment: releases the resources currently owned by this reader
// (line buffers and file handle) and then takes over those of `other`.
//
// Without the release step the previous buffers and FILE* would be
// overwritten and never freed/closed. `other` is left empty, with sizes
// reset so that they stay consistent with its null buffers.
reader& operator=(reader&& other) noexcept {
    if (this != &other) {
        // drop what this reader owns before it gets overwritten
        std::free(buffer_);
        std::free(next_line_buffer_);
        std::free(helper_buffer_);
        if (file_) {
            std::ignore = std::fclose(file_);
        }

        buffer_ = other.buffer_;
        next_line_buffer_ = other.next_line_buffer_;
        helper_buffer_ = other.helper_buffer_;
        converter_ = std::move(other.converter_);
        next_line_converter_ = std::move(other.next_line_converter_);
        buffer_size_ = other.buffer_size_;
        next_line_buffer_size_ = other.next_line_buffer_size_;
        helper_buffer_size = other.helper_buffer_size;
        delim_ = std::move(other.delim_);
        file_ = other.file_;
        csv_data_buffer_ = other.csv_data_buffer_;
        csv_data_size_ = other.csv_data_size_;
        curr_char_ = other.curr_char_;
        crlf_ = other.crlf_;
        line_number_ = other.line_number_;
        chars_read_ = other.chars_read_;
        next_line_size_ = other.next_line_size_;

        // ownership has been transferred
        other.buffer_ = nullptr;
        other.next_line_buffer_ = nullptr;
        other.helper_buffer_ = nullptr;
        other.file_ = nullptr;
        other.csv_data_buffer_ = nullptr;

        // keep the sizes consistent with the (now null) buffers
        other.buffer_size_ = 0;
        other.next_line_buffer_size_ = 0;
        other.helper_buffer_size = 0;
        other.csv_data_size_ = 0;
        other.next_line_size_ = 0;
    }

    return *this;
}
// Frees the three line buffers and closes the file if one is open.
// The result of fclose is deliberately discarded.
~reader() {
    std::free(buffer_);
    std::free(next_line_buffer_);
    std::free(helper_buffer_);

    if (file_ != nullptr) {
        std::ignore = std::fclose(file_);
    }
}
2020-12-11 18:14:06 +01:00
// A reader cannot be default constructed or copied; the destructor frees the
// raw buffers and closes the file handle it holds.
reader() = delete;
reader(const reader& other) = delete;
reader& operator=(const reader& other) = delete;
// read next line each time in order to set eof_
[[nodiscard]] bool read_next() {
next_line_converter_.clear_error();
2022-03-28 19:11:41 +02:00
size_t size = 0;
while (size == 0) {
++line_number_;
if (next_line_buffer_size_ > 0) {
2022-03-28 19:11:41 +02:00
next_line_buffer_[0] = '\0';
}
chars_read_ = curr_char_;
auto [ssize, eof] =
get_line(next_line_buffer_, next_line_buffer_size_, file_,
csv_data_buffer_, csv_data_size_, curr_char_);
2022-03-28 19:11:41 +02:00
if (eof) {
2022-03-28 19:11:41 +02:00
return false;
}
size = remove_eol(next_line_buffer_, ssize);
if constexpr (!ignore_empty) {
break;
}
}
2020-12-12 23:32:06 +01:00
next_line_size_ = size;
return true;
}
void parse() {
size_t limit = 0;
if constexpr (escaped_multiline_enabled) {
while (escaped_eol(next_line_size_)) {
if (multiline_limit_reached(limit)) {
return;
}
if (!append_next_line_to_buffer(next_line_buffer_,
next_line_size_,
next_line_buffer_size_)) {
2023-08-05 11:45:31 +02:00
next_line_converter_.handle_error_unterminated_escape();
return;
2021-02-06 21:08:59 +01:00
}
}
}
2021-01-10 23:51:20 +01:00
next_line_converter_.split(next_line_buffer_, delim_);
if constexpr (quoted_multiline_enabled) {
2021-02-06 21:08:59 +01:00
while (unterminated_quote()) {
2023-07-31 21:52:08 +02:00
next_line_size_ -= next_line_converter_.size_shifted();
if (multiline_limit_reached(limit)) {
return;
}
if (!append_next_line_to_buffer(next_line_buffer_,
next_line_size_,
next_line_buffer_size_)) {
2023-08-05 11:45:31 +02:00
next_line_converter_.handle_error_unterminated_quote();
return;
2021-02-06 21:08:59 +01:00
}
if constexpr (escaped_multiline_enabled) {
while (escaped_eol(next_line_size_)) {
if (multiline_limit_reached(limit)) {
return;
}
if (!append_next_line_to_buffer(
next_line_buffer_, next_line_size_,
next_line_buffer_size_)) {
next_line_converter_
2023-08-05 11:45:31 +02:00
.handle_error_unterminated_escape();
return;
}
}
}
2023-07-25 00:56:38 +02:00
next_line_converter_.resplit(next_line_buffer_,
next_line_size_, delim_);
}
}
}
void update() {
2021-01-10 23:51:20 +01:00
std::swap(buffer_, next_line_buffer_);
std::swap(buffer_size_, next_line_buffer_size_);
2021-01-10 23:51:20 +01:00
std::swap(converter_, next_line_converter_);
}
[[nodiscard]] bool multiline_limit_reached(size_t& limit) {
if constexpr (multiline::size > 0) {
if (limit++ >= multiline::size) {
2023-08-05 11:45:31 +02:00
next_line_converter_.handle_error_multiline_limit_reached();
return true;
}
}
return false;
}
// Tells whether the end of line following the first `size` characters of
// next_line_buffer_ is escaped, i.e. whether the line ends in an odd number
// of escape characters (an even number means the escapes escape each other).
//
// The trailing escapes are counted by index, so no pointer before the start
// of the buffer is ever formed, not even for size == 0.
[[nodiscard]] bool escaped_eol(size_t size) {
    size_t trailing_escapes = 0;
    while (trailing_escapes < size &&
           setup<Options...>::escape::match(
               next_line_buffer_[size - 1 - trailing_escapes])) {
        ++trailing_escapes;
    }
    return trailing_escapes % 2 == 1;
}
// Forwards to next_line_converter_: true if the last split of the next line
// reported an unterminated quote.
[[nodiscard]] bool unterminated_quote() {
    const bool open_quote = next_line_converter_.unterminated_quote();
    return open_quote;
}
// Puts back the end of line that remove_eol stripped, provided the buffer
// has room for it: "\r\n" if the line ended in CRLF and two characters fit,
// otherwise "\n" if one character fits. `line_size` grows by the number of
// characters written; no terminator is written.
void undo_remove_eol(char* buffer, size_t& line_size, size_t buffer_size) {
    const bool fits_crlf = buffer_size >= line_size + 2;
    if (crlf_ && fits_crlf) {
        buffer[line_size] = '\r';
        buffer[line_size + 1] = '\n';
        line_size += 2;
        return;
    }

    if (buffer_size > line_size) {
        buffer[line_size] = '\n';
        line_size += 1;
    }
}
// Strips a trailing "\n" or "\r\n" from the first `ssize` characters of
// `buffer` by writing a terminator in its place, and returns the remaining
// length. crlf_ records whether the stripped end of line was "\r\n".
//
// An empty line (ssize == 0) has no end of line and is returned unchanged;
// the check also keeps buffer[ssize - 1] from reading before the buffer.
[[nodiscard]] size_t remove_eol(char*& buffer, size_t ssize) {
    if (ssize == 0 || buffer[ssize - 1] != '\n') {
        crlf_ = false;
        return ssize;
    }

    size_t size = ssize - 1;
    if (ssize >= 2 && buffer[ssize - 2] == '\r') {
        crlf_ = true;
        size--;
    } else {
        crlf_ = false;
    }

    buffer[size] = '\0';
    return size;
}
void realloc_concat(char*& first, size_t& first_size,
size_t& buffer_size, const char* const second,
size_t second_size) {
buffer_size = first_size + second_size + 3;
auto* new_first = static_cast<char*>(
2024-02-25 10:42:11 +01:00
strict_realloc(static_cast<void*>(first), buffer_size));
2023-08-05 12:05:17 +02:00
first = new_first;
std::copy_n(second, second_size + 1, first + first_size);
first_size += second_size;
}
// Joins the following physical line onto `buffer`: the stripped end of line
// is restored first, then the line is read into helper_buffer_, stripped of
// its own end of line and appended. Returns false if the input has no
// further line; the end of line has already been restored at that point.
[[nodiscard]] bool append_next_line_to_buffer(char*& buffer,
                                              size_t& line_size,
                                              size_t buffer_size) {
    undo_remove_eol(buffer, line_size, buffer_size);

    chars_read_ = curr_char_;
    auto [raw_size, at_end] =
        get_line(helper_buffer_, helper_buffer_size, file_, csv_data_buffer_,
                 csv_data_size_, curr_char_);

    if (!at_end) {
        ++line_number_;
        const size_t stripped_size = remove_eol(helper_buffer_, raw_size);
        // the new capacity is written to next_line_buffer_size_
        realloc_concat(buffer, line_size, next_line_buffer_size_,
                       helper_buffer_, stripped_size);
        return true;
    }

    return false;
}
// Returns a copy of the first next_line_size_ characters of the buffered
// next line.
[[nodiscard]] std::string get_buffer() {
    std::string line{next_line_buffer_, next_line_size_};
    return line;
}
////////////////
// members
////////////////
// Line buffers, released with std::free in the destructor. buffer_ and
// next_line_buffer_ are swapped by update(); helper_buffer_ receives the
// lines that are appended for multiline records.
char* buffer_{nullptr};
char* next_line_buffer_{nullptr};
char* helper_buffer_{nullptr};
// Converters for the current and for the buffered next line, swapped
// together with the buffers.
converter<Options...> converter_;
converter<Options...> next_line_converter_;
// Sizes belonging to the three buffers above, passed to get_line along with
// them.
size_t buffer_size_{0};
size_t next_line_buffer_size_{0};
size_t helper_buffer_size{0};
std::string delim_;
// Input source: either an opened file (closed in the destructor) or a data
// buffer supplied by the caller (not freed by the reader).
FILE* file_{nullptr};
const char* csv_data_buffer_{nullptr};
size_t csv_data_size_{0};
// Passed to get_line by reference; copied into chars_read_ before each read.
size_t curr_char_{0};
2023-07-13 22:29:49 +02:00
bool crlf_{false};
size_t line_number_{0};
size_t chars_read_{0};
size_t next_line_size_{0};
};
////////////////
// members
////////////////
std::string file_name_;
error_type error_{};
reader reader_;
2022-02-27 19:40:23 +01:00
std::vector<std::string> header_;
2023-08-05 13:30:14 +02:00
std::string raw_header_;
bool eof_{false};
};
} /* namespace ss */