forked from mindspore-Ecosystem/mindspore
!5496 Change the argument type of CSV state function from char to int
Merge pull request !5496 from jiangzhiwen/bug/csv_char_to_int
This commit is contained in:
commit
6e452d829d
|
@ -97,7 +97,67 @@ Status CsvOp::Init() {
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
int CsvOp::CsvParser::put_record(char c) {
|
CsvOp::CsvParser::CsvParser(int32_t worker_id, std::shared_ptr<JaggedConnector> connector, int64_t rows_per_buffer,
|
||||||
|
char field_delim, std::vector<std::shared_ptr<CsvOp::BaseRecord>> column_default)
|
||||||
|
: worker_id_(worker_id),
|
||||||
|
buffer_connector_(connector),
|
||||||
|
csv_rows_per_buffer_(rows_per_buffer),
|
||||||
|
csv_field_delim_(field_delim),
|
||||||
|
column_default_(column_default),
|
||||||
|
cur_state_(START_OF_FILE),
|
||||||
|
pos_(0),
|
||||||
|
cur_row_(0),
|
||||||
|
cur_col_(0),
|
||||||
|
total_rows_(0),
|
||||||
|
start_offset_(0),
|
||||||
|
end_offset_(std::numeric_limits<int64_t>::max()),
|
||||||
|
err_message_("unknown") {
|
||||||
|
cur_buffer_ = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone);
|
||||||
|
InitCsvParser();
|
||||||
|
}
|
||||||
|
|
||||||
|
void CsvOp::CsvParser::Reset() {
|
||||||
|
cur_state_ = START_OF_FILE;
|
||||||
|
pos_ = 0;
|
||||||
|
cur_row_ = 0;
|
||||||
|
cur_col_ = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
CsvOp::CsvParser::Message CsvOp::CsvParser::GetMessage(int c) {
|
||||||
|
if (c == csv_field_delim_) {
|
||||||
|
return Message::MS_DELIM;
|
||||||
|
} else if (c == '"') {
|
||||||
|
return Message::MS_QUOTE;
|
||||||
|
} else if (c == '\r' || c == '\n') {
|
||||||
|
return Message::MS_END_OF_LINE;
|
||||||
|
} else if (c == std::char_traits<char>::eof()) {
|
||||||
|
return Message::MS_END_OF_FILE;
|
||||||
|
} else {
|
||||||
|
return Message::MS_NORMAL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int CsvOp::CsvParser::ProcessMessage(int c) {
|
||||||
|
Message m = GetMessage(c);
|
||||||
|
StateDiagram::iterator it = sd.find({cur_state_, m});
|
||||||
|
if (it == sd.end()) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
int ret = it->second.second(*this, c);
|
||||||
|
cur_state_ = it->second.first;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int CsvOp::CsvParser::PutChar(int c) {
|
||||||
|
if (pos_ >= str_buf_.size()) {
|
||||||
|
str_buf_.resize(str_buf_.size() * 2);
|
||||||
|
}
|
||||||
|
str_buf_[pos_] = c;
|
||||||
|
pos_++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int CsvOp::CsvParser::PutRecord(int c) {
|
||||||
std::string s = std::string(str_buf_.begin(), str_buf_.begin() + pos_);
|
std::string s = std::string(str_buf_.begin(), str_buf_.begin() + pos_);
|
||||||
std::shared_ptr<Tensor> t;
|
std::shared_ptr<Tensor> t;
|
||||||
if (cur_col_ >= column_default_.size()) {
|
if (cur_col_ >= column_default_.size()) {
|
||||||
|
@ -125,7 +185,7 @@ int CsvOp::CsvParser::put_record(char c) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CsvOp::CsvParser::put_row(char c) {
|
int CsvOp::CsvParser::PutRow(int c) {
|
||||||
if (total_rows_ < start_offset_) {
|
if (total_rows_ < start_offset_) {
|
||||||
total_rows_++;
|
total_rows_++;
|
||||||
cur_col_ = 0;
|
cur_col_ = 0;
|
||||||
|
@ -137,7 +197,7 @@ int CsvOp::CsvParser::put_row(char c) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int ret = put_record(c);
|
int ret = PutRecord(c);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -162,9 +222,14 @@ int CsvOp::CsvParser::put_row(char c) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CsvOp::CsvParser::end_file(char c) {
|
int CsvOp::CsvParser::AddRow(int c) {
|
||||||
|
total_rows_++;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int CsvOp::CsvParser::EndFile(int c) {
|
||||||
if (cur_col_ > 0) {
|
if (cur_col_ > 0) {
|
||||||
int ret = put_row(c);
|
int ret = PutRow(c);
|
||||||
if (ret < 0) {
|
if (ret < 0) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -177,7 +242,18 @@ int CsvOp::CsvParser::end_file(char c) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int CsvOp::CsvParser::countRows(int c) {
|
int CsvOp::CsvParser::CatchException(int c) {
|
||||||
|
if (GetMessage(c) == Message::MS_QUOTE && cur_state_ == State::UNQUOTE) {
|
||||||
|
err_message_ = "Invalid quote in unquote field.";
|
||||||
|
} else if (GetMessage(c) == Message::MS_END_OF_FILE && cur_state_ == State::QUOTE) {
|
||||||
|
err_message_ = "Reach the end of file in quote field.";
|
||||||
|
} else if (GetMessage(c) == Message::MS_NORMAL && cur_state_ == State::SECOND_QUOTE) {
|
||||||
|
err_message_ = "Receive unquote char in quote field.";
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int CsvOp::CsvParser::CountRows(int c) {
|
||||||
Message m;
|
Message m;
|
||||||
if (c == '"') {
|
if (c == '"') {
|
||||||
m = Message::MS_QUOTE;
|
m = Message::MS_QUOTE;
|
||||||
|
@ -194,79 +270,79 @@ int CsvOp::CsvParser::countRows(int c) {
|
||||||
return it->second.second(*this, c);
|
return it->second.second(*this, c);
|
||||||
}
|
}
|
||||||
|
|
||||||
Status CsvOp::CsvParser::initCsvParser() {
|
Status CsvOp::CsvParser::InitCsvParser() {
|
||||||
str_buf_.resize(CSV_BUFFER_SIZE);
|
str_buf_.resize(CSV_BUFFER_SIZE);
|
||||||
|
|
||||||
// State diagram for counting rows
|
// State diagram for counting rows
|
||||||
sdl = {// START_OF_FILE
|
sdl = {// START_OF_FILE
|
||||||
// ┌───────────┬───────────┬───────────────┐
|
// |---------------------------------------|
|
||||||
// │ abc │ " │ \n │
|
// | abc | " | \n |
|
||||||
// ├───────────┼───────────┼───────────────┤
|
// |---------------------------------------|
|
||||||
// │ UNQUOTE │ QUOTE │ START_OF_FILE │
|
// | UNQUOTE | QUOTE | START_OF_FILE |
|
||||||
// ├───────────┼───────────┼───────────────┤
|
// |---------------------------------------|
|
||||||
// | null_func │ null_func │ null_func │
|
// | NullFunc | NullFunc | NullFunc |
|
||||||
// └───────────┴───────────┴───────────────┘
|
// |---------------------------------------|
|
||||||
{{State::START_OF_FILE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
|
{{State::START_OF_FILE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::START_OF_FILE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::START_OF_FILE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
|
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::NullFunc}},
|
||||||
|
|
||||||
// UNQUOTE
|
// UNQUOTE
|
||||||
// ┌───────────┬───────────┬─────────────┐
|
// |-------------------------------------|
|
||||||
// │ abc │ " │ \n │
|
// | abc | " | \n |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// │ UNQUOTE │ QUOTE │ END_OF_LINE │
|
// | UNQUOTE | QUOTE | END_OF_LINE |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// | null_func │ null_func │ add_row │
|
// | NullFunc | NullFunc | AddRow |
|
||||||
// └───────────┴───────────┴─────────────┘
|
// |-------------------------------------|
|
||||||
{{State::UNQUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
|
{{State::UNQUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::UNQUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::UNQUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::UNQUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::add_row}},
|
{{State::UNQUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::AddRow}},
|
||||||
|
|
||||||
// QUOTE
|
// QUOTE
|
||||||
// ┌───────────┬──────────────┬───────────┐
|
// |--------------------------------------|
|
||||||
// │ abc │ " │ \n │
|
// | abc | " | \n |
|
||||||
// ├───────────┼──────────────┼───────────┤
|
// |--------------------------------------|
|
||||||
// │ QUOTE │ SECOND_QUOTE │ QUOTE │
|
// | QUOTE | SECOND_QUOTE | QUOTE |
|
||||||
// ├───────────┼──────────────┼───────────┤
|
// |--------------------------------------|
|
||||||
// | null_func │ null_func │ null_func │
|
// | NullFunc | NullFunc | NullFunc |
|
||||||
// └───────────┴──────────────┴───────────┘
|
// |--------------------------------------|
|
||||||
{{State::QUOTE, Message::MS_NORMAL}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::QUOTE, Message::MS_NORMAL}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::QUOTE, Message::MS_QUOTE}, {State::SECOND_QUOTE, &CsvParser::null_func}},
|
{{State::QUOTE, Message::MS_QUOTE}, {State::SECOND_QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::QUOTE, Message::MS_END_OF_LINE}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::QUOTE, Message::MS_END_OF_LINE}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
|
|
||||||
// SECOND_QUOTE
|
// SECOND_QUOTE
|
||||||
// ┌───────────┬───────────┬─────────────┐
|
// |-------------------------------------|
|
||||||
// │ abc │ " │ \n │
|
// | abc | " | \n |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// │ UNQUOTE │ QUOTE │ END_OF_LINE │
|
// | UNQUOTE | QUOTE | END_OF_LINE |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// | null_func │ null_func │ add_row │
|
// | NullFunc | NullFunc | AddRow |
|
||||||
// └───────────┴───────────┴─────────────┘
|
// |-------------------------------------|
|
||||||
{{State::SECOND_QUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
|
{{State::SECOND_QUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::SECOND_QUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::SECOND_QUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::SECOND_QUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::add_row}},
|
{{State::SECOND_QUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::AddRow}},
|
||||||
|
|
||||||
// END_OF_LINE
|
// END_OF_LINE
|
||||||
// ┌───────────┬───────────┬─────────────┐
|
// |-------------------------------------|
|
||||||
// │ abc │ " │ \n │
|
// | abc | " | \n |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// │ UNQUOTE │ QUOTE │ END_OF_LINE │
|
// | UNQUOTE | QUOTE | END_OF_LINE |
|
||||||
// ├───────────┼───────────┼─────────────┤
|
// |-------------------------------------|
|
||||||
// | null_func │ null_func │ null_func │
|
// | NullFunc | NullFunc | NullFunc |
|
||||||
// └───────────┴───────────┴─────────────┘
|
// |-------------------------------------|
|
||||||
{{State::END_OF_LINE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
|
{{State::END_OF_LINE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::END_OF_LINE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
|
{{State::END_OF_LINE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::END_OF_LINE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}}};
|
{{State::END_OF_LINE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::NullFunc}}};
|
||||||
|
|
||||||
// State diagram for CSV parser
|
// State diagram for CSV parser
|
||||||
sd = {// START_OF_FILE
|
sd = {// START_OF_FILE
|
||||||
// ┌───────────┬──────────┬──────────┬────────────────┬────────────────┐
|
// |-------------------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// │ UNQUOTE │ DELIM │ QUOTE │ START_OF_FILE │ END_OF_FILE │
|
// | UNQUOTE | DELIM | QUOTE | START_OF_FILE | END_OF_FILE |
|
||||||
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// | lambda │ lambda │ lambda │ null_func │ null_func │
|
// | lambda | lambda | lambda | NullFunc | NullFunc |
|
||||||
// └───────────┴──────────┴──────────┴────────────────┴────────────────┘
|
// |-------------------------------------------------------------------|
|
||||||
{{State::START_OF_FILE, Message::MS_NORMAL},
|
{{State::START_OF_FILE, Message::MS_NORMAL},
|
||||||
{State::UNQUOTE,
|
{State::UNQUOTE,
|
||||||
[this](CsvParser &, char c) -> int {
|
[this](CsvParser &, char c) -> int {
|
||||||
|
@ -281,7 +357,7 @@ Status CsvOp::CsvParser::initCsvParser() {
|
||||||
[this](CsvParser &, char c) -> int {
|
[this](CsvParser &, char c) -> int {
|
||||||
this->tensor_table_ = std::make_unique<TensorQTable>();
|
this->tensor_table_ = std::make_unique<TensorQTable>();
|
||||||
this->tensor_table_->push_back(TensorRow(column_default_.size(), nullptr));
|
this->tensor_table_->push_back(TensorRow(column_default_.size(), nullptr));
|
||||||
return this->put_record(c);
|
return this->PutRecord(c);
|
||||||
}}},
|
}}},
|
||||||
{{State::START_OF_FILE, Message::MS_QUOTE},
|
{{State::START_OF_FILE, Message::MS_QUOTE},
|
||||||
{State::QUOTE,
|
{State::QUOTE,
|
||||||
|
@ -291,81 +367,81 @@ Status CsvOp::CsvParser::initCsvParser() {
|
||||||
this->pos_ = 0;
|
this->pos_ = 0;
|
||||||
return 0;
|
return 0;
|
||||||
}}},
|
}}},
|
||||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
|
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::NullFunc}},
|
||||||
{{State::START_OF_FILE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::null_func}},
|
{{State::START_OF_FILE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::NullFunc}},
|
||||||
|
|
||||||
// UNQUOTE
|
// UNQUOTE
|
||||||
// ┌───────────┬────────────┬───────────┬─────────────┬────────────────┐
|
// |-------------------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├───────────┼────────────┼───────────┼─────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// │ UNQUOTE │ DELIM │ EXCEPTION │ END_OF_LINE │ END_OF_FILE │
|
// | UNQUOTE | DELIM | EXCEPTION | END_OF_LINE | END_OF_FILE |
|
||||||
// ├───────────┼────────────┼───────────┼─────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// | put_char │ put_record │ exception │ put_row │ end_file │
|
// | PutChar | PutRecord | exception | PutRow | EndFile |
|
||||||
// └───────────┴────────────┴───────────┴─────────────┴────────────────┘
|
// |-------------------------------------------------------------------|
|
||||||
{{State::UNQUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::put_char}},
|
{{State::UNQUOTE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::PutChar}},
|
||||||
{{State::UNQUOTE, Message::MS_DELIM}, {State::DELIM, &CsvParser::put_record}},
|
{{State::UNQUOTE, Message::MS_DELIM}, {State::DELIM, &CsvParser::PutRecord}},
|
||||||
{{State::UNQUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::put_row}},
|
{{State::UNQUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::PutRow}},
|
||||||
{{State::UNQUOTE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::end_file}},
|
{{State::UNQUOTE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::EndFile}},
|
||||||
// UNQUOTE-Exception
|
// UNQUOTE-Exception
|
||||||
{{State::UNQUOTE, Message::MS_QUOTE}, {State::EXCEPTION, &CsvParser::catch_exception}},
|
{{State::UNQUOTE, Message::MS_QUOTE}, {State::EXCEPTION, &CsvParser::CatchException}},
|
||||||
|
|
||||||
// DELIM
|
// DELIM
|
||||||
// ┌───────────┬────────────┬───────────┬─────────────┬────────────────┐
|
// |-------------------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├───────────┼────────────┼───────────┼─────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// │ UNQUOTE │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │
|
// | UNQUOTE | DELIM | QUOTE | END_OF_LINE | END_OF_FILE |
|
||||||
// ├───────────┼────────────┼───────────┼─────────────┼────────────────┤
|
// |-------------------------------------------------------------------|
|
||||||
// | put_char │ put_record │ lambda │ put_row │ end_file │
|
// | PutChar | PutRecord | lambda | PutRow | EndFile |
|
||||||
// └───────────┴────────────┴───────────┴─────────────┴────────────────┘
|
// |-------------------------------------------------------------------|
|
||||||
{{State::DELIM, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::put_char}},
|
{{State::DELIM, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::PutChar}},
|
||||||
{{State::DELIM, Message::MS_DELIM}, {State::DELIM, &CsvParser::put_record}},
|
{{State::DELIM, Message::MS_DELIM}, {State::DELIM, &CsvParser::PutRecord}},
|
||||||
{{State::DELIM, Message::MS_QUOTE},
|
{{State::DELIM, Message::MS_QUOTE},
|
||||||
{State::QUOTE,
|
{State::QUOTE,
|
||||||
[this](CsvParser &, char c) -> int {
|
[this](CsvParser &, char c) -> int {
|
||||||
this->pos_ = 0;
|
this->pos_ = 0;
|
||||||
return 0;
|
return 0;
|
||||||
}}},
|
}}},
|
||||||
{{State::DELIM, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::put_row}},
|
{{State::DELIM, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::PutRow}},
|
||||||
{{State::DELIM, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::end_file}},
|
{{State::DELIM, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::EndFile}},
|
||||||
|
|
||||||
// QUOTE
|
// QUOTE
|
||||||
// ┌───────────┬──────────┬──────────────┬──────────┬────────────────┐
|
// |-----------------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├───────────┼──────────┼──────────────┼──────────┼────────────────┤
|
// |-----------------------------------------------------------------|
|
||||||
// │ QUOTE │ QUOTE │ SECOND_QUOTE │ QUOTE │ EXCEPTION │
|
// | QUOTE | QUOTE | SECOND_QUOTE | QUOTE | EXCEPTION |
|
||||||
// ├───────────┼──────────┼──────────────┼──────────┼────────────────┤
|
// |-----------------------------------------------------------------|
|
||||||
// | put_char │ put_char │ null_func │ put_char │ exception │
|
// | PutChar | PutChar | NullFunc | PutChar | exception |
|
||||||
// └───────────┴──────────┴──────────────┴──────────┴────────────────┘
|
// |-----------------------------------------------------------------|
|
||||||
{{State::QUOTE, Message::MS_NORMAL}, {State::QUOTE, &CsvParser::put_char}},
|
{{State::QUOTE, Message::MS_NORMAL}, {State::QUOTE, &CsvParser::PutChar}},
|
||||||
{{State::QUOTE, Message::MS_DELIM}, {State::QUOTE, &CsvParser::put_char}},
|
{{State::QUOTE, Message::MS_DELIM}, {State::QUOTE, &CsvParser::PutChar}},
|
||||||
{{State::QUOTE, Message::MS_QUOTE}, {State::SECOND_QUOTE, &CsvParser::null_func}},
|
{{State::QUOTE, Message::MS_QUOTE}, {State::SECOND_QUOTE, &CsvParser::NullFunc}},
|
||||||
{{State::QUOTE, Message::MS_END_OF_LINE}, {State::QUOTE, &CsvParser::put_char}},
|
{{State::QUOTE, Message::MS_END_OF_LINE}, {State::QUOTE, &CsvParser::PutChar}},
|
||||||
// QUOTE-Exception
|
// QUOTE-Exception
|
||||||
{{State::QUOTE, Message::MS_END_OF_FILE}, {State::EXCEPTION, &CsvParser::catch_exception}},
|
{{State::QUOTE, Message::MS_END_OF_FILE}, {State::EXCEPTION, &CsvParser::CatchException}},
|
||||||
|
|
||||||
// SECOND_QUOTE
|
// SECOND_QUOTE
|
||||||
// ┌───────────┬────────────┬──────────┬─────────────┬────────────────┐
|
// |------------------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├───────────┼────────────┼──────────┼─────────────┼────────────────┤
|
// |------------------------------------------------------------------|
|
||||||
// │ EXCEPTION │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │
|
// | EXCEPTION | DELIM | QUOTE | END_OF_LINE | END_OF_FILE |
|
||||||
// ├───────────┼────────────┼──────────┼─────────────┼────────────────┤
|
// |------------------------------------------------------------------|
|
||||||
// | exception │ put_record │ put_char │ put_row │ end_file │
|
// | exception | PutRecord | PutChar | PutRow | EndFile |
|
||||||
// └───────────┴────────────┴──────────┴─────────────┴────────────────┘
|
// |------------------------------------------------------------------|
|
||||||
{{State::SECOND_QUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::put_char}},
|
{{State::SECOND_QUOTE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::PutChar}},
|
||||||
{{State::SECOND_QUOTE, Message::MS_DELIM}, {State::DELIM, &CsvParser::put_record}},
|
{{State::SECOND_QUOTE, Message::MS_DELIM}, {State::DELIM, &CsvParser::PutRecord}},
|
||||||
{{State::SECOND_QUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::put_row}},
|
{{State::SECOND_QUOTE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::PutRow}},
|
||||||
{{State::SECOND_QUOTE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::end_file}},
|
{{State::SECOND_QUOTE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::EndFile}},
|
||||||
// SECOND_QUOTE-Exception
|
// SECOND_QUOTE-Exception
|
||||||
{{State::SECOND_QUOTE, Message::MS_NORMAL}, {State::EXCEPTION, &CsvParser::catch_exception}},
|
{{State::SECOND_QUOTE, Message::MS_NORMAL}, {State::EXCEPTION, &CsvParser::CatchException}},
|
||||||
|
|
||||||
// END_OF_LINE
|
// END_OF_LINE
|
||||||
// ┌─────────┬────────┬────────┬─────────────┬─────────────┐
|
// |-------------------------------------------------------|
|
||||||
// │ abc │ , │ " │ \n │ EOF │
|
// | abc | , | " | \n | EOF |
|
||||||
// ├─────────┼────────┼────────┼─────────────┼─────────────┤
|
// |-------------------------------------------------------|
|
||||||
// │ UNQUOTE │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │
|
// | UNQUOTE | DELIM | QUOTE | END_OF_LINE | END_OF_FILE |
|
||||||
// ├─────────┼────────┼────────┼─────────────┼─────────────┤
|
// |-------------------------------------------------------|
|
||||||
// | lambda │ lambda │ lambda │ null_func │ end_file │
|
// | lambda | lambda | lambda | NullFunc | EndFile |
|
||||||
// └─────────┴────────┴────────┴─────────────┴─────────────┘
|
// |-------------------------------------------------------|
|
||||||
{{State::END_OF_LINE, Message::MS_NORMAL},
|
{{State::END_OF_LINE, Message::MS_NORMAL},
|
||||||
{State::UNQUOTE,
|
{State::UNQUOTE,
|
||||||
[this](CsvParser &, char c) -> int {
|
[this](CsvParser &, char c) -> int {
|
||||||
|
@ -382,7 +458,7 @@ Status CsvOp::CsvParser::initCsvParser() {
|
||||||
if (this->total_rows_ > this->start_offset_ && this->total_rows_ <= this->end_offset_) {
|
if (this->total_rows_ > this->start_offset_ && this->total_rows_ <= this->end_offset_) {
|
||||||
this->tensor_table_->push_back(TensorRow(column_default_.size(), nullptr));
|
this->tensor_table_->push_back(TensorRow(column_default_.size(), nullptr));
|
||||||
}
|
}
|
||||||
return this->put_record(c);
|
return this->PutRecord(c);
|
||||||
}}},
|
}}},
|
||||||
{{State::END_OF_LINE, Message::MS_QUOTE},
|
{{State::END_OF_LINE, Message::MS_QUOTE},
|
||||||
{State::QUOTE,
|
{State::QUOTE,
|
||||||
|
@ -392,8 +468,8 @@ Status CsvOp::CsvParser::initCsvParser() {
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}}},
|
}}},
|
||||||
{{State::END_OF_LINE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}},
|
{{State::END_OF_LINE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::NullFunc}},
|
||||||
{{State::END_OF_LINE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::end_file}}};
|
{{State::END_OF_LINE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::EndFile}}};
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -409,8 +485,8 @@ Status CsvOp::Reset() {
|
||||||
Status CsvOp::LoadFile(const std::string &file, const int64_t start_offset, const int64_t end_offset,
|
Status CsvOp::LoadFile(const std::string &file, const int64_t start_offset, const int64_t end_offset,
|
||||||
const int32_t worker_id) {
|
const int32_t worker_id) {
|
||||||
CsvParser csv_parser(worker_id, jagged_buffer_connector_, rows_per_buffer_, field_delim_, column_default_list_);
|
CsvParser csv_parser(worker_id, jagged_buffer_connector_, rows_per_buffer_, field_delim_, column_default_list_);
|
||||||
csv_parser.setStartOffset(start_offset);
|
csv_parser.SetStartOffset(start_offset);
|
||||||
csv_parser.setEndOffset(end_offset);
|
csv_parser.SetEndOffset(end_offset);
|
||||||
std::ifstream ifs;
|
std::ifstream ifs;
|
||||||
ifs.open(file, std::ifstream::in);
|
ifs.open(file, std::ifstream::in);
|
||||||
if (column_name_list_.empty()) {
|
if (column_name_list_.empty()) {
|
||||||
|
@ -424,16 +500,16 @@ Status CsvOp::LoadFile(const std::string &file, const int64_t start_offset, cons
|
||||||
// which is a 32-bit -1, it's not equal to the 8-bit -1 on Euler OS. So instead of char, we use
|
// which is a 32-bit -1, it's not equal to the 8-bit -1 on Euler OS. So instead of char, we use
|
||||||
// int to receive its return value.
|
// int to receive its return value.
|
||||||
int chr = ifs.get();
|
int chr = ifs.get();
|
||||||
if (csv_parser.processMessage(chr) != 0) {
|
if (csv_parser.ProcessMessage(chr) != 0) {
|
||||||
RETURN_STATUS_UNEXPECTED("Failed to parse file " + file + ":" + std::to_string(csv_parser.total_rows_ + 1) +
|
RETURN_STATUS_UNEXPECTED("Failed to parse file " + file + ":" + std::to_string(csv_parser.GetTotalRows() + 1) +
|
||||||
". error message: " + csv_parser.err_message_);
|
". error message: " + csv_parser.GetErrorMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (std::invalid_argument &ia) {
|
} catch (std::invalid_argument &ia) {
|
||||||
std::string err_row = std::to_string(csv_parser.total_rows_ + 1);
|
std::string err_row = std::to_string(csv_parser.GetTotalRows() + 1);
|
||||||
RETURN_STATUS_UNEXPECTED(file + ":" + err_row + ", type does not match");
|
RETURN_STATUS_UNEXPECTED(file + ":" + err_row + ", type does not match");
|
||||||
} catch (std::out_of_range &oor) {
|
} catch (std::out_of_range &oor) {
|
||||||
std::string err_row = std::to_string(csv_parser.total_rows_ + 1);
|
std::string err_row = std::to_string(csv_parser.GetTotalRows() + 1);
|
||||||
RETURN_STATUS_UNEXPECTED(file + ":" + err_row + ", out of range");
|
RETURN_STATUS_UNEXPECTED(file + ":" + err_row + ", out of range");
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
|
@ -712,12 +788,12 @@ int64_t CsvOp::CountTotalRows(const std::string &file) {
|
||||||
csv_parser.Reset();
|
csv_parser.Reset();
|
||||||
while (ifs.good()) {
|
while (ifs.good()) {
|
||||||
int chr = ifs.get();
|
int chr = ifs.get();
|
||||||
if (csv_parser.countRows(chr) != 0) {
|
if (csv_parser.CountRows(chr) != 0) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return csv_parser.total_rows_;
|
return csv_parser.GetTotalRows();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pushes a control indicator onto the IOBlockQueue for each worker to consume.
|
// Pushes a control indicator onto the IOBlockQueue for each worker to consume.
|
||||||
|
|
|
@ -64,52 +64,27 @@ class CsvOp : public ParallelOp {
|
||||||
CsvParser() = delete;
|
CsvParser() = delete;
|
||||||
|
|
||||||
CsvParser(int32_t worker_id, std::shared_ptr<JaggedConnector> connector, int64_t rows_per_buffer, char field_delim,
|
CsvParser(int32_t worker_id, std::shared_ptr<JaggedConnector> connector, int64_t rows_per_buffer, char field_delim,
|
||||||
std::vector<std::shared_ptr<CsvOp::BaseRecord>> column_default)
|
std::vector<std::shared_ptr<CsvOp::BaseRecord>> column_default);
|
||||||
: worker_id_(worker_id),
|
|
||||||
buffer_connector_(connector),
|
|
||||||
csv_rows_per_buffer_(rows_per_buffer),
|
|
||||||
csv_field_delim_(field_delim),
|
|
||||||
column_default_(column_default),
|
|
||||||
cur_state_(START_OF_FILE),
|
|
||||||
pos_(0),
|
|
||||||
cur_row_(0),
|
|
||||||
cur_col_(0),
|
|
||||||
total_rows_(0),
|
|
||||||
start_offset_(0),
|
|
||||||
end_offset_(std::numeric_limits<int64_t>::max()),
|
|
||||||
err_message_("unknown") {
|
|
||||||
cur_buffer_ = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone);
|
|
||||||
initCsvParser();
|
|
||||||
}
|
|
||||||
|
|
||||||
~CsvParser() = default;
|
~CsvParser() = default;
|
||||||
|
|
||||||
void Reset() {
|
void Reset();
|
||||||
cur_state_ = START_OF_FILE;
|
|
||||||
pos_ = 0;
|
|
||||||
cur_row_ = 0;
|
|
||||||
cur_col_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
void setStartOffset(int64_t start_offset) { start_offset_ = start_offset; }
|
void SetStartOffset(int64_t start_offset) { start_offset_ = start_offset; }
|
||||||
|
|
||||||
void setEndOffset(int64_t end_offset) { end_offset_ = end_offset; }
|
void SetEndOffset(int64_t end_offset) { end_offset_ = end_offset; }
|
||||||
|
|
||||||
int processMessage(int c) {
|
int ProcessMessage(int c);
|
||||||
Message m = getMessage(c);
|
|
||||||
StateDiagram::iterator it = sd.find({cur_state_, m});
|
|
||||||
if (it == sd.end()) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
int ret = it->second.second(*this, static_cast<char>(c));
|
|
||||||
cur_state_ = it->second.first;
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
int countRows(int c);
|
int CountRows(int c);
|
||||||
|
|
||||||
Status initCsvParser();
|
Status InitCsvParser();
|
||||||
|
|
||||||
|
int64_t GetTotalRows() { return total_rows_; }
|
||||||
|
|
||||||
|
std::string GetErrorMessage() { return err_message_; }
|
||||||
|
|
||||||
|
private:
|
||||||
enum State : uint8_t {
|
enum State : uint8_t {
|
||||||
START_OF_FILE = 0,
|
START_OF_FILE = 0,
|
||||||
UNQUOTE,
|
UNQUOTE,
|
||||||
|
@ -130,55 +105,24 @@ class CsvOp : public ParallelOp {
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef std::pair<State, Message> StateMessagePair;
|
typedef std::pair<State, Message> StateMessagePair;
|
||||||
typedef std::pair<State, std::function<int(CsvParser &, char)>> StateActionPair;
|
typedef std::pair<State, std::function<int(CsvParser &, int)>> StateActionPair;
|
||||||
typedef std::map<StateMessagePair, StateActionPair> StateDiagram;
|
typedef std::map<StateMessagePair, StateActionPair> StateDiagram;
|
||||||
|
|
||||||
Message getMessage(int c) {
|
Message GetMessage(int c);
|
||||||
if (c == csv_field_delim_) {
|
|
||||||
return Message::MS_DELIM;
|
|
||||||
} else if (c == '"') {
|
|
||||||
return Message::MS_QUOTE;
|
|
||||||
} else if (c == '\r' || c == '\n') {
|
|
||||||
return Message::MS_END_OF_LINE;
|
|
||||||
} else if (c == std::char_traits<char>::eof()) {
|
|
||||||
return Message::MS_END_OF_FILE;
|
|
||||||
} else {
|
|
||||||
return Message::MS_NORMAL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int null_func(char c) { return 0; }
|
int NullFunc(int c) { return 0; }
|
||||||
|
|
||||||
int put_char(char c) {
|
int PutChar(int c);
|
||||||
if (pos_ >= str_buf_.size()) {
|
|
||||||
str_buf_.resize(str_buf_.size() * 2);
|
|
||||||
}
|
|
||||||
str_buf_[pos_] = c;
|
|
||||||
pos_++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int put_record(char c);
|
int PutRecord(int c);
|
||||||
|
|
||||||
int put_row(char c);
|
int PutRow(int c);
|
||||||
|
|
||||||
int end_file(char c);
|
int EndFile(int c);
|
||||||
|
|
||||||
int add_row(char c) {
|
int AddRow(int c);
|
||||||
total_rows_++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int catch_exception(char c) {
|
int CatchException(int c);
|
||||||
if (getMessage(c) == Message::MS_QUOTE && cur_state_ == State::UNQUOTE) {
|
|
||||||
err_message_ = "Invalid quote in unquote field.";
|
|
||||||
} else if (getMessage(c) == Message::MS_END_OF_FILE && cur_state_ == State::QUOTE) {
|
|
||||||
err_message_ = "Reach the end of file in quote field.";
|
|
||||||
} else if (getMessage(c) == Message::MS_NORMAL && cur_state_ == State::SECOND_QUOTE) {
|
|
||||||
err_message_ = "Receive unquote char in quote field.";
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t worker_id_;
|
int32_t worker_id_;
|
||||||
std::shared_ptr<JaggedConnector> buffer_connector_;
|
std::shared_ptr<JaggedConnector> buffer_connector_;
|
||||||
|
@ -401,8 +345,8 @@ class CsvOp : public ParallelOp {
|
||||||
|
|
||||||
// Select file and push it to the block queue.
|
// Select file and push it to the block queue.
|
||||||
// @param file_name - File name.
|
// @param file_name - File name.
|
||||||
// @param start_file - If file contains the first sample of data.
|
// @param start_offset - If file contains the first sample of data.
|
||||||
// @param end_file - If file contains the end sample of data.
|
// @param end_offset - If file contains the end sample of data.
|
||||||
// @param pre_count - Total rows of previous files.
|
// @param pre_count - Total rows of previous files.
|
||||||
// @return Status - the error code returned.
|
// @return Status - the error code returned.
|
||||||
bool NeedPushFileToBlockQueue(const std::string &file_name, int64_t *start_offset, int64_t *end_offset,
|
bool NeedPushFileToBlockQueue(const std::string &file_name, int64_t *start_offset, int64_t *end_offset,
|
||||||
|
|
Loading…
Reference in New Issue