!3753 CSV dataset reading a file that begins with a blank line will crash

Merge pull request !3753 from jiangzhiwen/dataset/csv_fix_blank_line_header
This commit is contained in:
mindspore-ci-bot 2020-08-13 16:41:16 +08:00 committed by Gitee
commit 9cc1ca1a19
2 changed files with 35 additions and 15 deletions

View File

@ -142,6 +142,11 @@ int CsvOp::CsvParser::put_row(char c) {
return ret;
}
if (cur_col_ != column_default_.size()) {
err_message_ = "The number of columns does not match the definition.";
return -1;
}
total_rows_++;
cur_row_++;
cur_col_ = 0;
@ -159,8 +164,12 @@ int CsvOp::CsvParser::put_row(char c) {
int CsvOp::CsvParser::end_file(char c) {
if (cur_col_ > 0) {
put_row(c);
int ret = put_row(c);
if (ret < 0) {
return ret;
}
}
if (cur_row_ > 0) {
cur_buffer_->set_tensor_table(std::move(tensor_table_));
buffer_connector_->Add(worker_id_, std::move(cur_buffer_));
@ -190,16 +199,16 @@ Status CsvOp::CsvParser::initCsvParser() {
// State diagram for counting rows
sdl = {// START_OF_FILE
// ┌───────────┬───────────┬─────────────
// │ abc │ " │ \n │
// ├───────────┼───────────┼─────────────
// │ UNQUOTE │ QUOTE │ END_OF_LINE │
// ├───────────┼───────────┼─────────────
// | null_func │ null_func │ null_func │
// └───────────┴───────────┴─────────────
// ┌───────────┬───────────┬───────────────
// │ abc │ " │ \n │
// ├───────────┼───────────┼───────────────
// │ UNQUOTE │ QUOTE │ START_OF_FILE │
// ├───────────┼───────────┼───────────────
// | null_func │ null_func │ null_func │
// └───────────┴───────────┴───────────────
{{State::START_OF_FILE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
{{State::START_OF_FILE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}},
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
// UNQUOTE
// ┌───────────┬───────────┬─────────────┐
@ -254,7 +263,7 @@ Status CsvOp::CsvParser::initCsvParser() {
// ┌───────────┬──────────┬──────────┬────────────────┬────────────────┐
// │ abc │ , │ " │ \n │ EOF │
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
// │ UNQUOTE │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │
// │ UNQUOTE │ DELIM │ QUOTE │ START_OF_FILE │ END_OF_FILE │
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
// | lambda │ lambda │ lambda │ null_func │ null_func │
// └───────────┴──────────┴──────────┴────────────────┴────────────────┘
@ -282,7 +291,7 @@ Status CsvOp::CsvParser::initCsvParser() {
this->pos_ = 0;
return 0;
}}},
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}},
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
{{State::START_OF_FILE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::null_func}},
// UNQUOTE
@ -683,7 +692,7 @@ Status CsvOp::CalculateNumRowsPerShard() {
}
if (all_num_rows_ == 0) {
RETURN_STATUS_UNEXPECTED(
"There is no valid data matching the dataset API CsvDataset. Please check file path or dataset API "
"There is no valid data matching the dataset API CsvDataset. Please check file path or CSV format "
"validation first.");
}
@ -756,6 +765,8 @@ Status CsvOp::ComputeColMap() {
getline(handle, line);
std::vector<std::string> col_names = split(line, field_delim_);
for (int32_t i = 0; i < col_names.size(); i++) {
// consider the case of CRLF
col_names[i].erase(col_names[i].find_last_not_of('\r') + 1);
column_name_id_map_[col_names[i]] = i;
}
} else {

View File

@ -77,7 +77,7 @@ class CsvOp : public ParallelOp {
total_rows_(0),
start_offset_(0),
end_offset_(std::numeric_limits<int64_t>::max()),
err_message_("unkonw") {
err_message_("unknown") {
cur_buffer_ = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone);
initCsvParser();
}
@ -101,8 +101,9 @@ class CsvOp : public ParallelOp {
if (it == sd.end()) {
return -1;
}
int ret = it->second.second(*this, static_cast<char>(c));
cur_state_ = it->second.first;
return it->second.second(*this, c);
return ret;
}
int countRows(int c);
@ -169,7 +170,13 @@ class CsvOp : public ParallelOp {
}
// Error action of the CSV parser: logs a syntax error, stores a more specific
// description in err_message_ for the recognized (message, state) combinations,
// and reports failure.
// NOTE(review): presumably registered as the state-machine action for invalid
// transitions - confirm against initCsvParser().
// @param c - the character being processed; classified through getMessage()
// @return - always -1
int catch_exception(char c) {
MS_LOG(ERROR) << "Invalid syntax!";
// Only the three combinations below get a dedicated message; for any other
// combination err_message_ keeps whatever value it already had.
if (getMessage(c) == Message::MS_QUOTE && cur_state_ == State::UNQUOTE) {
// A quote character arrived while the parser was in the UNQUOTE state.
err_message_ = "Invalid quote in unquote field.";
} else if (getMessage(c) == Message::MS_END_OF_FILE && cur_state_ == State::QUOTE) {
// End of file arrived while the parser was still in the QUOTE state.
err_message_ = "Reach the end of file in quote field.";
} else if (getMessage(c) == Message::MS_NORMAL && cur_state_ == State::SECOND_QUOTE) {
// An ordinary character arrived while the parser was in the SECOND_QUOTE state.
err_message_ = "Receive unquote char in quote field.";
}
return -1;
}
@ -425,6 +432,8 @@ class CsvOp : public ParallelOp {
Status ComputeColMap() override;
// Split string based on a character delimiter
// @param s - the input string
// @param delim - the delimiter
// @return - a string vector
std::vector<std::string> split(const std::string &s, char delim);