forked from mindspore-Ecosystem/mindspore
!3753 CSV dataset reading a file that begins with a blank line will crash
Merge pull request !3753 from jiangzhiwen/dataset/csv_fix_blank_line_header
This commit is contained in:
commit
9cc1ca1a19
|
@ -142,6 +142,11 @@ int CsvOp::CsvParser::put_row(char c) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
if (cur_col_ != column_default_.size()) {
|
||||
err_message_ = "The number of columns does not match the definition.";
|
||||
return -1;
|
||||
}
|
||||
|
||||
total_rows_++;
|
||||
cur_row_++;
|
||||
cur_col_ = 0;
|
||||
|
@ -159,8 +164,12 @@ int CsvOp::CsvParser::put_row(char c) {
|
|||
|
||||
int CsvOp::CsvParser::end_file(char c) {
|
||||
if (cur_col_ > 0) {
|
||||
put_row(c);
|
||||
int ret = put_row(c);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
if (cur_row_ > 0) {
|
||||
cur_buffer_->set_tensor_table(std::move(tensor_table_));
|
||||
buffer_connector_->Add(worker_id_, std::move(cur_buffer_));
|
||||
|
@ -190,16 +199,16 @@ Status CsvOp::CsvParser::initCsvParser() {
|
|||
|
||||
// State diagram for counting rows
|
||||
sdl = {// START_OF_FILE
|
||||
// ┌───────────┬───────────┬─────────────┐
|
||||
// ┌───────────┬───────────┬───────────────┐
|
||||
// │ abc │ " │ \n │
|
||||
// ├───────────┼───────────┼─────────────┤
|
||||
// │ UNQUOTE │ QUOTE │ END_OF_LINE │
|
||||
// ├───────────┼───────────┼─────────────┤
|
||||
// ├───────────┼───────────┼───────────────┤
|
||||
// │ UNQUOTE │ QUOTE │ START_OF_FILE │
|
||||
// ├───────────┼───────────┼───────────────┤
|
||||
// │ null_func │ null_func │ null_func │
|
||||
// └───────────┴───────────┴─────────────┘
|
||||
// └───────────┴───────────┴───────────────┘
|
||||
{{State::START_OF_FILE, Message::MS_NORMAL}, {State::UNQUOTE, &CsvParser::null_func}},
|
||||
{{State::START_OF_FILE, Message::MS_QUOTE}, {State::QUOTE, &CsvParser::null_func}},
|
||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}},
|
||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
|
||||
|
||||
// UNQUOTE
|
||||
// ┌───────────┬───────────┬─────────────┐
|
||||
|
@ -254,7 +263,7 @@ Status CsvOp::CsvParser::initCsvParser() {
|
|||
// ┌───────────┬──────────┬──────────┬────────────────┬────────────────┐
|
||||
// │ abc │ , │ " │ \n │ EOF │
|
||||
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
|
||||
// │ UNQUOTE │ DELIM │ QUOTE │ END_OF_LINE │ END_OF_FILE │
|
||||
// │ UNQUOTE │ DELIM │ QUOTE │ START_OF_FILE │ END_OF_FILE │
|
||||
// ├───────────┼──────────┼──────────┼────────────────┼────────────────┤
|
||||
// │ lambda │ lambda │ lambda │ null_func │ null_func │
|
||||
// └───────────┴──────────┴──────────┴────────────────┴────────────────┘
|
||||
|
@ -282,7 +291,7 @@ Status CsvOp::CsvParser::initCsvParser() {
|
|||
this->pos_ = 0;
|
||||
return 0;
|
||||
}}},
|
||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::END_OF_LINE, &CsvParser::null_func}},
|
||||
{{State::START_OF_FILE, Message::MS_END_OF_LINE}, {State::START_OF_FILE, &CsvParser::null_func}},
|
||||
{{State::START_OF_FILE, Message::MS_END_OF_FILE}, {State::END_OF_FILE, &CsvParser::null_func}},
|
||||
|
||||
// UNQUOTE
|
||||
|
@ -683,7 +692,7 @@ Status CsvOp::CalculateNumRowsPerShard() {
|
|||
}
|
||||
if (all_num_rows_ == 0) {
|
||||
RETURN_STATUS_UNEXPECTED(
|
||||
"There is no valid data matching the dataset API CsvDataset. Please check file path or dataset API "
|
||||
"There is no valid data matching the dataset API CsvDataset. Please check file path or CSV format "
|
||||
"validation first.");
|
||||
}
|
||||
|
||||
|
@ -756,6 +765,8 @@ Status CsvOp::ComputeColMap() {
|
|||
getline(handle, line);
|
||||
std::vector<std::string> col_names = split(line, field_delim_);
|
||||
for (int32_t i = 0; i < col_names.size(); i++) {
|
||||
// consider the case of CRLF
|
||||
col_names[i].erase(col_names[i].find_last_not_of('\r') + 1);
|
||||
column_name_id_map_[col_names[i]] = i;
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -77,7 +77,7 @@ class CsvOp : public ParallelOp {
|
|||
total_rows_(0),
|
||||
start_offset_(0),
|
||||
end_offset_(std::numeric_limits<int64_t>::max()),
|
||||
err_message_("unkonw") {
|
||||
err_message_("unknown") {
|
||||
cur_buffer_ = std::make_unique<DataBuffer>(0, DataBuffer::BufferFlags::kDeBFlagNone);
|
||||
initCsvParser();
|
||||
}
|
||||
|
@ -101,8 +101,9 @@ class CsvOp : public ParallelOp {
|
|||
if (it == sd.end()) {
|
||||
return -1;
|
||||
}
|
||||
int ret = it->second.second(*this, static_cast<char>(c));
|
||||
cur_state_ = it->second.first;
|
||||
return it->second.second(*this, c);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int countRows(int c);
|
||||
|
@ -169,7 +170,13 @@ class CsvOp : public ParallelOp {
|
|||
}
|
||||
|
||||
int catch_exception(char c) {
|
||||
MS_LOG(ERROR) << "Invalid syntax!";
|
||||
if (getMessage(c) == Message::MS_QUOTE && cur_state_ == State::UNQUOTE) {
|
||||
err_message_ = "Invalid quote in unquote field.";
|
||||
} else if (getMessage(c) == Message::MS_END_OF_FILE && cur_state_ == State::QUOTE) {
|
||||
err_message_ = "Reach the end of file in quote field.";
|
||||
} else if (getMessage(c) == Message::MS_NORMAL && cur_state_ == State::SECOND_QUOTE) {
|
||||
err_message_ = "Receive unquote char in quote field.";
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -425,6 +432,8 @@ class CsvOp : public ParallelOp {
|
|||
Status ComputeColMap() override;
|
||||
|
||||
// Split string based on a character delimiter
|
||||
// @param s - the input string
|
||||
// @param delim - the delimiter
|
||||
// @return - a string vector
|
||||
std::vector<std::string> split(const std::string &s, char delim);
|
||||
|
||||
|
|
Loading…
Reference in New Issue