fix security check

This commit is contained in:
liyong 2021-07-20 17:19:02 +08:00
parent 31a4c3116e
commit e262b11901
3 changed files with 126 additions and 71 deletions

View File

@ -106,22 +106,27 @@ std::pair<MSRStatus, std::string> ShardIndexGenerator::GetValueByField(const str
std::string ShardIndexGenerator::TakeFieldType(const string &field_path, json schema) { std::string ShardIndexGenerator::TakeFieldType(const string &field_path, json schema) {
std::vector<std::string> field_name = StringSplit(field_path, kPoint); std::vector<std::string> field_name = StringSplit(field_path, kPoint);
for (uint64_t i = 0; i < field_name.size(); i++) { for (uint64_t i = 0; i < field_name.size(); i++) {
if (i != field_name.size() - 1) { try {
// Get type information from json schema if (i != field_name.size() - 1) {
schema = schema.at(field_name[i]); // Get type information from json schema
schema = schema.at("properties"); schema = schema.at(field_name[i]);
} else {
// standard root layer exist "properties" if type is "object"
if (schema.find("properties") != schema.end()) {
schema = schema.at("properties"); schema = schema.at("properties");
}
schema = schema.at(field_name[i]);
std::string field_type = schema.at("type").dump();
if (field_type.length() <= 2) {
return "";
} else { } else {
return field_type.substr(1, field_type.length() - 2); // standard root layer exist "properties" if type is "object"
if (schema.find("properties") != schema.end()) {
schema = schema.at("properties");
}
schema = schema.at(field_name[i]);
std::string field_type = schema.at("type").dump();
if (field_type.length() <= 2) {
return "";
} else {
return field_type.substr(1, field_type.length() - 2);
}
} }
} catch (...) {
MS_LOG(WARNING) << "Exception occurred while get field type.";
return "";
} }
} }
return ""; return "";
@ -330,6 +335,9 @@ MSRStatus ShardIndexGenerator::BindParameterExecuteSQL(
const std::vector<std::vector<std::tuple<std::string, std::string, std::string>>> &data) { const std::vector<std::vector<std::tuple<std::string, std::string, std::string>>> &data) {
sqlite3_stmt *stmt = nullptr; sqlite3_stmt *stmt = nullptr;
if (sqlite3_prepare_v2(db, common::SafeCStr(sql), -1, &stmt, 0) != SQLITE_OK) { if (sqlite3_prepare_v2(db, common::SafeCStr(sql), -1, &stmt, 0) != SQLITE_OK) {
if (stmt) {
(void)sqlite3_finalize(stmt);
}
MS_LOG(ERROR) << "SQL error: could not prepare statement, sql: " << sql; MS_LOG(ERROR) << "SQL error: could not prepare statement, sql: " << sql;
return FAILED; return FAILED;
} }
@ -342,29 +350,34 @@ MSRStatus ShardIndexGenerator::BindParameterExecuteSQL(
int index = sqlite3_bind_parameter_index(stmt, common::SafeCStr(place_holder)); int index = sqlite3_bind_parameter_index(stmt, common::SafeCStr(place_holder));
if (field_type == "INTEGER") { if (field_type == "INTEGER") {
if (sqlite3_bind_int64(stmt, index, std::stoll(field_value)) != SQLITE_OK) { if (sqlite3_bind_int64(stmt, index, std::stoll(field_value)) != SQLITE_OK) {
(void)sqlite3_finalize(stmt);
MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index
<< ", field value: " << std::stoll(field_value); << ", field value: " << std::stoll(field_value);
return FAILED; return FAILED;
} }
} else if (field_type == "NUMERIC") { } else if (field_type == "NUMERIC") {
if (sqlite3_bind_double(stmt, index, std::stold(field_value)) != SQLITE_OK) { if (sqlite3_bind_double(stmt, index, std::stold(field_value)) != SQLITE_OK) {
(void)sqlite3_finalize(stmt);
MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index
<< ", field value: " << std::stold(field_value); << ", field value: " << std::stold(field_value);
return FAILED; return FAILED;
} }
} else if (field_type == "NULL") { } else if (field_type == "NULL") {
if (sqlite3_bind_null(stmt, index) != SQLITE_OK) { if (sqlite3_bind_null(stmt, index) != SQLITE_OK) {
(void)sqlite3_finalize(stmt);
MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index << ", field value: NULL"; MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index << ", field value: NULL";
return FAILED; return FAILED;
} }
} else { } else {
if (sqlite3_bind_text(stmt, index, common::SafeCStr(field_value), -1, SQLITE_STATIC) != SQLITE_OK) { if (sqlite3_bind_text(stmt, index, common::SafeCStr(field_value), -1, SQLITE_STATIC) != SQLITE_OK) {
(void)sqlite3_finalize(stmt);
MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index << ", field value: " << field_value; MS_LOG(ERROR) << "SQL error: could not bind parameter, index: " << index << ", field value: " << field_value;
return FAILED; return FAILED;
} }
} }
} }
if (sqlite3_step(stmt) != SQLITE_DONE) { if (sqlite3_step(stmt) != SQLITE_DONE) {
(void)sqlite3_finalize(stmt);
MS_LOG(ERROR) << "SQL error: Could not step (execute) stmt."; MS_LOG(ERROR) << "SQL error: Could not step (execute) stmt.";
return FAILED; return FAILED;
} }
@ -422,7 +435,12 @@ ROW_DATA ShardIndexGenerator::GenerateRowData(int shard_no, const std::map<int,
std::vector<std::vector<std::tuple<std::string, std::string, std::string>>> full_data; std::vector<std::vector<std::tuple<std::string, std::string, std::string>>> full_data;
// current raw data page // current raw data page
std::shared_ptr<Page> cur_raw_page = shard_header_.GetPage(shard_no, raw_page_id).first; auto ret1 = shard_header_.GetPage(shard_no, raw_page_id);
if (ret1.second != SUCCESS) {
MS_LOG(ERROR) << "Get page failed";
return {FAILED, {}};
}
std::shared_ptr<Page> cur_raw_page = ret1.first;
// related blob page // related blob page
vector<pair<int, uint64_t>> row_group_list = cur_raw_page->GetRowGroupIds(); vector<pair<int, uint64_t>> row_group_list = cur_raw_page->GetRowGroupIds();
@ -430,7 +448,17 @@ ROW_DATA ShardIndexGenerator::GenerateRowData(int shard_no, const std::map<int,
// pair: row_group id, offset in raw data page // pair: row_group id, offset in raw data page
for (pair<int, int> blob_ids : row_group_list) { for (pair<int, int> blob_ids : row_group_list) {
// get blob data page according to row_group id // get blob data page according to row_group id
std::shared_ptr<Page> cur_blob_page = shard_header_.GetPage(shard_no, blob_id_to_page_id.at(blob_ids.first)).first; auto iter = blob_id_to_page_id.find(blob_ids.first);
if (iter == blob_id_to_page_id.end()) {
MS_LOG(ERROR) << "Convert blob id failed";
return {FAILED, {}};
}
auto ret2 = shard_header_.GetPage(shard_no, iter->second);
if (ret2.second != SUCCESS) {
MS_LOG(ERROR) << "Get page failed";
return {FAILED, {}};
}
std::shared_ptr<Page> cur_blob_page = ret2.first;
// offset in current raw data page // offset in current raw data page
auto cur_raw_page_offset = static_cast<uint64_t>(blob_ids.second); auto cur_raw_page_offset = static_cast<uint64_t>(blob_ids.second);
@ -618,7 +646,12 @@ void ShardIndexGenerator::DatabaseWriter() {
std::map<int, int> blob_id_to_page_id; std::map<int, int> blob_id_to_page_id;
std::vector<int> raw_page_ids; std::vector<int> raw_page_ids;
for (uint64_t i = 0; i < total_pages; ++i) { for (uint64_t i = 0; i < total_pages; ++i) {
std::shared_ptr<Page> cur_page = shard_header_.GetPage(shard_no, i).first; auto ret = shard_header_.GetPage(shard_no, i);
if (ret.second != SUCCESS) {
write_success_ = false;
return;
}
std::shared_ptr<Page> cur_page = ret.first;
if (cur_page->GetPageType() == "RAW_DATA") { if (cur_page->GetPageType() == "RAW_DATA") {
raw_page_ids.push_back(i); raw_page_ids.push_back(i);
} else if (cur_page->GetPageType() == "BLOB_DATA") { } else if (cur_page->GetPageType() == "BLOB_DATA") {

View File

@ -340,67 +340,78 @@ MSRStatus ShardReader::ConvertLabelToJson(const std::vector<std::vector<std::str
int shard_id, const std::vector<std::string> &columns, int shard_id, const std::vector<std::string> &columns,
std::shared_ptr<std::vector<std::vector<json>>> col_val_ptr) { std::shared_ptr<std::vector<std::vector<json>>> col_val_ptr) {
for (int i = 0; i < static_cast<int>(labels.size()); ++i) { for (int i = 0; i < static_cast<int>(labels.size()); ++i) {
uint64_t group_id = std::stoull(labels[i][0]); try {
uint64_t offset_start = std::stoull(labels[i][1]) + kInt64Len; uint64_t group_id = std::stoull(labels[i][0]);
uint64_t offset_end = std::stoull(labels[i][2]); uint64_t offset_start = std::stoull(labels[i][1]) + kInt64Len;
(*offset_ptr)[shard_id].emplace_back( uint64_t offset_end = std::stoull(labels[i][2]);
std::vector<uint64_t>{static_cast<uint64_t>(shard_id), group_id, offset_start, offset_end}); (*offset_ptr)[shard_id].emplace_back(
if (!all_in_index_) { std::vector<uint64_t>{static_cast<uint64_t>(shard_id), group_id, offset_start, offset_end});
int raw_page_id = std::stoi(labels[i][3]); if (!all_in_index_) {
uint64_t label_start = std::stoull(labels[i][4]) + kInt64Len; int raw_page_id = std::stoi(labels[i][3]);
uint64_t label_end = std::stoull(labels[i][5]); uint64_t label_start = std::stoull(labels[i][4]) + kInt64Len;
auto len = label_end - label_start; uint64_t label_end = std::stoull(labels[i][5]);
auto label_raw = std::vector<uint8_t>(len); auto len = label_end - label_start;
auto &io_seekg = fs->seekg(page_size_ * raw_page_id + header_size_ + label_start, std::ios::beg); auto label_raw = std::vector<uint8_t>(len);
if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) { auto &io_seekg = fs->seekg(page_size_ * raw_page_id + header_size_ + label_start, std::ios::beg);
MS_LOG(ERROR) << "File seekg failed"; if (!io_seekg.good() || io_seekg.fail() || io_seekg.bad()) {
fs->close(); MS_LOG(ERROR) << "File seekg failed";
return FAILED; fs->close();
} return FAILED;
}
auto &io_read = fs->read(reinterpret_cast<char *>(&label_raw[0]), len); auto &io_read = fs->read(reinterpret_cast<char *>(&label_raw[0]), len);
if (!io_read.good() || io_read.fail() || io_read.bad()) { if (!io_read.good() || io_read.fail() || io_read.bad()) {
MS_LOG(ERROR) << "File read failed"; MS_LOG(ERROR) << "File read failed";
fs->close(); fs->close();
return FAILED; return FAILED;
} }
json label_json = json::from_msgpack(label_raw); json label_json = json::from_msgpack(label_raw);
json tmp; json tmp;
if (!columns.empty()) { if (!columns.empty()) {
for (auto &col : columns) { for (auto &col : columns) {
if (label_json.find(col) != label_json.end()) { if (label_json.find(col) != label_json.end()) {
tmp[col] = label_json[col]; tmp[col] = label_json[col];
}
}
} else {
tmp = label_json;
}
(*col_val_ptr)[shard_id].emplace_back(tmp);
} else {
json construct_json;
for (unsigned int j = 0; j < columns.size(); ++j) {
// construct json "f1": value
auto schema = shard_header_->GetSchemas()[0]->GetSchema()["schema"];
// convert the string to base type by schema
if (schema[columns[j]]["type"] == "int32") {
construct_json[columns[j]] = StringToNum<int32_t>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "int64") {
construct_json[columns[j]] = StringToNum<int64_t>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "float32") {
construct_json[columns[j]] = StringToNum<float>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "float64") {
construct_json[columns[j]] = StringToNum<double>(labels[i][j + 3]);
} else {
construct_json[columns[j]] = std::string(labels[i][j + 3]);
} }
} }
} else { (*col_val_ptr)[shard_id].emplace_back(construct_json);
tmp = label_json;
} }
(*col_val_ptr)[shard_id].emplace_back(tmp); } catch (std::out_of_range &e) {
} else { MS_LOG(ERROR) << "Out of range: " << e.what();
json construct_json; return FAILED;
for (unsigned int j = 0; j < columns.size(); ++j) { } catch (std::invalid_argument &e) {
// construct json "f1": value MS_LOG(ERROR) << "Invalid argument: " << e.what();
auto schema = shard_header_->GetSchemas()[0]->GetSchema()["schema"]; return FAILED;
} catch (...) {
// convert the string to base type by schema MS_LOG(ERROR) << "Exception was caught while convert label to json.";
if (schema[columns[j]]["type"] == "int32") { return FAILED;
construct_json[columns[j]] = StringToNum<int32_t>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "int64") {
construct_json[columns[j]] = StringToNum<int64_t>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "float32") {
construct_json[columns[j]] = StringToNum<float>(labels[i][j + 3]);
} else if (schema[columns[j]]["type"] == "float64") {
construct_json[columns[j]] = StringToNum<double>(labels[i][j + 3]);
} else {
construct_json[columns[j]] = std::string(labels[i][j + 3]);
}
}
(*col_val_ptr)[shard_id].emplace_back(construct_json);
} }
} }
return SUCCESS; return SUCCESS;
} } // namespace mindrecord
MSRStatus ShardReader::ReadAllRowsInShard(int shard_id, const std::string &sql, const std::vector<std::string> &columns, MSRStatus ShardReader::ReadAllRowsInShard(int shard_id, const std::string &sql, const std::vector<std::string> &columns,
std::shared_ptr<std::vector<std::vector<std::vector<uint64_t>>>> offset_ptr, std::shared_ptr<std::vector<std::vector<std::vector<uint64_t>>>> offset_ptr,
@ -961,9 +972,13 @@ MSRStatus ShardReader::CountTotalRows(const std::vector<std::string> &file_paths
num_samples = category_op->GetNumSamples(num_samples, num_classes); num_samples = category_op->GetNumSamples(num_samples, num_classes);
if (std::dynamic_pointer_cast<ShardPkSample>(op)) { if (std::dynamic_pointer_cast<ShardPkSample>(op)) {
auto tmp = std::dynamic_pointer_cast<ShardPkSample>(op)->GetNumSamples(); auto tmp = std::dynamic_pointer_cast<ShardPkSample>(op)->GetNumSamples();
if (tmp != 0) { if (tmp != 0 && num_samples != -1) {
num_samples = std::min(num_samples, tmp); num_samples = std::min(num_samples, tmp);
} }
if (-1 == num_samples) {
MS_LOG(ERROR) << "Number of samples exceeds the upper limit: " << std::numeric_limits<int64_t>::max();
return FAILED;
}
} }
} else if (std::dynamic_pointer_cast<ShardSample>(op)) { } else if (std::dynamic_pointer_cast<ShardSample>(op)) {
if (std::dynamic_pointer_cast<ShardDistributedSample>(op)) { if (std::dynamic_pointer_cast<ShardDistributedSample>(op)) {

View File

@ -39,7 +39,14 @@ MSRStatus ShardCategory::Execute(ShardTaskList &tasks) { return SUCCESS; }
int64_t ShardCategory::GetNumSamples(int64_t dataset_size, int64_t num_classes) { int64_t ShardCategory::GetNumSamples(int64_t dataset_size, int64_t num_classes) {
if (dataset_size == 0) return dataset_size; if (dataset_size == 0) return dataset_size;
if (dataset_size > 0 && num_classes > 0 && num_categories_ > 0 && num_elements_ > 0) { if (dataset_size > 0 && num_classes > 0 && num_categories_ > 0 && num_elements_ > 0) {
return std::min(num_categories_, num_classes) * num_elements_; num_classes = std::min(num_categories_, num_classes);
if (num_classes == 0) {
return 0;
}
if (num_elements_ > std::numeric_limits<int64_t>::max() / num_classes) {
return -1;
}
return num_classes * num_elements_;
} }
return 0; return 0;
} }