!21670 fix: null numpy field in mindrecord will cause read error

Merge pull request !21670 from guozhijian/fix_null_numpy_field_in_mr
This commit is contained in:
i-robot 2021-08-12 03:39:34 +00:00 committed by Gitee
commit bff5cbda9c
4 changed files with 70 additions and 2 deletions

View File

@ -155,7 +155,9 @@ Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *
// Sanity check that the computed element counts divide evenly into the input element count
if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
RETURN_STATUS_UNEXPECTED("Requested shape has an invalid element count!");
std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
", number elements of shape: " + std::to_string(num_elements_of_shape);
RETURN_STATUS_UNEXPECTED(err);
}
// If there was any unknown dimensions, then update the requested shape to fill in the unknown

View File

@ -282,7 +282,12 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
RETURN_IF_NOT_OK(Tensor::CreateScalar(s, &tensor));
} else if (column.hasShape()) {
auto new_shape = TensorShape(column.shape());
RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
// if the numpy is null, create empty tensor shape
if (num_elements == 0) {
new_shape = TensorShape({});
} else {
RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
}
RETURN_IF_NOT_OK(Tensor::CreateFromMemory(new_shape, type, data, &tensor));
} else {
std::vector<dsize_t> shapeDetails = {static_cast<dsize_t>(num_elements)};

View File

@ -421,6 +421,12 @@ MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr<
auto data = reinterpret_cast<const unsigned char *>(array_data.get());
*data_ptr = std::make_unique<unsigned char[]>(*num_bytes);
// field is none. for example: numpy is null
if (*num_bytes == 0) {
return SUCCESS;
}
int ret_code = memcpy_s(data_ptr->get(), *num_bytes, data, *num_bytes);
if (ret_code != 0) {
MS_LOG(ERROR) << "Failed to copy data!";

View File

@ -2568,6 +2568,60 @@ def test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files):
assert datas_epoch2 not in (datas_epoch1, datas_epoch3)
assert datas_epoch3 not in (datas_epoch2, datas_epoch1)
def test_field_is_null_numpy():
    """Read back MindRecord data whose ``array_d`` field is an empty numpy array.

    Writes 16 rows in which ``array_d`` is ``np.array([], dtype=np.int64)``, then
    checks the dataset size, output shapes and output types reported by
    ``ds.MindDataset``. The generated files are removed even when a check fails.
    """
    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]

    def remove_generated_files():
        # Delete every generated file and its ".db" counterpart when present.
        for path in paths:
            for target in (path, "{}.db".format(path)):
                if os.path.exists(target):
                    os.remove(target)

    # Start from a clean state in case an earlier run left files behind.
    remove_generated_files()
    try:
        writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
        data = []
        # field array_d is null
        for row_id in range(16):
            data.append({
                "label": row_id,
                "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129,
                                                255, 256, -32768, 32767, -32769, 32768, -2147483648,
                                                2147483647], dtype=np.int32), [-1]),
                "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255,
                                                256, -32768, 32767, -32769, 32768,
                                                -2147483648, 2147483647, -2147483649, 2147483649,
                                                -922337036854775808, 9223372036854775807]), [1, -1]),
                "array_d": np.array([], dtype=np.int64)
            })
        nlp_schema_json = {"label": {"type": "int32"},
                           "array_a": {"type": "int32",
                                       "shape": [-1]},
                           "array_b": {"type": "int64",
                                       "shape": [1, -1]},
                           "array_d": {"type": "int64",
                                       "shape": [-1]}
                           }
        writer.set_header_size(1 << 14)
        writer.set_page_size(1 << 15)
        writer.add_schema(nlp_schema_json, "nlp_schema")
        writer.write_raw_data(data)
        writer.commit()

        data_set = ds.MindDataset(dataset_file=NLP_FILE_NAME + "0",
                                  columns_list=["label", "array_a", "array_b", "array_d"],
                                  num_parallel_workers=2,
                                  shuffle=False)
        assert data_set.get_dataset_size() == 16
        # The empty array_d column is reported with an empty shape.
        assert data_set.output_shapes() == [[], [15], [1, 19], []]
        assert data_set.output_types()[0] == np.int32
        assert data_set.output_types()[1] == np.int32
        assert data_set.output_types()[2] == np.int64
        assert data_set.output_types()[3] == np.int64
    finally:
        # Run cleanup on failure too, so a broken assertion does not leave
        # stale files that would affect later tests.
        remove_generated_files()
if __name__ == '__main__':
test_nlp_compress_data(add_and_remove_nlp_compress_file)
test_nlp_compress_data_old_version(add_and_remove_nlp_compress_file)
@ -2603,3 +2657,4 @@ if __name__ == '__main__':
test_shuffle_with_global_infile_files(create_multi_mindrecord_files)
test_distributed_shuffle_with_global_infile_files(create_multi_mindrecord_files)
test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files)
test_field_is_null_numpy()