!21670 fix: a null numpy field in mindrecord causes a read error
Merge pull request !21670 from guozhijian/fix_null_numpy_field_in_mr
This commit is contained in:
commit bff5cbda9c
@@ -155,7 +155,9 @@ Status ColDescriptor::MaterializeTensorShape(int32_t num_elements, TensorShape *
   // Sanity check the the computed element counts divide evenly into the input element count
   if (num_elements < num_elements_of_shape || num_elements_of_shape == 0 || num_elements % num_elements_of_shape != 0) {
-    RETURN_STATUS_UNEXPECTED("Requested shape has an invalid element count!");
+    std::string err = "Requested shape has an invalid element count! Number elements: " + std::to_string(num_elements) +
+                      ", number elements of shape: " + std::to_string(num_elements_of_shape);
+    RETURN_STATUS_UNEXPECTED(err);
   }

   // If there was any unknown dimensions, then update the requested shape to fill in the unknown
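This is the check that a zero-element ("null") numpy field used to trip: an empty field reports num_elements == 0, which can never pass the divisibility test, so the whole read failed; the hunk only makes the error message carry both counts. A minimal sketch in plain Python (a hypothetical materialize_shape helper, not the MindSpore API) of the materialization logic:

    # Sketch of MaterializeTensorShape: the product of the known dimensions
    # must divide the stored element count evenly, and a -1 dimension is
    # then filled in with the quotient.
    def materialize_shape(requested_shape, num_elements):
        num_elements_of_shape = 1
        for dim in requested_shape:
            if dim != -1:
                num_elements_of_shape *= dim
        if (num_elements < num_elements_of_shape or num_elements_of_shape == 0
                or num_elements % num_elements_of_shape != 0):
            raise ValueError("Requested shape has an invalid element count! "
                             "Number elements: %d, number elements of shape: %d"
                             % (num_elements, num_elements_of_shape))
        return [dim if dim != -1 else num_elements // num_elements_of_shape
                for dim in requested_shape]

    print(materialize_shape([1, -1], 19))  # [1, 19]
    try:
        materialize_shape([-1], 0)         # a null numpy field: 0 < 1 fails the check
    except ValueError as e:
        print(e)

The next hunk shows how the read path now sidesteps this check for empty fields.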
@@ -282,7 +282,12 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
     RETURN_IF_NOT_OK(Tensor::CreateScalar(s, &tensor));
   } else if (column.hasShape()) {
     auto new_shape = TensorShape(column.shape());
-    RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
+    // if the numpy is null, create empty tensor shape
+    if (num_elements == 0) {
+      new_shape = TensorShape({});
+    } else {
+      RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
+    }
     RETURN_IF_NOT_OK(Tensor::CreateFromMemory(new_shape, type, data, &tensor));
   } else {
     std::vector<dsize_t> shapeDetails = {static_cast<dsize_t>(num_elements)};
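This hunk is the read-path fix: a field with zero elements skips shape materialization entirely and gets an empty TensorShape, so the element-count check above never fires. Continuing the Python sketch (reusing the hypothetical materialize_shape helper from the previous block):

    def load_shape(requested_shape, num_elements):
        # if the numpy is null, use an empty shape instead of materializing
        # -1 dimensions, which would fail for 0 elements
        if num_elements == 0:
            return []
        return materialize_shape(requested_shape, num_elements)

    print(load_shape([-1], 0))      # []   (previously: "invalid element count" error)
    print(load_shape([1, -1], 19))  # [1, 19]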
@@ -421,6 +421,12 @@ MSRStatus ShardColumn::UncompressInt(const uint64_t &column_id, std::unique_ptr<
   auto data = reinterpret_cast<const unsigned char *>(array_data.get());
   *data_ptr = std::make_unique<unsigned char[]>(*num_bytes);

+  // field is none. for example: numpy is null
+  if (*num_bytes == 0) {
+    return SUCCESS;
+  }
+
   int ret_code = memcpy_s(data_ptr->get(), *num_bytes, data, *num_bytes);
   if (ret_code != 0) {
     MS_LOG(ERROR) << "Failed to copy data!";
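The zero-length case is also guarded in the uncompress path: the destination buffer is allocated as usual, but the function returns SUCCESS before the copy when the field holds zero bytes, presumably because memcpy_s implementations reject a zero-sized destination instead of treating it as a no-op. A rough Python rendering of the control flow (hypothetical helper, for illustration only):

    def uncompress_field(src):
        dst = bytearray(len(src))  # *data_ptr = std::make_unique<unsigned char[]>(*num_bytes)
        if len(src) == 0:          # field is none, e.g. the numpy cell is null
            return bytes(dst)      # return SUCCESS without attempting the copy
        dst[:] = src               # memcpy_s(data_ptr->get(), *num_bytes, data, *num_bytes)
        return bytes(dst)

    assert uncompress_field(b"") == b""
    assert uncompress_field(b"\x01\x02") == b"\x01\x02"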
@@ -2568,6 +2568,60 @@ def test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files):
     assert datas_epoch2 not in (datas_epoch1, datas_epoch3)
     assert datas_epoch3 not in (datas_epoch2, datas_epoch1)

+def test_field_is_null_numpy():
+    """add/remove nlp file"""
+    paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
+             for x in range(FILES_NUM)]
+    for x in paths:
+        if os.path.exists("{}".format(x)):
+            os.remove("{}".format(x))
+        if os.path.exists("{}.db".format(x)):
+            os.remove("{}.db".format(x))
+
+    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
+    data = []
+    # field array_d is null
+    for row_id in range(16):
+        data.append({
+            "label": row_id,
+            "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129,
+                                            255, 256, -32768, 32767, -32769, 32768, -2147483648,
+                                            2147483647], dtype=np.int32), [-1]),
+            "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255,
+                                            256, -32768, 32767, -32769, 32768,
+                                            -2147483648, 2147483647, -2147483649, 2147483649,
+                                            -9223372036854775808, 9223372036854775807]), [1, -1]),
+            "array_d": np.array([], dtype=np.int64)
+        })
+    nlp_schema_json = {"label": {"type": "int32"},
+                       "array_a": {"type": "int32",
+                                   "shape": [-1]},
+                       "array_b": {"type": "int64",
+                                   "shape": [1, -1]},
+                       "array_d": {"type": "int64",
+                                   "shape": [-1]}
+                       }
+    writer.set_header_size(1 << 14)
+    writer.set_page_size(1 << 15)
+    writer.add_schema(nlp_schema_json, "nlp_schema")
+    writer.write_raw_data(data)
+    writer.commit()
+
+    data_set = ds.MindDataset(dataset_file=NLP_FILE_NAME + "0",
+                              columns_list=["label", "array_a", "array_b", "array_d"],
+                              num_parallel_workers=2,
+                              shuffle=False)
+    assert data_set.get_dataset_size() == 16
+    assert data_set.output_shapes() == [[], [15], [1, 19], []]
+    assert data_set.output_types()[0] == np.int32
+    assert data_set.output_types()[1] == np.int32
+    assert data_set.output_types()[2] == np.int64
+    assert data_set.output_types()[3] == np.int64
+
+    for x in paths:
+        os.remove("{}".format(x))
+        os.remove("{}.db".format(x))
+
 if __name__ == '__main__':
     test_nlp_compress_data(add_and_remove_nlp_compress_file)
     test_nlp_compress_data_old_version(add_and_remove_nlp_compress_file)
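Beyond the asserts in the test above, a short usage sketch (assuming the same NLP_FILE_NAME files written by the test are still on disk) shows that a dataset whose array_d column is empty in every row now iterates cleanly instead of failing at read time:

    import mindspore.dataset as ds

    data_set = ds.MindDataset(dataset_file=NLP_FILE_NAME + "0",
                              columns_list=["label", "array_d"],
                              shuffle=False)
    for item in data_set.create_dict_iterator(output_numpy=True):
        # each row's array_d should come back as an int64 ndarray with 0 elements
        print(item["label"], item["array_d"].size)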
@@ -2603,3 +2657,4 @@ if __name__ == '__main__':
     test_shuffle_with_global_infile_files(create_multi_mindrecord_files)
     test_distributed_shuffle_with_global_infile_files(create_multi_mindrecord_files)
     test_distributed_shuffle_with_multi_epochs(create_multi_mindrecord_files)
+    test_field_is_null_numpy()