diff --git a/tests/ut/python/dataset/test_minddataset.py b/tests/ut/python/dataset/test_minddataset.py
index 5791ea9618..24e6595233 100644
--- a/tests/ut/python/dataset/test_minddataset.py
+++ b/tests/ut/python/dataset/test_minddataset.py
@@ -46,58 +46,71 @@ def add_and_remove_cv_file():
     """add/remove cv file"""
     paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
              for x in range(FILES_NUM)]
-    for x in paths:
-        if os.path.exists("{}".format(x)):
+    try:
+        for x in paths:
+            if os.path.exists("{}".format(x)):
+                os.remove("{}".format(x))
+            if os.path.exists("{}.db".format(x)):
+                os.remove("{}.db".format(x))
+        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
+        data = get_data(CV_DIR_NAME)
+        cv_schema_json = {"id": {"type": "int32"},
+                          "file_name": {"type": "string"},
+                          "label": {"type": "int32"},
+                          "data": {"type": "bytes"}}
+        writer.add_schema(cv_schema_json, "img_schema")
+        writer.add_index(["file_name", "label"])
+        writer.write_raw_data(data)
+        writer.commit()
+        yield "yield_cv_data"
+    except Exception as error:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
+        raise error
+    else:
+        for x in paths:
             os.remove("{}".format(x))
-        if os.path.exists("{}.db".format(x)):
             os.remove("{}.db".format(x))
-    writer = FileWriter(CV_FILE_NAME, FILES_NUM)
-    data = get_data(CV_DIR_NAME)
-    cv_schema_json = {"id": {"type": "int32"},
-                      "file_name": {"type": "string"},
-                      "label": {"type": "int32"},
-                      "data": {"type": "bytes"}}
-    writer.add_schema(cv_schema_json, "img_schema")
-    writer.add_index(["file_name", "label"])
-    writer.write_raw_data(data)
-    writer.commit()
-    yield "yield_cv_data"
-    for x in paths:
-        os.remove("{}".format(x))
-        os.remove("{}.db".format(x))
-

 @pytest.fixture
 def add_and_remove_nlp_file():
     """add/remove nlp file"""
     paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0'))
              for x in range(FILES_NUM)]
-    for x in paths:
-        if os.path.exists("{}".format(x)):
+    try:
+        for x in paths:
+            if os.path.exists("{}".format(x)):
+                os.remove("{}".format(x))
+            if os.path.exists("{}.db".format(x)):
+                os.remove("{}.db".format(x))
+        writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
+        data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)]
+        nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
+                           "rating": {"type": "float32"},
+                           "input_ids": {"type": "int64",
+                                         "shape": [-1]},
+                           "input_mask": {"type": "int64",
+                                          "shape": [1, -1]},
+                           "segment_ids": {"type": "int64",
+                                           "shape": [2, -1]}
+                           }
+        writer.set_header_size(1 << 14)
+        writer.set_page_size(1 << 15)
+        writer.add_schema(nlp_schema_json, "nlp_schema")
+        writer.add_index(["id", "rating"])
+        writer.write_raw_data(data)
+        writer.commit()
+        yield "yield_nlp_data"
+    except Exception as error:
+        for x in paths:
+            os.remove("{}".format(x))
+            os.remove("{}.db".format(x))
+        raise error
+    else:
+        for x in paths:
             os.remove("{}".format(x))
-        if os.path.exists("{}.db".format(x)):
             os.remove("{}.db".format(x))
-    writer = FileWriter(NLP_FILE_NAME, FILES_NUM)
-    data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)]
-    nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"},
-                       "rating": {"type": "float32"},
-                       "input_ids": {"type": "int64",
-                                     "shape": [-1]},
-                       "input_mask": {"type": "int64",
-                                      "shape": [1, -1]},
-                       "segment_ids": {"type": "int64",
-                                       "shape": [2, -1]}
-                       }
-    writer.set_header_size(1 << 14)
-    writer.set_page_size(1 << 15)
-    writer.add_schema(nlp_schema_json, "nlp_schema")
-    writer.add_index(["id", "rating"])
-    writer.write_raw_data(data)
-    writer.commit()
-    yield "yield_nlp_data"
-
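
# --- Editor's sketch (illustrative only, not part of the patch above) ---------
# The fixtures in this hunk duplicate the cleanup loop in both the `except` and
# `else` branches. A single try/finally around the yield gives the same
# guarantee with one copy of the cleanup. The fixture name below is
# hypothetical; FileWriter, get_data, CV_FILE_NAME, CV_DIR_NAME and FILES_NUM
# are the helpers/constants already defined or imported in this test module.
import os

import pytest
from mindspore.mindrecord import FileWriter


@pytest.fixture
def add_and_remove_cv_file_sketch():
    paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(FILES_NUM)]
    try:
        for x in paths:
            if os.path.exists(x):
                os.remove(x)
            if os.path.exists("{}.db".format(x)):
                os.remove("{}.db".format(x))
        writer = FileWriter(CV_FILE_NAME, FILES_NUM)
        writer.add_schema({"id": {"type": "int32"},
                           "file_name": {"type": "string"},
                           "label": {"type": "int32"},
                           "data": {"type": "bytes"}}, "img_schema")
        writer.add_index(["file_name", "label"])
        writer.write_raw_data(get_data(CV_DIR_NAME))
        writer.commit()
        yield "yield_cv_data"
    finally:
        # Runs whether setup, the test body, or anything in between raised or not.
        for x in paths:
            if os.path.exists(x):
                os.remove(x)
            if os.path.exists("{}.db".format(x)):
                os.remove("{}.db".format(x))
# ------------------------------------------------------------------------------
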
for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) @pytest.fixture @@ -105,44 +118,51 @@ def add_and_remove_nlp_compress_file(): """add/remove nlp file""" paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(NLP_FILE_NAME, FILES_NUM) + data = [] + for row_id in range(16): + data.append({ + "label": row_id, + "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, + 255, 256, -32768, 32767, -32769, 32768, -2147483648, + 2147483647], dtype=np.int32), [-1]), + "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255, + 256, -32768, 32767, -32769, 32768, + -2147483648, 2147483647, -2147483649, 2147483649, + -922337036854775808, 9223372036854775807]), [1, -1]), + "array_c": str.encode("nlp data"), + "array_d": np.reshape(np.array([[-10, -127], [10, 127]]), [2, -1]) + }) + nlp_schema_json = {"label": {"type": "int32"}, + "array_a": {"type": "int32", + "shape": [-1]}, + "array_b": {"type": "int64", + "shape": [1, -1]}, + "array_c": {"type": "bytes"}, + "array_d": {"type": "int64", + "shape": [2, -1]} + } + writer.set_header_size(1 << 14) + writer.set_page_size(1 << 15) + writer.add_schema(nlp_schema_json, "nlp_schema") + writer.write_raw_data(data) + writer.commit() + yield "yield_nlp_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(NLP_FILE_NAME, FILES_NUM) - data = [] - for row_id in range(16): - data.append({ - "label": row_id, - "array_a": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, - 255, 256, -32768, 32767, -32769, 32768, -2147483648, - 2147483647], dtype=np.int32), [-1]), - "array_b": np.reshape(np.array([0, 1, -1, 127, -128, 128, -129, 255, - 256, -32768, 32767, -32769, 32768, - -2147483648, 2147483647, -2147483649, 2147483649, - -922337036854775808, 9223372036854775807]), [1, -1]), - "array_c": str.encode("nlp data"), - "array_d": np.reshape(np.array([[-10, -127], [10, 127]]), [2, -1]) - }) - nlp_schema_json = {"label": {"type": "int32"}, - "array_a": {"type": "int32", - "shape": [-1]}, - "array_b": {"type": "int64", - "shape": [1, -1]}, - "array_c": {"type": "bytes"}, - "array_d": {"type": "int64", - "shape": [2, -1]} - } - writer.set_header_size(1 << 14) - writer.set_page_size(1 << 15) - writer.add_schema(nlp_schema_json, "nlp_schema") - writer.write_raw_data(data) - writer.commit() - yield "yield_nlp_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_nlp_compress_data(add_and_remove_nlp_compress_file): @@ -199,22 +219,29 @@ def test_cv_minddataset_writer_tutorial(): """tutorial for cv dataset writer.""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"file_name": {"type": "string"}, "label": {"type": "int32"}, + "data": {"type": "bytes"}} + 
writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"file_name": {"type": "string"}, "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_cv_minddataset_partition_tutorial(add_and_remove_cv_file): @@ -654,106 +681,124 @@ def test_cv_minddataset_reader_one_partition(add_and_remove_cv_file): def test_cv_minddataset_reader_two_dataset(add_and_remove_cv_file): """tutorial for cv minderdataset.""" - if os.path.exists(CV1_FILE_NAME): - os.remove(CV1_FILE_NAME) - if os.path.exists("{}.db".format(CV1_FILE_NAME)): - os.remove("{}.db".format(CV1_FILE_NAME)) - if os.path.exists(CV2_FILE_NAME): - os.remove(CV2_FILE_NAME) - if os.path.exists("{}.db".format(CV2_FILE_NAME)): - os.remove("{}.db".format(CV2_FILE_NAME)) - writer = FileWriter(CV1_FILE_NAME, 1) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV1_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - - writer = FileWriter(CV2_FILE_NAME, 1) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV2_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - columns_list = ["data", "file_name", "label"] - num_readers = 4 - data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(FILES_NUM)] + [CV1_FILE_NAME, CV2_FILE_NAME], - columns_list, num_readers) - assert data_set.get_dataset_size() == 30 - num_iter = 0 - for item in data_set.create_dict_iterator(): - logger.info( - "-------------- cv reader basic: {} ------------------------".format(num_iter)) - logger.info( - "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) - logger.info( - "-------------- item[data]: {} -----------------------------".format(item["data"])) - logger.info( - "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) - logger.info( - "-------------- item[label]: {} ----------------------------".format(item["label"])) - num_iter += 1 - assert num_iter == 30 - if os.path.exists(CV1_FILE_NAME): - os.remove(CV1_FILE_NAME) - if os.path.exists("{}.db".format(CV1_FILE_NAME)): - os.remove("{}.db".format(CV1_FILE_NAME)) - if os.path.exists(CV2_FILE_NAME): - os.remove(CV2_FILE_NAME) - if os.path.exists("{}.db".format(CV2_FILE_NAME)): - os.remove("{}.db".format(CV2_FILE_NAME)) + try: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + 
os.remove("{}.db".format(CV2_FILE_NAME)) + writer = FileWriter(CV1_FILE_NAME, 1) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV1_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + writer = FileWriter(CV2_FILE_NAME, 1) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV2_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + columns_list = ["data", "file_name", "label"] + num_readers = 4 + data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(FILES_NUM)] + [CV1_FILE_NAME, CV2_FILE_NAME], + columns_list, num_readers) + assert data_set.get_dataset_size() == 30 + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info( + "-------------- cv reader basic: {} ------------------------".format(num_iter)) + logger.info( + "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) + logger.info( + "-------------- item[data]: {} -----------------------------".format(item["data"])) + logger.info( + "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) + logger.info( + "-------------- item[label]: {} ----------------------------".format(item["label"])) + num_iter += 1 + assert num_iter == 30 + except Exception as error: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + os.remove("{}.db".format(CV2_FILE_NAME)) + raise error + else: + if os.path.exists(CV1_FILE_NAME): + os.remove(CV1_FILE_NAME) + if os.path.exists("{}.db".format(CV1_FILE_NAME)): + os.remove("{}.db".format(CV1_FILE_NAME)) + if os.path.exists(CV2_FILE_NAME): + os.remove(CV2_FILE_NAME) + if os.path.exists("{}.db".format(CV2_FILE_NAME)): + os.remove("{}.db".format(CV2_FILE_NAME)) def test_cv_minddataset_reader_two_dataset_partition(add_and_remove_cv_file): paths = ["{}{}".format(CV1_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV1_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "CV1_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + + columns_list = ["data", "file_name", "label"] + num_readers = 4 + data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(2)] + + [CV1_FILE_NAME + str(x) for x in range(2, 4)], + columns_list, num_readers) + assert data_set.get_dataset_size() < 20 + num_iter = 0 + for item in data_set.create_dict_iterator(): + logger.info( + "-------------- cv reader basic: {} ------------------------".format(num_iter)) + logger.info( + "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) 
+ logger.info( + "-------------- item[data]: {} -----------------------------".format(item["data"])) + logger.info( + "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) + logger.info( + "-------------- item[label]: {} ----------------------------".format(item["label"])) + num_iter += 1 + assert num_iter < 20 + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV1_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "CV1_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - - columns_list = ["data", "file_name", "label"] - num_readers = 4 - data_set = ds.MindDataset([CV_FILE_NAME + str(x) for x in range(2)] + [CV1_FILE_NAME + str(x) for x in range(2, 4)], - columns_list, num_readers) - assert data_set.get_dataset_size() < 20 - num_iter = 0 - for item in data_set.create_dict_iterator(): - logger.info( - "-------------- cv reader basic: {} ------------------------".format(num_iter)) - logger.info( - "-------------- len(item[data]): {} ------------------------".format(len(item["data"]))) - logger.info( - "-------------- item[data]: {} -----------------------------".format(item["data"])) - logger.info( - "-------------- item[file_name]: {} ------------------------".format(item["file_name"])) - logger.info( - "-------------- item[label]: {} ----------------------------".format(item["label"])) - num_iter += 1 - assert num_iter < 20 - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) - def test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -1086,809 +1131,870 @@ def inputs(vectors, maxlen=50): def test_write_with_multi_bytes_and_array_and_read_by_MindDataset(): mindrecord_file_name = "test.mindrecord" - if os.path.exists("{}".format(mindrecord_file_name)): + try: + if os.path.exists("{}".format(mindrecord_file_name)): + os.remove("{}".format(mindrecord_file_name)) + if os.path.exists("{}.db".format(mindrecord_file_name)): + os.remove("{}.db".format(mindrecord_file_name)) + data = [{"file_name": "001.jpg", "label": 4, + "image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8'), + "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, + {"file_name": "002.jpg", "label": 5, + "image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8'), + "source_sos_ids": 
np.array([11, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, + {"file_name": "003.jpg", "label": 6, + "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8'), + "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, + {"file_name": "004.jpg", "label": 7, + "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "image1": bytes("image4 bytes abc", encoding='UTF-8'), + "image2": bytes("image4 bytes def", encoding='UTF-8'), + "image3": bytes("image4 bytes ghi", encoding='UTF-8'), + "image4": bytes("image4 bytes jkl", encoding='UTF-8'), + "image5": bytes("image4 bytes mno", encoding='UTF-8'), + "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, + {"file_name": "005.jpg", "label": 8, + "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), + "image1": bytes("image5 bytes abc", encoding='UTF-8'), + "image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8'), + "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, + {"file_name": "006.jpg", "label": 9, + "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), + "image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8'), + "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} + ] + + writer = FileWriter(mindrecord_file_name) + schema = {"file_name": {"type": "string"}, + "image1": 
{"type": "bytes"}, + "image2": {"type": "bytes"}, + "source_sos_ids": {"type": "int64", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "image3": {"type": "bytes"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}, + "target_sos_ids": {"type": "int64", "shape": [-1]}, + "target_sos_mask": {"type": "int64", "shape": [-1]}, + "target_eos_ids": {"type": "int64", "shape": [-1]}, + "target_eos_mask": {"type": "int64", "shape": [-1]}, + "label": {"type": "int32"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() + + # change data value to list + data_value_to_list = [] + for item in data: + new_data = {} + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') + new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) + new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) + new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) + new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) + new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) + new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) + new_data['source_sos_ids'] = item["source_sos_ids"] + new_data['source_sos_mask'] = item["source_sos_mask"] + new_data['target_sos_ids'] = item["target_sos_ids"] + new_data['target_sos_mask'] = item["target_sos_mask"] + new_data['target_eos_ids'] = item["target_eos_ids"] + new_data['target_eos_mask'] = item["target_eos_mask"] + data_value_to_list.append(new_data) + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 13 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_sos_ids", + "source_sos_mask", "target_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == data[num_iter][field]).all() + else: + assert item[field] == data[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image2", "source_sos_mask", "image3", "target_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 4 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 3 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_sos_ids", + "image4", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if 
isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 3 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_sos_ids", "image5", + "image4", "image3", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", "image5", + "image2", "source_sos_mask", "label"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["label", "target_eos_mask", "image1", "target_eos_ids", + "source_sos_mask", "image2", "image4", "image3", + "source_sos_ids", "image5", "file_name"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 11 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: os.remove("{}".format(mindrecord_file_name)) - if os.path.exists("{}.db".format(mindrecord_file_name)): os.remove("{}.db".format(mindrecord_file_name)) - data = [{"file_name": "001.jpg", "label": 4, - "image1": bytes("image1 bytes abc", encoding='UTF-8'), - "image2": bytes("image1 bytes def", encoding='UTF-8'), - "source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "image3": bytes("image1 bytes ghi", encoding='UTF-8'), - "image4": bytes("image1 bytes jkl", encoding='UTF-8'), - "image5": bytes("image1 bytes mno", encoding='UTF-8'), - "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, - {"file_name": "002.jpg", "label": 5, - "image1": bytes("image2 bytes abc", encoding='UTF-8'), - "image2": bytes("image2 bytes def", encoding='UTF-8'), - "image3": bytes("image2 bytes ghi", encoding='UTF-8'), - "image4": bytes("image2 bytes jkl", encoding='UTF-8'), - "image5": bytes("image2 bytes mno", encoding='UTF-8'), - "source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([16, 7, 8, 
9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, - {"file_name": "003.jpg", "label": 6, - "source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "image1": bytes("image3 bytes abc", encoding='UTF-8'), - "image2": bytes("image3 bytes def", encoding='UTF-8'), - "image3": bytes("image3 bytes ghi", encoding='UTF-8'), - "image4": bytes("image3 bytes jkl", encoding='UTF-8'), - "image5": bytes("image3 bytes mno", encoding='UTF-8'), - "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, - {"file_name": "004.jpg", "label": 7, - "source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "image1": bytes("image4 bytes abc", encoding='UTF-8'), - "image2": bytes("image4 bytes def", encoding='UTF-8'), - "image3": bytes("image4 bytes ghi", encoding='UTF-8'), - "image4": bytes("image4 bytes jkl", encoding='UTF-8'), - "image5": bytes("image4 bytes mno", encoding='UTF-8'), - "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, - {"file_name": "005.jpg", "label": 8, - "source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), - "image1": bytes("image5 bytes abc", encoding='UTF-8'), - "image2": bytes("image5 bytes def", encoding='UTF-8'), - "image3": bytes("image5 bytes ghi", encoding='UTF-8'), - "image4": bytes("image5 bytes jkl", encoding='UTF-8'), - "image5": bytes("image5 bytes mno", encoding='UTF-8'), - "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, - {"file_name": "006.jpg", "label": 9, - "source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), - "image1": bytes("image6 bytes abc", encoding='UTF-8'), - "image2": bytes("image6 bytes def", encoding='UTF-8'), - "image3": bytes("image6 bytes ghi", encoding='UTF-8'), - "image4": bytes("image6 bytes jkl", encoding='UTF-8'), - "image5": bytes("image6 bytes mno", encoding='UTF-8'), - "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} - ] - - writer = FileWriter(mindrecord_file_name) - schema = {"file_name": {"type": "string"}, - "image1": {"type": "bytes"}, - "image2": {"type": "bytes"}, - "source_sos_ids": {"type": "int64", 
"shape": [-1]}, - "source_sos_mask": {"type": "int64", "shape": [-1]}, - "image3": {"type": "bytes"}, - "image4": {"type": "bytes"}, - "image5": {"type": "bytes"}, - "target_sos_ids": {"type": "int64", "shape": [-1]}, - "target_sos_mask": {"type": "int64", "shape": [-1]}, - "target_eos_ids": {"type": "int64", "shape": [-1]}, - "target_eos_mask": {"type": "int64", "shape": [-1]}, - "label": {"type": "int32"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() - - # change data value to list - data_value_to_list = [] - for item in data: - new_data = {} - new_data['file_name'] = np.asarray(item["file_name"], dtype='S') - new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) - new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) - new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) - new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) - new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) - new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) - new_data['source_sos_ids'] = item["source_sos_ids"] - new_data['source_sos_mask'] = item["source_sos_mask"] - new_data['target_sos_ids'] = item["target_sos_ids"] - new_data['target_sos_mask'] = item["target_sos_mask"] - new_data['target_eos_ids'] = item["target_eos_ids"] - new_data['target_eos_mask'] = item["target_eos_mask"] - data_value_to_list.append(new_data) - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 13 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_sos_ids", - "source_sos_mask", "target_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == data[num_iter][field]).all() - else: - assert item[field] == data[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=[ - "image2", "source_sos_mask", "image3", "target_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 4 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 3 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_sos_ids", - "image4", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - 
data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 3 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_sos_ids", "image5", - "image4", "image3", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", "image5", - "image2", "source_sos_mask", "label"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["label", "target_eos_mask", "image1", "target_eos_ids", "source_sos_mask", - "image2", "image4", "image3", "source_sos_ids", "image5", "file_name"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 11 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) def test_write_with_multi_bytes_and_MindDataset(): mindrecord_file_name = "test.mindrecord" - data = [{"file_name": "001.jpg", "label": 43, - "image1": bytes("image1 bytes abc", encoding='UTF-8'), - "image2": bytes("image1 bytes def", encoding='UTF-8'), - "image3": bytes("image1 bytes ghi", encoding='UTF-8'), - "image4": bytes("image1 bytes jkl", encoding='UTF-8'), - "image5": bytes("image1 bytes mno", encoding='UTF-8')}, - {"file_name": "002.jpg", "label": 91, - "image1": bytes("image2 bytes abc", encoding='UTF-8'), - "image2": bytes("image2 bytes def", encoding='UTF-8'), - "image3": bytes("image2 bytes ghi", encoding='UTF-8'), - "image4": bytes("image2 bytes jkl", encoding='UTF-8'), - "image5": bytes("image2 bytes mno", encoding='UTF-8')}, - {"file_name": "003.jpg", "label": 61, - "image1": bytes("image3 bytes abc", encoding='UTF-8'), - "image2": bytes("image3 bytes def", encoding='UTF-8'), - "image3": bytes("image3 bytes ghi", encoding='UTF-8'), - "image4": bytes("image3 bytes jkl", encoding='UTF-8'), - "image5": bytes("image3 bytes mno", encoding='UTF-8')}, - {"file_name": "004.jpg", "label": 29, - "image1": bytes("image4 bytes abc", encoding='UTF-8'), - "image2": bytes("image4 bytes def", encoding='UTF-8'), - "image3": bytes("image4 bytes ghi", encoding='UTF-8'), - "image4": bytes("image4 bytes jkl", encoding='UTF-8'), - "image5": bytes("image4 bytes mno", encoding='UTF-8')}, - {"file_name": "005.jpg", "label": 78, - "image1": 
bytes("image5 bytes abc", encoding='UTF-8'), - "image2": bytes("image5 bytes def", encoding='UTF-8'), - "image3": bytes("image5 bytes ghi", encoding='UTF-8'), - "image4": bytes("image5 bytes jkl", encoding='UTF-8'), - "image5": bytes("image5 bytes mno", encoding='UTF-8')}, - {"file_name": "006.jpg", "label": 37, - "image1": bytes("image6 bytes abc", encoding='UTF-8'), - "image2": bytes("image6 bytes def", encoding='UTF-8'), - "image3": bytes("image6 bytes ghi", encoding='UTF-8'), - "image4": bytes("image6 bytes jkl", encoding='UTF-8'), - "image5": bytes("image6 bytes mno", encoding='UTF-8')} - ] - writer = FileWriter(mindrecord_file_name) - schema = {"file_name": {"type": "string"}, - "image1": {"type": "bytes"}, - "image2": {"type": "bytes"}, - "image3": {"type": "bytes"}, - "label": {"type": "int32"}, - "image4": {"type": "bytes"}, - "image5": {"type": "bytes"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() + try: + data = [{"file_name": "001.jpg", "label": 43, + "image1": bytes("image1 bytes abc", encoding='UTF-8'), + "image2": bytes("image1 bytes def", encoding='UTF-8'), + "image3": bytes("image1 bytes ghi", encoding='UTF-8'), + "image4": bytes("image1 bytes jkl", encoding='UTF-8'), + "image5": bytes("image1 bytes mno", encoding='UTF-8')}, + {"file_name": "002.jpg", "label": 91, + "image1": bytes("image2 bytes abc", encoding='UTF-8'), + "image2": bytes("image2 bytes def", encoding='UTF-8'), + "image3": bytes("image2 bytes ghi", encoding='UTF-8'), + "image4": bytes("image2 bytes jkl", encoding='UTF-8'), + "image5": bytes("image2 bytes mno", encoding='UTF-8')}, + {"file_name": "003.jpg", "label": 61, + "image1": bytes("image3 bytes abc", encoding='UTF-8'), + "image2": bytes("image3 bytes def", encoding='UTF-8'), + "image3": bytes("image3 bytes ghi", encoding='UTF-8'), + "image4": bytes("image3 bytes jkl", encoding='UTF-8'), + "image5": bytes("image3 bytes mno", encoding='UTF-8')}, + {"file_name": "004.jpg", "label": 29, + "image1": bytes("image4 bytes abc", encoding='UTF-8'), + "image2": bytes("image4 bytes def", encoding='UTF-8'), + "image3": bytes("image4 bytes ghi", encoding='UTF-8'), + "image4": bytes("image4 bytes jkl", encoding='UTF-8'), + "image5": bytes("image4 bytes mno", encoding='UTF-8')}, + {"file_name": "005.jpg", "label": 78, + "image1": bytes("image5 bytes abc", encoding='UTF-8'), + "image2": bytes("image5 bytes def", encoding='UTF-8'), + "image3": bytes("image5 bytes ghi", encoding='UTF-8'), + "image4": bytes("image5 bytes jkl", encoding='UTF-8'), + "image5": bytes("image5 bytes mno", encoding='UTF-8')}, + {"file_name": "006.jpg", "label": 37, + "image1": bytes("image6 bytes abc", encoding='UTF-8'), + "image2": bytes("image6 bytes def", encoding='UTF-8'), + "image3": bytes("image6 bytes ghi", encoding='UTF-8'), + "image4": bytes("image6 bytes jkl", encoding='UTF-8'), + "image5": bytes("image6 bytes mno", encoding='UTF-8')} + ] + writer = FileWriter(mindrecord_file_name) + schema = {"file_name": {"type": "string"}, + "image1": {"type": "bytes"}, + "image2": {"type": "bytes"}, + "image3": {"type": "bytes"}, + "label": {"type": "int32"}, + "image4": {"type": "bytes"}, + "image5": {"type": "bytes"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() - # change data value to list - data_value_to_list = [] - for item in data: - new_data = {} - new_data['file_name'] = np.asarray(item["file_name"], dtype='S') - new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) - 
new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) - new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) - new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) - new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) - new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) - data_value_to_list.append(new_data) + # change data value to list + data_value_to_list = [] + for item in data: + new_data = {} + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') + new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) + new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) + new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) + new_data['image3'] = np.asarray(list(item["image3"]), dtype=np.uint8) + new_data['image4'] = np.asarray(list(item["image4"]), dtype=np.uint8) + new_data['image5'] = np.asarray(list(item["image5"]), dtype=np.uint8) + data_value_to_list.append(new_data) - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 7 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 7 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image1", "image2", "image5"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image1", "image2", "image5"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image2", "image4"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - 
data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image2", "image4"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image5", "image2"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image5", "image2"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image5", "image2", "label"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["image4", "image5", - "image2", "image3", "file_name"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 5 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 - - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image5", "image2", "label"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + num_readers = 
2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["image4", "image5", + "image2", "image3", "file_name"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 5 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) def test_write_with_multi_array_and_MindDataset(): mindrecord_file_name = "test.mindrecord" - data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], 
dtype=np.int64), - "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, - {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), - "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64), - "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), - "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), - "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), - "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), - "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} - ] - writer = FileWriter(mindrecord_file_name) - schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, - "source_sos_mask": {"type": "int64", "shape": [-1]}, - "source_eos_ids": {"type": "int64", "shape": [-1]}, - "source_eos_mask": {"type": "int64", "shape": [-1]}, - "target_sos_ids": {"type": "int64", "shape": [-1]}, - "target_sos_mask": {"type": "int64", "shape": [-1]}, - "target_eos_ids": {"type": "int64", "shape": [-1]}, - "target_eos_mask": {"type": "int64", "shape": [-1]}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() + try: + data = [{"source_sos_ids": np.array([1, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([13, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([28, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([33, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([39, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([48, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([11, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([16, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([113, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([119, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([128, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([133, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([139, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([148, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([21, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([26, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([213, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([219, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([228, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([233, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([239, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([248, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([31, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([36, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([313, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([319, 20, 21, 22, 23, 24, 25, 26, 
27], dtype=np.int64), + "target_sos_ids": np.array([328, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([333, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([339, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([348, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([41, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([46, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([413, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([419, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([428, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([433, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([439, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([448, 49, 50, 51], dtype=np.int64)}, + {"source_sos_ids": np.array([51, 2, 3, 4, 5], dtype=np.int64), + "source_sos_mask": np.array([56, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "source_eos_ids": np.array([513, 14, 15, 16, 17, 18], dtype=np.int64), + "source_eos_mask": np.array([519, 20, 21, 22, 23, 24, 25, 26, 27], dtype=np.int64), + "target_sos_ids": np.array([528, 29, 30, 31, 32], dtype=np.int64), + "target_sos_mask": np.array([533, 34, 35, 36, 37, 38], dtype=np.int64), + "target_eos_ids": np.array([539, 40, 41, 42, 43, 44, 45, 46, 47], dtype=np.int64), + "target_eos_mask": np.array([548, 49, 50, 51], dtype=np.int64)} + ] + writer = FileWriter(mindrecord_file_name) + schema = {"source_sos_ids": {"type": "int64", "shape": [-1]}, + "source_sos_mask": {"type": "int64", "shape": [-1]}, + "source_eos_ids": {"type": "int64", "shape": [-1]}, + "source_eos_mask": {"type": "int64", "shape": [-1]}, + "target_sos_ids": {"type": "int64", "shape": [-1]}, + "target_sos_mask": {"type": "int64", "shape": [-1]}, + "target_eos_ids": {"type": "int64", "shape": [-1]}, + "target_eos_mask": {"type": "int64", "shape": [-1]}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() - # change data value to list - do none - data_value_to_list = [] - for item in data: - new_data = {} - new_data['source_sos_ids'] = item["source_sos_ids"] - new_data['source_sos_mask'] = item["source_sos_mask"] - new_data['source_eos_ids'] = item["source_eos_ids"] - new_data['source_eos_mask'] = item["source_eos_mask"] - new_data['target_sos_ids'] = item["target_sos_ids"] - new_data['target_sos_mask'] = item["target_sos_mask"] - new_data['target_eos_ids'] = item["target_eos_ids"] - new_data['target_eos_mask'] = item["target_eos_mask"] - data_value_to_list.append(new_data) + # change data value to list - do none + data_value_to_list = [] + for item in data: + new_data = {} + new_data['source_sos_ids'] = item["source_sos_ids"] + new_data['source_sos_mask'] = item["source_sos_mask"] + new_data['source_eos_ids'] = item["source_eos_ids"] + new_data['source_eos_mask'] = item["source_eos_mask"] + new_data['target_sos_ids'] = item["target_sos_ids"] + new_data['target_sos_mask'] = item["target_sos_mask"] + new_data['target_eos_ids'] = item["target_eos_ids"] + new_data['target_eos_mask'] = item["target_eos_mask"] + data_value_to_list.append(new_data) - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if 
isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 6 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_eos_ids", "source_eos_mask", + "target_sos_ids", "target_sos_mask", + "target_eos_ids", "target_eos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 6 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["source_sos_ids", - "target_sos_ids", - "target_eos_mask"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["source_sos_ids", + "target_sos_ids", + "target_eos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", - "source_eos_mask", - "source_sos_mask"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 3 - for field in item: - if isinstance(item[field], np.ndarray): - assert 
(item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", + "source_eos_mask", + "source_sos_mask"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 3 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 1 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 1 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 - num_readers = 1 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["target_eos_mask", "target_eos_ids", - "target_sos_mask", "target_sos_ids", - "source_eos_mask", "source_eos_ids", - "source_sos_mask", "source_sos_ids"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 6 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if isinstance(item[field], np.ndarray): - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 6 + num_readers = 1 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["target_eos_mask", "target_eos_ids", + "target_sos_mask", "target_sos_ids", + "source_eos_mask", "source_eos_ids", + "source_sos_mask", "source_sos_ids"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 6 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): + assert (item[field] == + data_value_to_list[num_iter][field]).all() + else: + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 6 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + + +def test_numpy_generic(): + paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) + for x in range(FILES_NUM)] + try: + for x in 
paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + cv_schema_json = {"label1": {"type": "int32"}, "label2": {"type": "int64"}, + "label3": {"type": "float32"}, "label4": {"type": "float64"}} + data = [] + for idx in range(10): + row = {} + row['label1'] = np.int32(idx) + row['label2'] = np.int64(idx*10) + row['label3'] = np.float32(idx+0.12345) + row['label4'] = np.float64(idx+0.12345789) + data.append(row) + writer.add_schema(cv_schema_json, "img_schema") + writer.write_raw_data(data) + writer.commit() + + num_readers = 4 + data_set = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers, shuffle=False) + assert data_set.get_dataset_size() == 10 + idx = 0 + for item in data_set.create_dict_iterator(): + assert item['label1'] == item['label1'] + assert item['label2'] == item['label2'] + assert item['label3'] == item['label3'] + assert item['label4'] == item['label4'] + idx += 1 + assert idx == 10 + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) def test_write_with_float32_float64_float32_array_float64_array_and_MindDataset(): mindrecord_file_name = "test.mindrecord" - data = [{"float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12345, - "float64": 1987654321.123456785, - "int32_array": np.array([1, 2, 3, 4, 5], dtype=np.int32), - "int64_array": np.array([48, 49, 50, 51, 123414314, 87], dtype=np.int64), - "int32": 3456, - "int64": 947654321123}, - {"float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12445, - "float64": 1987654321.123456786, - "int32_array": np.array([11, 21, 31, 41, 51], dtype=np.int32), - "int64_array": np.array([481, 491, 501, 511, 1234143141, 871], dtype=np.int64), - "int32": 3466, - "int64": 957654321123}, - {"float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12545, - "float64": 1987654321.123456787, - "int32_array": np.array([12, 22, 32, 42, 52], dtype=np.int32), - "int64_array": np.array([482, 492, 502, 512, 1234143142, 872], dtype=np.int64), - "int32": 3476, - "int64": 967654321123}, - {"float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12645, - "float64": 1987654321.123456788, - "int32_array": np.array([13, 23, 33, 43, 53], dtype=np.int32), - "int64_array": np.array([483, 493, 503, 513, 1234143143, 873], dtype=np.int64), - "int32": 3486, - "int64": 977654321123}, - {"float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), - "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 
51.8971298471, - 123414314.2141243, 87.1212122], dtype=np.float64), - "float32": 3456.12745, - "float64": 1987654321.123456789, - "int32_array": np.array([14, 24, 34, 44, 54], dtype=np.int32), - "int64_array": np.array([484, 494, 504, 514, 1234143144, 874], dtype=np.int64), - "int32": 3496, - "int64": 987654321123}, - ] - writer = FileWriter(mindrecord_file_name) - schema = {"float32_array": {"type": "float32", "shape": [-1]}, - "float64_array": {"type": "float64", "shape": [-1]}, - "float32": {"type": "float32"}, - "float64": {"type": "float64"}, - "int32_array": {"type": "int32", "shape": [-1]}, - "int64_array": {"type": "int64", "shape": [-1]}, - "int32": {"type": "int32"}, - "int64": {"type": "int64"}} - writer.add_schema(schema, "data is so cool") - writer.write_raw_data(data) - writer.commit() + try: + data = [{"float32_array": np.array([1.2, 2.78, 3.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 50.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12345, + "float64": 1987654321.123456785, + "int32_array": np.array([1, 2, 3, 4, 5], dtype=np.int32), + "int64_array": np.array([48, 49, 50, 51, 123414314, 87], dtype=np.int64), + "int32": 3456, + "int64": 947654321123}, + {"float32_array": np.array([1.2, 2.78, 4.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 60.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12445, + "float64": 1987654321.123456786, + "int32_array": np.array([11, 21, 31, 41, 51], dtype=np.int32), + "int64_array": np.array([481, 491, 501, 511, 1234143141, 871], dtype=np.int64), + "int32": 3466, + "int64": 957654321123}, + {"float32_array": np.array([1.2, 2.78, 5.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 70.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12545, + "float64": 1987654321.123456787, + "int32_array": np.array([12, 22, 32, 42, 52], dtype=np.int32), + "int64_array": np.array([482, 492, 502, 512, 1234143142, 872], dtype=np.int64), + "int32": 3476, + "int64": 967654321123}, + {"float32_array": np.array([1.2, 2.78, 6.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 80.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12645, + "float64": 1987654321.123456788, + "int32_array": np.array([13, 23, 33, 43, 53], dtype=np.int32), + "int64_array": np.array([483, 493, 503, 513, 1234143143, 873], dtype=np.int64), + "int32": 3486, + "int64": 977654321123}, + {"float32_array": np.array([1.2, 2.78, 7.1234, 4.9871, 5.12341], dtype=np.float32), + "float64_array": np.array([48.1234556789, 49.3251241431, 90.13514312414, 51.8971298471, + 123414314.2141243, 87.1212122], dtype=np.float64), + "float32": 3456.12745, + "float64": 1987654321.123456789, + "int32_array": np.array([14, 24, 34, 44, 54], dtype=np.int32), + "int64_array": np.array([484, 494, 504, 514, 1234143144, 874], dtype=np.int64), + "int32": 3496, + "int64": 987654321123}, + ] + writer = FileWriter(mindrecord_file_name) + schema = {"float32_array": {"type": "float32", "shape": [-1]}, + "float64_array": {"type": "float64", "shape": [-1]}, + "float32": {"type": "float32"}, + "float64": {"type": "float64"}, + "int32_array": {"type": "int32", "shape": [-1]}, + "int64_array": {"type": "int64", "shape": [-1]}, + 
"int32": {"type": "int32"}, + "int64": {"type": "int64"}} + writer.add_schema(schema, "data is so cool") + writer.write_raw_data(data) + writer.commit() - # change data value to list - do none - data_value_to_list = [] - for item in data: - new_data = {} - new_data['float32_array'] = item["float32_array"] - new_data['float64_array'] = item["float64_array"] - new_data['float32'] = item["float32"] - new_data['float64'] = item["float64"] - new_data['int32_array'] = item["int32_array"] - new_data['int64_array'] = item["int64_array"] - new_data['int32'] = item["int32"] - new_data['int64'] = item["int64"] - data_value_to_list.append(new_data) + # change data value to list - do none + data_value_to_list = [] + for item in data: + new_data = {} + new_data['float32_array'] = item["float32_array"] + new_data['float64_array'] = item["float64_array"] + new_data['float32'] = item["float32"] + new_data['float64'] = item["float64"] + new_data['int32_array'] = item["int32_array"] + new_data['int64_array'] = item["int64_array"] + new_data['int32'] = item["int32"] + new_data['int64'] = item["int64"] + data_value_to_list.append(new_data) - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 8 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: - assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 8 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() else: - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["float32", "int32"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: - assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["float32", "int32"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() else: - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - 
assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 - num_readers = 2 - data_set = ds.MindDataset(dataset_file=mindrecord_file_name, - columns_list=["float64", "int64"], - num_parallel_workers=num_readers, - shuffle=False) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 2 - for field in item: - if isinstance(item[field], np.ndarray): - if item[field].dtype == np.float32: - assert (item[field] == - np.array(data_value_to_list[num_iter][field], np.float32)).all() - elif item[field].dtype == np.float64: - assert math.isclose(item[field], - np.array(data_value_to_list[num_iter][field], np.float64), rel_tol=1e-14) + num_readers = 2 + data_set = ds.MindDataset(dataset_file=mindrecord_file_name, + columns_list=["float64", "int64"], + num_parallel_workers=num_readers, + shuffle=False) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 2 + for field in item: + if isinstance(item[field], np.ndarray): + if item[field].dtype == np.float32: + assert (item[field] == + np.array(data_value_to_list[num_iter][field], np.float32)).all() + elif item[field].dtype == np.float64: + assert math.isclose(item[field], + np.array(data_value_to_list[num_iter][field], np.float64), + rel_tol=1e-14) + else: + assert (item[field] == + data_value_to_list[num_iter][field]).all() else: - assert (item[field] == - data_value_to_list[num_iter][field]).all() - else: - assert item[field] == data_value_to_list[num_iter][field] - num_iter += 1 - assert num_iter == 5 + assert item[field] == data_value_to_list[num_iter][field] + num_iter += 1 + assert num_iter == 5 + except Exception as error: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) + raise error + else: + os.remove("{}".format(mindrecord_file_name)) + os.remove("{}.db".format(mindrecord_file_name)) - os.remove("{}".format(mindrecord_file_name)) - os.remove("{}.db".format(mindrecord_file_name)) - -def test_numpy_generic(): - - paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) - for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): - os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): - os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - cv_schema_json = {"label1": {"type": "int32"}, "label2": {"type": "int64"}, - "label3": {"type": "float32"}, "label4": {"type": "float64"}} - data = [] - for idx in range(10): - row = {} - row['label1'] = np.int32(idx) - row['label2'] = np.int64(idx*10) - row['label3'] = np.float32(idx+0.12345) - row['label4'] = np.float64(idx+0.12345789) - data.append(row) - writer.add_schema(cv_schema_json, "img_schema") - writer.write_raw_data(data) - writer.commit() - - num_readers = 4 - data_set = ds.MindDataset(CV_FILE_NAME + "0", None, num_readers, shuffle=False) - assert data_set.get_dataset_size() == 10 - idx = 0 - for item in data_set.create_dict_iterator(): - assert item['label1'] == item['label1'] - assert item['label2'] == item['label2'] - assert item['label3'] == item['label3'] - assert item['label4'] == item['label4'] - idx += 1 - assert idx == 10 - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) +if __name__ == '__main__': + test_nlp_compress_data(add_and_remove_nlp_compress_file) + 
test_nlp_compress_data_old_version(add_and_remove_nlp_compress_file)
+    test_cv_minddataset_writer_tutorial()
+    test_cv_minddataset_partition_tutorial(add_and_remove_cv_file)
+    test_cv_minddataset_partition_num_samples_0(add_and_remove_cv_file)
+    test_cv_minddataset_partition_num_samples_1(add_and_remove_cv_file)
+    test_cv_minddataset_partition_num_samples_2(add_and_remove_cv_file)
+    test_cv_minddataset_partition_tutorial_check_shuffle_result(add_and_remove_cv_file)
+    test_cv_minddataset_partition_tutorial_check_whole_reshuffle_result_per_epoch(add_and_remove_cv_file)
+    test_cv_minddataset_check_shuffle_result(add_and_remove_cv_file)
+    test_cv_minddataset_dataset_size(add_and_remove_cv_file)
+    test_cv_minddataset_repeat_reshuffle(add_and_remove_cv_file)
+    test_cv_minddataset_batch_size_larger_than_records(add_and_remove_cv_file)
+    test_cv_minddataset_issue_888(add_and_remove_cv_file)
+    test_cv_minddataset_blockreader_tutorial(add_and_remove_cv_file)
+    test_cv_minddataset_blockreader_some_field_not_in_index_tutorial(add_and_remove_cv_file)
+    test_cv_minddataset_reader_file_list(add_and_remove_cv_file)
+    test_cv_minddataset_reader_one_partition(add_and_remove_cv_file)
+    test_cv_minddataset_reader_two_dataset(add_and_remove_cv_file)
+    test_cv_minddataset_reader_two_dataset_partition(add_and_remove_cv_file)
+    test_cv_minddataset_reader_basic_tutorial(add_and_remove_cv_file)
+    test_nlp_minddataset_reader_basic_tutorial(add_and_remove_cv_file)
+    test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file)
+    test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file)
+    test_cv_minddataset_reader_no_columns(add_and_remove_cv_file)
+    test_cv_minddataset_reader_repeat_tutorial(add_and_remove_cv_file)
+    test_write_with_multi_bytes_and_array_and_read_by_MindDataset()
+    test_write_with_multi_bytes_and_MindDataset()
+    test_write_with_multi_array_and_MindDataset()
+    test_numpy_generic()
+    test_write_with_float32_float64_float32_array_float64_array_and_MindDataset()
diff --git a/tests/ut/python/dataset/test_minddataset_exception.py b/tests/ut/python/dataset/test_minddataset_exception.py
index 619dff1962..51621750c8 100644
--- a/tests/ut/python/dataset/test_minddataset_exception.py
+++ b/tests/ut/python/dataset/test_minddataset_exception.py
@@ -99,8 +99,13 @@ def test_invalid_mindrecord():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert num_iter == 0
-    os.remove('dummy.mindrecord')
+    try:
+        assert num_iter == 0
+    except Exception as error:
+        os.remove('dummy.mindrecord')
+        raise error
+    else:
+        os.remove('dummy.mindrecord')


 def test_minddataset_lack_db():
@@ -113,8 +118,13 @@ def test_minddataset_lack_db():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert num_iter == 0
-    os.remove(CV_FILE_NAME)
+    try:
+        assert num_iter == 0
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)


 def test_cv_minddataset_pk_sample_error_class_column():
@@ -189,10 +199,16 @@ def test_minddataset_invalidate_num_shards():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))


 def test_minddataset_invalidate_shard_id():
     create_cv_mindrecord(1)
@@ -203,9 +219,15 @@ def test_minddataset_invalidate_shard_id():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
-    os.remove(CV_FILE_NAME)
-    os.remove("{}.db".format(CV_FILE_NAME))
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 0).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
+    else:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))


 def test_minddataset_shard_id_bigger_than_num_shard():
@@ -217,17 +239,28 @@ def test_minddataset_shard_id_bigger_than_num_shard():
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 1).' in str(error_info.value)
+    except Exception as error:
+        os.remove(CV_FILE_NAME)
+        os.remove("{}.db".format(CV_FILE_NAME))
+        raise error
     with pytest.raises(Exception) as error_info:
         data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers, True, 2, 5)
         num_iter = 0
         for _ in data_set.create_dict_iterator():
             num_iter += 1
-    assert 'Input shard_id is not within the required interval of (0 to 1).' 
+    try:
+        assert 'Input shard_id is not within the required interval of (0 to 1).' 
in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) def test_cv_minddataset_partition_num_samples_equals_0(): """tutorial for cv minddataset.""" @@ -245,7 +278,26 @@ def test_cv_minddataset_partition_num_samples_equals_0(): num_iter += 1 with pytest.raises(Exception) as error_info: partitions(5) - assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value) + try: + assert 'num_samples should be a positive integer value, but got num_samples=0' in str(error_info.value) + except Exception as error: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) + raise error + else: + os.remove(CV_FILE_NAME) + os.remove("{}.db".format(CV_FILE_NAME)) - os.remove(CV_FILE_NAME) - os.remove("{}.db".format(CV_FILE_NAME)) +if __name__ == '__main__': + test_cv_lack_json() + test_cv_lack_mindrecord() + test_invalid_mindrecord() + test_minddataset_lack_db() + test_cv_minddataset_pk_sample_error_class_column() + test_cv_minddataset_pk_sample_exclusive_shuffle() + test_cv_minddataset_reader_different_schema() + test_cv_minddataset_reader_different_page_size() + test_minddataset_invalidate_num_shards() + test_minddataset_invalidate_shard_id() + test_minddataset_shard_id_bigger_than_num_shard() + test_cv_minddataset_partition_num_samples_equals_0() diff --git a/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py b/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py index c9c9388e65..5ef3a7adcb 100644 --- a/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py +++ b/tests/ut/python/dataset/test_minddataset_multi_images_and_ndarray.py @@ -27,54 +27,64 @@ CV_FILE_NAME = "./complex.mindrecord" def test_cv_minddataset_reader_multi_image_and_ndarray_tutorial(): - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - cv_schema_json = {"id": {"type": "int32"}, - "image_0": {"type": "bytes"}, - "image_2": {"type": "bytes"}, - "image_3": {"type": "bytes"}, - "image_4": {"type": "bytes"}, - "input_mask": {"type": "int32", "shape": [-1]}, - "segments": {"type": "float32", "shape": [2, 3]}} - writer.add_schema(cv_schema_json, "two_images_schema") - with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader: - img_data = file_reader.read() - ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32) - ndarray_2 = np.array(([2, 3, 1], [7, 9, 0]), np.float32) - data = [] - for i in range(5): - item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data, - "input_mask": ndarray_1, "segments": ndarray_2} - data.append(item) - writer.write_raw_data(data) - writer.commit() - assert os.path.exists(CV_FILE_NAME) - assert os.path.exists(CV_FILE_NAME + ".db") + try: + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + cv_schema_json = {"id": {"type": "int32"}, + "image_0": {"type": "bytes"}, + "image_2": {"type": "bytes"}, + "image_3": {"type": "bytes"}, + "image_4": {"type": "bytes"}, + "input_mask": {"type": "int32", "shape": [-1]}, + "segments": {"type": "float32", "shape": [2, 3]}} + writer.add_schema(cv_schema_json, "two_images_schema") + with open("../data/mindrecord/testImageNetData/images/image_00010.jpg", "rb") as file_reader: + img_data = file_reader.read() + ndarray_1 = np.array([1, 2, 3, 4, 5], np.int32) + ndarray_2 = np.array(([2, 3, 1], [7, 9, 
0]), np.float32) + data = [] + for i in range(5): + item = {"id": i, "image_0": img_data, "image_2": img_data, "image_3": img_data, "image_4": img_data, + "input_mask": ndarray_1, "segments": ndarray_2} + data.append(item) + writer.write_raw_data(data) + writer.commit() + assert os.path.exists(CV_FILE_NAME) + assert os.path.exists(CV_FILE_NAME + ".db") - # tutorial for minderdataset. - columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"] - num_readers = 1 - data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers) - assert data_set.get_dataset_size() == 5 - num_iter = 0 - for item in data_set.create_dict_iterator(): - assert len(item) == 7 - logger.info("item: {}".format(item)) - assert item["image_0"].dtype == np.uint8 - assert (item["image_0"] == item["image_2"]).all() - assert (item["image_3"] == item["image_4"]).all() - assert (item["image_0"] == item["image_4"]).all() - assert item["image_2"].dtype == np.uint8 - assert item["image_3"].dtype == np.uint8 - assert item["image_4"].dtype == np.uint8 - assert item["id"].dtype == np.int32 - assert item["input_mask"].shape == (5,) - assert item["input_mask"].dtype == np.int32 - assert item["segments"].shape == (2, 3) - assert item["segments"].dtype == np.float32 - num_iter += 1 - assert num_iter == 5 + # tutorial for minderdataset. + columns_list = ["id", "image_0", "image_2", "image_3", "image_4", "input_mask", "segments"] + num_readers = 1 + data_set = ds.MindDataset(CV_FILE_NAME, columns_list, num_readers) + assert data_set.get_dataset_size() == 5 + num_iter = 0 + for item in data_set.create_dict_iterator(): + assert len(item) == 7 + logger.info("item: {}".format(item)) + assert item["image_0"].dtype == np.uint8 + assert (item["image_0"] == item["image_2"]).all() + assert (item["image_3"] == item["image_4"]).all() + assert (item["image_0"] == item["image_4"]).all() + assert item["image_2"].dtype == np.uint8 + assert item["image_3"].dtype == np.uint8 + assert item["image_4"].dtype == np.uint8 + assert item["id"].dtype == np.int32 + assert item["input_mask"].shape == (5,) + assert item["input_mask"].dtype == np.int32 + assert item["segments"].shape == (2, 3) + assert item["segments"].dtype == np.float32 + num_iter += 1 + assert num_iter == 5 + except Exception as error: + if os.path.exists("{}".format(CV_FILE_NAME + ".db")): + os.remove(CV_FILE_NAME + ".db") + if os.path.exists("{}".format(CV_FILE_NAME)): + os.remove(CV_FILE_NAME) + raise error + else: + if os.path.exists("{}".format(CV_FILE_NAME + ".db")): + os.remove(CV_FILE_NAME + ".db") + if os.path.exists("{}".format(CV_FILE_NAME)): + os.remove(CV_FILE_NAME) - if os.path.exists("{}".format(CV_FILE_NAME + ".db")): - os.remove(CV_FILE_NAME + ".db") - if os.path.exists("{}".format(CV_FILE_NAME)): - os.remove(CV_FILE_NAME) +if __name__ == '__main__': + test_cv_minddataset_reader_multi_image_and_ndarray_tutorial() diff --git a/tests/ut/python/dataset/test_minddataset_padded.py b/tests/ut/python/dataset/test_minddataset_padded.py index c0724e3236..a05879ab01 100644 --- a/tests/ut/python/dataset/test_minddataset_padded.py +++ b/tests/ut/python/dataset/test_minddataset_padded.py @@ -44,24 +44,31 @@ def add_and_remove_cv_file(): """add/remove cv file""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None - os.remove("{}.db".format(x)) if os.path.exists( - "{}.db".format(x)) else None - writer = FileWriter(CV_FILE_NAME, 
FILES_NUM) - data = get_data(CV_DIR_NAME) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_cv_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) + try: + for x in paths: + os.remove("{}".format(x)) if os.path.exists("{}".format(x)) else None + os.remove("{}.db".format(x)) if os.path.exists( + "{}.db".format(x)) else None + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_cv_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) @pytest.fixture @@ -69,32 +76,39 @@ def add_and_remove_nlp_file(): """add/remove nlp file""" paths = ["{}{}".format(NLP_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(NLP_FILE_NAME, FILES_NUM) + data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] + nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, + "rating": {"type": "float32"}, + "input_ids": {"type": "int64", + "shape": [-1]}, + "input_mask": {"type": "int64", + "shape": [1, -1]}, + "segment_ids": {"type": "int64", + "shape": [2, -1]} + } + writer.set_header_size(1 << 14) + writer.set_page_size(1 << 15) + writer.add_schema(nlp_schema_json, "nlp_schema") + writer.add_index(["id", "rating"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_nlp_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(NLP_FILE_NAME, FILES_NUM) - data = [x for x in get_nlp_data(NLP_FILE_POS, NLP_FILE_VOCAB, 10)] - nlp_schema_json = {"id": {"type": "string"}, "label": {"type": "int32"}, - "rating": {"type": "float32"}, - "input_ids": {"type": "int64", - "shape": [-1]}, - "input_mask": {"type": "int64", - "shape": [1, -1]}, - "segment_ids": {"type": "int64", - "shape": [2, -1]} - } - writer.set_header_size(1 << 14) - writer.set_page_size(1 << 15) - writer.add_schema(nlp_schema_json, "nlp_schema") - writer.add_index(["id", "rating"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_nlp_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -119,7 +133,7 @@ def test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file): encoding='utf8') assert item['label'] == padded_sample['label'] assert (item['data'] == np.array(list(padded_sample['data']))).all() - num_iter += 1 + num_iter += 1 assert num_padded_iter == 5 assert 
num_iter == 15 @@ -636,3 +650,17 @@ def inputs(vectors, maxlen=50): mask = [1] * length + [0] * (maxlen - length) segment = [0] * maxlen return input_, mask, segment + +if __name__ == '__main__': + test_cv_minddataset_reader_basic_padded_samples(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_multi_epoch(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_dividsible(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_dataset_size_no_divisible(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_equal_column_list(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_column_list(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_num_padded(add_and_remove_cv_file) + test_cv_minddataset_partition_padded_samples_no_padded_samples(add_and_remove_cv_file) + test_nlp_minddataset_reader_basic_padded_samples(add_and_remove_nlp_file) + test_nlp_minddataset_reader_basic_padded_samples_multi_epoch(add_and_remove_nlp_file) + test_nlp_minddataset_reader_basic_padded_samples_check_whole_reshuffle_result_per_epoch(add_and_remove_nlp_file) diff --git a/tests/ut/python/dataset/test_minddataset_sampler.py b/tests/ut/python/dataset/test_minddataset_sampler.py index 8d099f1af2..9c110c0e1f 100644 --- a/tests/ut/python/dataset/test_minddataset_sampler.py +++ b/tests/ut/python/dataset/test_minddataset_sampler.py @@ -34,26 +34,32 @@ def add_and_remove_cv_file(): """add/remove cv file""" paths = ["{}{}".format(CV_FILE_NAME, str(x).rjust(1, '0')) for x in range(FILES_NUM)] - for x in paths: - if os.path.exists("{}".format(x)): + try: + for x in paths: + if os.path.exists("{}".format(x)): + os.remove("{}".format(x)) + if os.path.exists("{}.db".format(x)): + os.remove("{}.db".format(x)) + writer = FileWriter(CV_FILE_NAME, FILES_NUM) + data = get_data(CV_DIR_NAME, True) + cv_schema_json = {"id": {"type": "int32"}, + "file_name": {"type": "string"}, + "label": {"type": "int32"}, + "data": {"type": "bytes"}} + writer.add_schema(cv_schema_json, "img_schema") + writer.add_index(["file_name", "label"]) + writer.write_raw_data(data) + writer.commit() + yield "yield_cv_data" + except Exception as error: + for x in paths: + os.remove("{}".format(x)) + os.remove("{}.db".format(x)) + raise error + else: + for x in paths: os.remove("{}".format(x)) - if os.path.exists("{}.db".format(x)): os.remove("{}.db".format(x)) - writer = FileWriter(CV_FILE_NAME, FILES_NUM) - data = get_data(CV_DIR_NAME, True) - cv_schema_json = {"id": {"type": "int32"}, - "file_name": {"type": "string"}, - "label": {"type": "int32"}, - "data": {"type": "bytes"}} - writer.add_schema(cv_schema_json, "img_schema") - writer.add_index(["file_name", "label"]) - writer.write_raw_data(data) - writer.commit() - yield "yield_cv_data" - for x in paths: - os.remove("{}".format(x)) - os.remove("{}.db".format(x)) - def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): """tutorial for cv minderdataset.""" @@ -626,3 +632,24 @@ def get_data(dir_name, sampler=False): except FileNotFoundError: continue return data_list + +if __name__ == '__main__': + test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file) + test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file) + 
test_cv_minddataset_subset_random_sample_basic(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_replica(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_empty(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_out_of_range(add_and_remove_cv_file)
+    test_cv_minddataset_subset_random_sample_negative(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_repeat(add_and_remove_cv_file)
+    test_cv_minddataset_random_sampler_replacement(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_basic(add_and_remove_cv_file)
+    test_cv_minddataset_sequential_sampler_exceed_size(add_and_remove_cv_file)
+    test_cv_minddataset_split_basic(add_and_remove_cv_file)
+    test_cv_minddataset_split_exact_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_fuzzy_percent(add_and_remove_cv_file)
+    test_cv_minddataset_split_deterministic(add_and_remove_cv_file)
+    test_cv_minddataset_split_sharding(add_and_remove_cv_file)
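
Editor's note (not part of the patch): every hunk above duplicates the same cleanup calls in both the `except` and the `else` branch. The sketch below shows an equivalent way to guarantee that the mindrecord files are removed on success and on failure, using a single `finally` block inside a yield fixture. It is illustrative only; `DEMO_FILE_NAME`, `DEMO_FILES_NUM`, the one-field schema, and the helper `remove_mindrecord_files` are assumed placeholder names rather than helpers from these test modules, while `FileWriter` and the pytest yield-fixture pattern are used exactly as in the tests above.

```python
# Illustrative sketch only, not part of this patch.
# Same guarantee as the try/except/else rewrite above: the mindrecord shard
# files and their .db indexes are removed whether the test body passes or
# fails, but the cleanup code appears once, in `finally`.
# DEMO_FILE_NAME, DEMO_FILES_NUM, the one-field schema and the helper name
# are made-up placeholders; requires MindSpore to be installed.
import os

import pytest
from mindspore.mindrecord import FileWriter

DEMO_FILE_NAME = "./cleanup_demo.mindrecord"   # placeholder, not a repo file name
DEMO_FILES_NUM = 4


def remove_mindrecord_files(paths):
    """Best-effort removal of mindrecord shard files and their .db indexes."""
    for path in paths:
        for name in (path, "{}.db".format(path)):
            if os.path.exists(name):
                os.remove(name)


@pytest.fixture
def add_and_remove_demo_file():
    """Write a small mindrecord set, hand it to the test, always clean up."""
    paths = ["{}{}".format(DEMO_FILE_NAME, str(x).rjust(1, '0'))
             for x in range(DEMO_FILES_NUM)]
    try:
        remove_mindrecord_files(paths)      # clear leftovers from earlier runs
        writer = FileWriter(DEMO_FILE_NAME, DEMO_FILES_NUM)
        writer.add_schema({"label": {"type": "int32"}}, "demo_schema")
        writer.write_raw_data([{"label": i} for i in range(10)])
        writer.commit()
        yield "yield_demo_data"
    finally:
        remove_mindrecord_files(paths)      # runs on success and on failure
```

Because nothing is caught, pytest still reports the original failure, so the explicit `except ... raise error` pairs added throughout the diff would become unnecessary; that is the usual reason `try/finally` is preferred for cleanup-only logic.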