From 31fed1a2f6c70e0d44a2893ca7f05a0da40d3f70 Mon Sep 17 00:00:00 2001
From: Xiao Tianci
Date: Thu, 24 Dec 2020 16:33:29 +0800
Subject: [PATCH] change code to import APIs from mindspore.dataset rather than mindspore.dataset.engine

---
 .../official/cv/centerface/src/dataset.py | 4 +-
 .../cv/cnn_direction_model/src/dataset.py | 41 +++---
 model_zoo/official/cv/crnn/src/dataset.py | 12 +-
 .../official/cv/inceptionv3/src/dataset.py | 22 ++--
 .../official/cv/mobilenetv1/src/dataset.py | 34 ++---
 .../official/cv/mobilenetv2/src/dataset.py | 32 ++---
 .../cv/mobilenetv2_quant/src/dataset.py | 51 ++++----
 .../official/cv/mobilenetv3/src/dataset.py | 22 ++--
 model_zoo/official/cv/nasnet/src/dataset.py | 18 +--
 model_zoo/official/cv/psenet/src/dataset.py | 36 +++--
 .../cv/resnet/gpu_resnet_benchmark.py | 30 +++--
 model_zoo/official/cv/resnet/src/dataset.py | 66 +++++-----
 .../official/cv/resnet50_quant/src/dataset.py | 39 +++---
 .../official/cv/resnet_thor/src/dataset.py | 18 +--
 .../official/cv/shufflenetv1/src/dataset.py | 16 +--
 .../official/cv/shufflenetv2/src/dataset.py | 16 +--
 .../official/cv/squeezenet/src/dataset.py | 70 +++++-----
 model_zoo/official/cv/warpctc/src/dataset.py | 16 +--
 model_zoo/official/cv/xception/src/dataset.py | 17 +--
 model_zoo/official/nlp/bert/src/dataset.py | 106 +++++++--------
 .../official/nlp/bert_thor/src/dataset.py | 114 ++++++++--------
 model_zoo/official/nlp/fasttext/eval.py | 14 +-
 .../official/nlp/fasttext/src/dataset.py | 6 +-
 .../official/nlp/fasttext/src/load_dataset.py | 47 +++----
 .../nlp/gnmt_v2/src/dataset/load_dataset.py | 38 +++---
 .../nlp/mass/src/dataset/load_dataset.py | 28 ++--
 .../prophetnet/src/dataset/load_dataset.py | 28 ++--
 .../official/nlp/tinybert/src/dataset.py | 28 ++--
 model_zoo/official/nlp/transformer/eval.py | 36 ++---
 .../official/recommend/deepfm/src/dataset.py | 72 +++++-----
 .../recommend/wide_and_deep/src/datasets.py | 94 +++++++------
 .../train_and_eval_auto_parallel.py | 4 +-
 ...in_and_eval_parameter_server_distribute.py | 4 +-
 .../wide_and_deep_multitable/src/datasets.py | 50 ++++---
 .../research/cv/centernet/src/dataset.py | 71 +++++-----
 model_zoo/research/cv/ghostnet/src/dataset.py | 32 ++---
 .../research/cv/ghostnet_quant/src/dataset.py | 32 ++---
 .../resnet50_adv_pruning/src/pet_dataset.py | 32 ++---
 .../research/cv/squeezenet/src/dataset.py | 70 +++++-----
 .../research/recommend/autodis/src/dataset.py | 72 +++++-----
 .../st/model_zoo_tests/DeepFM/src/dataset.py | 72 +++++-----
 .../transformer/test_transformer.py | 40 +++---
 .../python_file_for_ci/datasets.py | 64 ++++-----
 .../test_bert_tdt_lossscale.py | 38 +++---
 .../bert_performance/test_bert_thor_mlperf.py | 50 +++----
 .../bert_precision/test_bert_tdt_lossscale.py | 35 ++---
 tests/st/networks/models/bert/src/dataset.py | 38 +++---
 .../networks/models/resnet50/src/dataset.py | 19 ++-
 .../models/resnet50/src_thor/dataset.py | 21 ++-
 .../st/quantization/resnet50_quant/dataset.py | 21 ++-
 tests/ut/python/dataset/test_autocontrast.py | 123 +++++++++---------
 tests/ut/python/dataset/test_equalize.py | 50 +++----
 tests/ut/python/dataset/test_invert.py | 46 +++----
 tests/ut/python/dataset/test_random_color.py | 9 +-
 .../python/dataset/test_random_sharpness.py | 27 ++--
 .../python/dataset/test_random_solarize_op.py | 5 +-
 .../ut/python/dataset/test_uniform_augment.py | 20 +--
 57 files changed, 1135 insertions(+), 1081 deletions(-)

diff --git a/model_zoo/official/cv/centerface/src/dataset.py b/model_zoo/official/cv/centerface/src/dataset.py index 9ba0f4619dc..f4b85315ab1
100644 --- a/model_zoo/official/cv/centerface/src/dataset.py +++ b/model_zoo/official/cv/centerface/src/dataset.py @@ -14,7 +14,7 @@ # ============================================================================ """generate dataloader and data processing entry""" -import mindspore.dataset.engine as de +import mindspore.dataset as ds from src.utils import DistributedSampler @@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size, """ centerface_gen = CenterfaceDataset(config=config, split=split) sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train')) # user defined sampling strategy - de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16) + de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16) if group_size > 1: num_parallel_workers = 24 diff --git a/model_zoo/official/cv/cnn_direction_model/src/dataset.py b/model_zoo/official/cv/cnn_direction_model/src/dataset.py index 13644c663d0..f671bf27e49 100644 --- a/model_zoo/official/cv/cnn_direction_model/src/dataset.py +++ b/model_zoo/official/cv/cnn_direction_model/src/dataset.py @@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py """ import os -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \ shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \ @@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noi import cv2 import numpy as np + cv2.setNumThreads(0) image_height = None @@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config): rank_id = int(os.getenv("RANK_ID", '0')) decode = C.Decode() - ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4, - num_shards=rank_size, shard_id=rank_id, shuffle=True) - ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8) + data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4, + num_shards=rank_size, shard_id=rank_id, shuffle=True) + data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8) augmentor = Augmentor(config.augment_severity, config.augment_prob) operation = augmentor.process - ds = ds.map(operations=operation, input_columns=["image"], - num_parallel_workers=1, python_multiprocessing=True) + data_set = data_set.map(operations=operation, input_columns=["image"], + num_parallel_workers=1, python_multiprocessing=True) ##randomly augment half of samples to be negative samples - ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"], - num_parallel_workers=8, python_multiprocessing=True) - ##for training double the dataset to accoun for positive and negative - ds = ds.repeat(2) + data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], + input_columns=["image", "label"], + num_parallel_workers=8, python_multiprocessing=True) + ##for training double the data_set to accoun for positive and negative + data_set = data_set.repeat(2) # apply batch operations - ds = ds.batch(config.batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(config.batch_size, drop_remainder=True) + return data_set def 
resize_image(img, label): @@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config): rank_id = int(os.getenv("RANK_ID", '0')) decode = C.Decode() - ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1, - num_shards=rank_size, shard_id=rank_id, shuffle=False) - ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8) + data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1, + num_shards=rank_size, shard_id=rank_id, shuffle=False) + data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8) global image_height global image_width image_height = config.im_size_h image_width = config.im_size_w - ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums, - python_multiprocessing=False) + data_set = data_set.map(operations=resize_image, input_columns=["image", "label"], + num_parallel_workers=config.work_nums, + python_multiprocessing=False) # apply batch operations - ds = ds.batch(1, drop_remainder=True) + data_set = data_set.batch(1, drop_remainder=True) - return ds + return data_set diff --git a/model_zoo/official/cv/crnn/src/dataset.py b/model_zoo/official/cv/crnn/src/dataset.py index 5415bdf7665..da6d1f6bfaa 100644 --- a/model_zoo/official/cv/crnn/src/dataset.py +++ b/model_zoo/official/cv/crnn/src/dataset.py @@ -16,7 +16,7 @@ import os import numpy as np import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.vision.c_transforms as vc from PIL import Image, ImageFile @@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i dataset = IIIT5KDataset(dataset_path, "annotation.txt", config) else: raise ValueError(f"unsupported dataset name: {name}") - ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) + data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) image_trans = [ vc.Resize((config.image_height, config.image_width)), vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]), @@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i label_trans = [ C.TypeCast(mstype.int32) ] - ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) - ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) + data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) + data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/cv/inceptionv3/src/dataset.py b/model_zoo/official/cv/inceptionv3/src/dataset.py index 860d93f215a..9ffe08373ca 100644 --- a/model_zoo/official/cv/inceptionv3/src/dataset.py +++ b/model_zoo/official/cv/inceptionv3/src/dataset.py @@ -16,7 +16,7 @@ Data operations, will be used in train.py and eval.py """ import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C from src.config 
import config_gpu as cfg @@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): dataset """ if group_size == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, - num_shards=group_size, shard_id=rank) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, + num_shards=group_size, shard_id=rank) # define map operations if do_train: trans = [ C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)), C.RandomHorizontalFlip(prob=0.5), C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4) - ] + ] else: trans = [ C.Decode(), C.Resize(299), C.CenterCrop(299) - ] + ] trans += [ C.Rescale(1.0 / 255.0, 0.0), C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) # apply batch operations - ds = ds.batch(cfg.batch_size, drop_remainder=True) + data_set = data_set.batch(cfg.batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) - return ds + data_set = data_set.repeat(repeat_num) + return data_set diff --git a/model_zoo/official/cv/mobilenetv1/src/dataset.py b/model_zoo/official/cv/mobilenetv1/src/dataset.py index e3b1418765e..305a698d9e1 100755 --- a/model_zoo/official/cv/mobilenetv1/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv1/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size @@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= device_num = get_group_size() if device_num == 1: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) # define map operations trans = [] @@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): @@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= device_num = get_group_size() if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def _get_rank_info(): diff --git a/model_zoo/official/cv/mobilenetv2/src/dataset.py b/model_zoo/official/cv/mobilenetv2/src/dataset.py index 15542f36726..631796a32dd 100644 --- a/model_zoo/official/cv/mobilenetv2/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv2/src/dataset.py @@ -21,7 +21,7 @@ import numpy as np from mindspore import Tensor from mindspore.train.model import 
Model import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 @@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): rank_size = int(os.getenv("RANK_SIZE", '1')) rank_id = int(os.getenv("RANK_ID", '0')) if rank_size == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) elif config.platform == "GPU": if do_train: if config.run_distribute: from mindspore.communication.management import get_rank, get_group_size - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) elif config.platform == "CPU": - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) resize_height = config.image_height resize_width = config.image_width @@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1): type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(config.batch_size, drop_remainder=True) + data_set = data_set.batch(config.batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def extract_features(net, dataset_path, config): @@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config): features = model.predict(Tensor(image)) np.save(features_path, features.asnumpy()) np.save(label_path, label) - print(f"Complete the batch {i+1}/{step_size}") + print(f"Complete the batch {i + 1}/{step_size}") return step_size diff --git a/model_zoo/official/cv/mobilenetv2_quant/src/dataset.py b/model_zoo/official/cv/mobilenetv2_quant/src/dataset.py index aefe1604261..1b0a4c292f0 100644 --- a/model_zoo/official/cv/mobilenetv2_quant/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv2_quant/src/dataset.py @@ -18,7 +18,7 @@ create train or eval dataset. 
import os from functools import partial import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.py_transforms as P2 @@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, rank_id = int(os.getenv("RANK_ID")) columns_list = ['image', 'label'] if config.data_load_mode == "mindrecord": - load_func = partial(de.MindDataset, dataset_path, columns_list) + load_func = partial(ds.MindDataset, dataset_path, columns_list) else: - load_func = partial(de.ImageFolderDataset, dataset_path) + load_func = partial(ds.ImageFolderDataset, dataset_path) if do_train: if rank_size == 1: - ds = load_func(num_parallel_workers=8, shuffle=True) + data_set = load_func(num_parallel_workers=8, shuffle=True) else: - ds = load_func(num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = load_func(num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) else: - ds = load_func(num_parallel_workers=8, shuffle=False) + data_set = load_func(num_parallel_workers=8, shuffle=False) elif device_target == "GPU": if do_train: from mindspore.communication.management import get_rank, get_group_size - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: raise ValueError("Unsupported device_target.") @@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, if do_train: buffer_size = 20480 # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # define map operations decode_op = C.Decode() @@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32): @@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= rank_id = int(os.getenv("RANK_ID")) if do_train: if rank_size == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, 
num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) else: raise ValueError("Unsupported device target.") @@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= if do_train: buffer_size = 20480 # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # define map operations decode_op = P.Decode() @@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num= compose = P2.Compose(trans) - ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True) + data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8, + python_multiprocessing=True) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/model_zoo/official/cv/mobilenetv3/src/dataset.py b/model_zoo/official/cv/mobilenetv3/src/dataset.py index c140a7fdbcd..ec082919f35 100644 --- a/model_zoo/official/cv/mobilenetv3/src/dataset.py +++ b/model_zoo/official/cv/mobilenetv3/src/dataset.py @@ -16,7 +16,7 @@ create train or eval dataset. """ import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 @@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, if do_train: if run_distribute: from mindspore.communication.management import get_rank, get_group_size - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: raise ValueError("Unsupported device_target.") @@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git 
a/model_zoo/official/cv/nasnet/src/dataset.py b/model_zoo/official/cv/nasnet/src/dataset.py index 520d33e3b3c..2cff62394d1 100755 --- a/model_zoo/official/cv/nasnet/src/dataset.py +++ b/model_zoo/official/cv/nasnet/src/dataset.py @@ -16,7 +16,7 @@ Data operations, will be used in train.py and eval.py """ import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C @@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1): rank = config.rank group_size = config.group_size if group_size == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True, - num_shards=group_size, shard_id=rank) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True, + num_shards=group_size, shard_id=rank) # define map operations if do_train: trans = [ @@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1): C.HWC2CHW() ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums) # apply batch operations - ds = ds.batch(config.batch_size, drop_remainder=True) + data_set = data_set.batch(config.batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) - return ds + data_set = data_set.repeat(repeat_num) + return data_set diff --git a/model_zoo/official/cv/psenet/src/dataset.py b/model_zoo/official/cv/psenet/src/dataset.py index 32070b98e9a..d6f8a8e9617 100644 --- a/model_zoo/official/cv/psenet/src/dataset.py +++ b/model_zoo/official/cv/psenet/src/dataset.py @@ -25,21 +25,24 @@ import pyclipper from PIL import Image from src.config import config -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.py_transforms as py_transforms __all__ = ['train_dataset_creator', 'test_dataset_creator'] + def get_img(img_path): img = cv2.imread(img_path) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) return img + def get_imgs_names(root_dir): img_paths = [i for i in os.listdir(root_dir) if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']] return img_paths + def get_bboxes(img, gt_path): h, w = img.shape[0:2] with open(gt_path, 'r', encoding='utf-8-sig') as f: @@ -58,6 +61,7 @@ def get_bboxes(img, gt_path): tags.append(tag) return np.array(bboxes), tags + def random_scale(img, min_size): h, w = img.shape[0:2] if max(h, w) > 1280: @@ -74,12 +78,14 @@ def random_scale(img, min_size): img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2) return img + def random_horizontal_flip(imgs): if random.random() < 0.5: for i, _ in enumerate(imgs): imgs[i] = np.flip(imgs[i], axis=1).copy() return imgs + def random_rotate(imgs): max_angle = 10 angle = random.random() * 2 * max_angle - max_angle @@ -91,6 +97,7 @@ def random_rotate(imgs): imgs[i] = img_rotation return imgs + def 
random_crop(imgs, img_size): h, w = imgs[0].shape[0:2] th, tw = img_size @@ -118,21 +125,25 @@ def random_crop(imgs, img_size): imgs[idx] = imgs[idx][i:i + th, j:j + tw] return imgs + def scale(img, long_size=2240): h, w = img.shape[0:2] scale_long = long_size * 1.0 / max(h, w) img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long) return img + def dist(a, b): return np.sqrt(np.sum((a - b) ** 2)) + def perimeter(bbox): peri = 0.0 for i in range(bbox.shape[0]): peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]]) return peri + def shrink(bboxes, rate, max_shr=20): rate = rate * rate shrinked_bboxes = [] @@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20): return np.array(shrinked_bboxes) + class TrainDataset: def __init__(self): self.is_transform = True @@ -260,6 +272,7 @@ class TrainDataset: def __len__(self): return len(self.all_img_paths) + def IC15_TEST_Generator(): ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/' img_size = config.INFER_LONG_SIZE @@ -298,6 +311,7 @@ def IC15_TEST_Generator(): yield img, img_resized, img_name + class DistributedSampler(): def __init__(self, dataset, rank, group_size, shuffle=True, seed=0): self.dataset = dataset @@ -324,18 +338,20 @@ class DistributedSampler(): def __len__(self): return self.num_samplers + def train_dataset_creator(rank, group_size, shuffle=True): cv2.setNumThreads(0) dataset = TrainDataset() sampler = DistributedSampler(dataset, rank, group_size, shuffle) - ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8, - sampler=sampler) - ds = ds.repeat(1) - ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) - return ds + data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8, + sampler=sampler) + data_set = data_set.repeat(1) + data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER) + return data_set + def test_dataset_creator(): - ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name']) - ds = ds.shuffle(config.TEST_BUFFER_SIZE) - ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER) - return ds + data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name']) + data_set = data_set.shuffle(config.TEST_BUFFER_SIZE) + data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER) + return data_set diff --git a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py index e75cbc4356b..577c8dbcec9 100644 --- a/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py +++ b/model_zoo/official/cv/resnet/gpu_resnet_benchmark.py @@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.common import set_seed import mindspore.nn as nn import mindspore.common.initializer as weight_init -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C from src.resnet_gpu_benchmark import resnet50 as resnet from src.CrossEntropySmooth import CrossEntropySmooth @@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\ Or the ckpt model file when eval is True') parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode') 
-parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\ - help='Compute data type fp32 or fp16: default fp16') +parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \ + help='Compute data type fp32 or fp16: default fp16') args_opt = parser.parse_args() set_seed(1) + class MyTimeMonitor(Callback): def __init__(self, batch_size, sink_size): super(MyTimeMonitor, self).__init__() self.batch_size = batch_size self.size = sink_size + def step_begin(self, run_context): self.step_time = time.time() + def step_end(self, run_context): cb_params = run_context.original_args() loss = cb_params.net_outputs @@ -75,17 +78,18 @@ class MyTimeMonitor(Callback): raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format( cb_params.cur_epoch_num, cur_step_in_epoch)) step_mseconds = (time.time() - self.step_time) * 1000 - fps = self.batch_size / step_mseconds *1000 * self.size + fps = self.batch_size / step_mseconds * 1000 * self.size print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss), "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True) + def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16", device_num=1): if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True, - num_shards=device_num, shard_id=get_rank()) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True, + num_shards=device_num, shard_id=get_rank()) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] std = [0.229 * 255, 0.224 * 255, 0.225 * 255] @@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" ] if dtype == "fp32": trans.append(C.HWC2CHW()) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation if repeat_num > 1: - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) + + return data_set - return ds def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch): lr_each_step = [] @@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per lr_each_step = np.array(lr_each_step).astype(np.float32) return lr_each_step + def train(): # set args dev = "GPU" @@ -221,6 +227,7 @@ def train(): else: model.train(epoch_size, dataset, callbacks=cb) + def eval_(): # set args dev = "GPU" @@ -251,6 +258,7 @@ def eval_(): res = model.eval(dataset) print("result:", res, "ckpt=", ckpt_dir) + if __name__ == '__main__': if not args_opt.eval: train() diff --git a/model_zoo/official/cv/resnet/src/dataset.py b/model_zoo/official/cv/resnet/src/dataset.py index df19ed472ca..0c2d116a1b8 100755 --- a/model_zoo/official/cv/resnet/src/dataset.py +++ b/model_zoo/official/cv/resnet/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size @@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= else: device_num = 1 if device_num == 1: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) # define map operations trans = [] @@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target= type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): @@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= device_num = 1 if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target= type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): @@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= device_num = 1 rank_id = 1 if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, 
num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.475 * 255, 0.451 * 255, 0.392 * 255] std = [0.275 * 255, 0.267 * 255, 0.278 * 255] @@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target= type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False): @@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target= else: device_num = 1 if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [123.68, 116.78, 103.94] std = [1.0, 1.0, 1.0] @@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target= ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def _get_rank_info(): diff --git a/model_zoo/official/cv/resnet50_quant/src/dataset.py b/model_zoo/official/cv/resnet50_quant/src/dataset.py index 4070e76ac1d..48d19fa3379 100755 --- a/model_zoo/official/cv/resnet50_quant/src/dataset.py +++ b/model_zoo/official/cv/resnet50_quant/src/dataset.py @@ -18,7 +18,7 @@ create train or eval dataset. 
import os from functools import partial import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.py_transforms as P2 @@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" columns_list = ['image', 'label'] if config.data_load_mode == "mindrecord": - load_func = partial(de.MindDataset, dataset_path, columns_list) + load_func = partial(ds.MindDataset, dataset_path, columns_list) else: - load_func = partial(de.ImageFolderDataset, dataset_path) + load_func = partial(ds.ImageFolderDataset, dataset_path) if device_num == 1: - ds = load_func(num_parallel_workers=8, shuffle=True) + data_set = load_func(num_parallel_workers=8, shuffle=True) else: - ds = load_func(num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = load_func(num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"): @@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe if do_train: if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False) image_size = 224 @@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op] compose = P2.Compose(trans) - ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True) + data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8, + python_multiprocessing=True) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/model_zoo/official/cv/resnet_thor/src/dataset.py 
b/model_zoo/official/cv/resnet_thor/src/dataset.py index f00585d23be..443817150b0 100644 --- a/model_zoo/official/cv/resnet_thor/src/dataset.py +++ b/model_zoo/official/cv/resnet_thor/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. """ import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size @@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" num_parallels = 4 if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target=" type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def _get_rank_info(): diff --git a/model_zoo/official/cv/shufflenetv1/src/dataset.py b/model_zoo/official/cv/shufflenetv1/src/dataset.py index ac5d03735d5..c88f1689f65 100644 --- a/model_zoo/official/cv/shufflenetv1/src/dataset.py +++ b/model_zoo/official/cv/shufflenetv1/src/dataset.py @@ -15,7 +15,7 @@ """Data operations, will be used in train.py and eval.py""" from src.config import config import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C @@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): """ if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank) # define map operations if do_train: trans = [ @@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0): ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) - ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) + data_set = 
data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) + data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) # apply batch operations - ds = ds.batch(config.batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(config.batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/cv/shufflenetv2/src/dataset.py b/model_zoo/official/cv/shufflenetv2/src/dataset.py index af4fbf0c197..318330305d0 100644 --- a/model_zoo/official/cv/shufflenetv2/src/dataset.py +++ b/model_zoo/official/cv/shufflenetv2/src/dataset.py @@ -19,7 +19,7 @@ import numpy as np from src.config import config_gpu as cfg import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C @@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): dataset """ if group_size == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, - num_shards=group_size, shard_id=rank) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True, + num_shards=group_size, shard_id=rank) # define map operations if do_train: trans = [ @@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1): ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums) # apply batch operations - ds = ds.batch(cfg.batch_size, drop_remainder=True) + data_set = data_set.batch(cfg.batch_size, drop_remainder=True) - return ds + return data_set diff --git a/model_zoo/official/cv/squeezenet/src/dataset.py b/model_zoo/official/cv/squeezenet/src/dataset.py index e1d9c7745cb..38eaef30f30 100755 --- a/model_zoo/official/cv/squeezenet/src/dataset.py +++ b/model_zoo/official/cv/squeezenet/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, device_num = get_group_size() if device_num == 1: - ds = de.Cifar10Dataset(dataset_path, - num_parallel_workers=8, - shuffle=True) + data_set = ds.Cifar10Dataset(dataset_path, + num_parallel_workers=8, + shuffle=True) else: - ds = de.Cifar10Dataset(dataset_path, - num_parallel_workers=8, - shuffle=True, - num_shards=device_num, - shard_id=rank_id) + data_set = ds.Cifar10Dataset(dataset_path, + num_parallel_workers=8, + shuffle=True, + num_shards=device_num, + shard_id=rank_id) # define map operations if do_train: @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, - input_columns="label", - num_parallel_workers=8) - ds = ds.map(operations=trans, - input_columns="image", - num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, + input_columns="label", + num_parallel_workers=8) + data_set = data_set.map(operations=trans, + input_columns="image", + num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset_imagenet(dataset_path, @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, device_num = get_group_size() if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, - num_parallel_workers=8, - shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, + num_parallel_workers=8, + shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, - num_parallel_workers=8, - shuffle=True, - num_shards=device_num, - shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, + num_parallel_workers=8, + shuffle=True, + num_shards=device_num, + shard_id=rank_id) image_size = 227 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, - input_columns="label", - num_parallel_workers=8) - ds = ds.map(operations=trans, - input_columns="image", - num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, + input_columns="label", + num_parallel_workers=8) + data_set = data_set.map(operations=trans, + input_columns="image", + num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def _get_rank_info(): diff --git a/model_zoo/official/cv/warpctc/src/dataset.py b/model_zoo/official/cv/warpctc/src/dataset.py index 34c9cc8e841..11c3322f1e2 100755 --- a/model_zoo/official/cv/warpctc/src/dataset.py +++ b/model_zoo/official/cv/warpctc/src/dataset.py @@ -17,7 +17,7 @@ import os import math as m import numpy as np import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as c import 
mindspore.dataset.vision.c_transforms as vc from PIL import Image @@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ """ dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target) - ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) + data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id) image_trans = [ vc.Rescale(1.0 / 255.0, 0.0), vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]), @@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_ label_trans = [ c.TypeCast(mstype.int32) ] - ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) + data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8) if device_target == 'Ascend': - ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) + data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8) else: - ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) - ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) + data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8) + data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8) - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/cv/xception/src/dataset.py b/model_zoo/official/cv/xception/src/dataset.py index b7f7aa89cba..adcc10f8720 100644 --- a/model_zoo/official/cv/xception/src/dataset.py +++ b/model_zoo/official/cv/xception/src/dataset.py @@ -16,10 +16,11 @@ Data operations, will be used in train.py and eval.py """ import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C + def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): """ create a train or eval dataset @@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): dataset """ if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank) # define map operations if do_train: trans = [ @@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0): ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) - ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8) + data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = 
data_set.batch(batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/nlp/bert/src/dataset.py b/model_zoo/official/nlp/bert/src/dataset.py index 4deceadbec2..06006e39bcf 100644 --- a/model_zoo/official/nlp/bert/src/dataset.py +++ b/model_zoo/official/nlp/bert/src/dataset.py @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py """ import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C from mindspore import log as logger from .config import cfg @@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, for file_name in files: if "tfrecord" in file_name: data_files.append(os.path.join(data_dir, file_name)) - ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], - shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, - num_shards=device_num, shard_id=rank, shard_equal_rows=True) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + ori_dataset_size = data_set.get_dataset_size() print('origin dataset size: ', ori_dataset_size) type_cast_op = C.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(cfg.batch_size, drop_remainder=True) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat count: {}".format(ds.get_repeat_count())) - return ds + data_set = data_set.batch(cfg.batch_size, drop_remainder=True) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat count: {}".format(data_set.get_repeat_count())) + return data_set def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", data_file_path=None, schema_file_path=None, do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + data_set = ds.TFRecordDataset([data_file_path], schema_file_path 
if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], + shuffle=do_shuffle) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) - ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") else: - ds = ds.map(operations=type_cast_op, input_columns="label_ids") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.repeat(repeat_count) + data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.repeat(repeat_count) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", data_file_path=None, schema_file_path=None, do_shuffle=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle) + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], + shuffle=do_shuffle) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) - ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") else: - ds = ds.map(operations=type_cast_op, input_columns="label_ids") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.repeat(repeat_count) + data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.repeat(repeat_count) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set def generator_squad(data_features): @@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) if is_training: - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", - "end_positions", "unique_ids", "is_impossible"], - shuffle=do_shuffle) - ds = ds.map(operations=type_cast_op, input_columns="start_positions") - ds = ds.map(operations=type_cast_op, input_columns="end_positions") + 
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "start_positions", + "end_positions", "unique_ids", "is_impossible"], + shuffle=do_shuffle) + data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") else: - ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, - column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.map(operations=type_cast_op, input_columns="unique_ids") - ds = ds.repeat(repeat_count) + data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle, + column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids") + data_set = data_set.repeat(repeat_count) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/nlp/bert_thor/src/dataset.py b/model_zoo/official/nlp/bert_thor/src/dataset.py index 705aa362e0b..74b12896af4 100644 --- a/model_zoo/official/nlp/bert_thor/src/dataset.py +++ b/model_zoo/official/nlp/bert_thor/src/dataset.py @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py """ import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C from mindspore import log as logger from .bert_net_config import bert_net_cfg @@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, if "tfrecord" in file_name: data_files.append(os.path.join(data_dir, file_name)) data_files = sorted(data_files) - ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], - shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, - num_shards=device_num, shard_id=rank, shard_equal_rows=False) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=False) + ori_dataset_size = data_set.get_dataset_size() print('origin dataset size: ', ori_dataset_size) type_cast_op = C.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = 
ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat count: {}".format(ds.get_repeat_count())) - return ds + data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat count: {}".format(data_set.get_repeat_count())) + return data_set def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", data_file_path=None, schema_file_path=None): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) - ds = ds.map(operations=type_cast_op_float, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") else: - ds = ds.map(operations=type_cast_op, input_columns="label_ids") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.repeat(repeat_count) + data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.repeat(repeat_count) # apply shuffle operation buffer_size = 960 - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy", data_file_path=None, schema_file_path=None): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"]) if assessment_method == "Spearman_correlation": type_cast_op_float = C.TypeCast(mstype.float32) - ds = 
ds.map(operations=type_cast_op_float, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids") else: - ds = ds.map(operations=type_cast_op, input_columns="label_ids") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.repeat(repeat_count) + data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.repeat(repeat_count) # apply shuffle operation buffer_size = 960 - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True): """create finetune or evaluation dataset""" type_cast_op = C.TypeCast(mstype.int32) if is_training: - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", - "start_positions", "end_positions", - "unique_ids", "is_impossible"]) - ds = ds.map(operations=type_cast_op, input_columns="start_positions") - ds = ds.map(operations=type_cast_op, input_columns="end_positions") + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", + "start_positions", "end_positions", + "unique_ids", "is_impossible"]) + data_set = data_set.map(operations=type_cast_op, input_columns="start_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="end_positions") else: - ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") - ds = ds.repeat(repeat_count) + data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"]) + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.repeat(repeat_count) # apply shuffle operation buffer_size = 960 - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # 
apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set diff --git a/model_zoo/official/nlp/fasttext/eval.py b/model_zoo/official/nlp/fasttext/eval.py index c41963fb810..015c49063ea 100644 --- a/model_zoo/official/nlp/fasttext/eval.py +++ b/model_zoo/official/nlp/fasttext/eval.py @@ -22,7 +22,7 @@ import mindspore.ops.operations as P from mindspore.common.tensor import Tensor from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC from mindspore import context from src.fasttext_model import FastText @@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell): def load_infer_dataset(batch_size, datafile): """data loader for infer""" - ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) + data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx']) type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="src_tokens") - ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") - ds = ds.map(operations=type_cast_op, input_columns="label_idx") - ds = ds.batch(batch_size=batch_size, drop_remainder=True) + data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") + data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") + data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") + data_set = data_set.batch(batch_size=batch_size, drop_remainder=True) - return ds + return data_set def run_fasttext_infer(): """run infer with FastText""" diff --git a/model_zoo/official/nlp/fasttext/src/dataset.py b/model_zoo/official/nlp/fasttext/src/dataset.py index 359ad665bb6..c3e0b3c0fad 100644 --- a/model_zoo/official/nlp/fasttext/src/dataset.py +++ b/model_zoo/official/nlp/fasttext/src/dataset.py @@ -25,8 +25,10 @@ import spacy from sklearn.feature_extraction import FeatureHasher from mindspore.mindrecord import FileWriter + class FastTextDataPreProcess(): """FastText data preprocess""" + def __init__(self, train_path, test_file, max_length, @@ -194,7 +196,6 @@ class FastTextDataPreProcess(): if self.text_less in sent_describe and self.text_greater in sent_describe: sent_describe = self.str_html.sub('', sent_describe) - doc = spacy_nlp(sent_describe) bows_token = [token.text for token in doc] @@ -222,7 +223,7 @@ class FastTextDataPreProcess(): def _get_bucket_length(self, x, bts): x_len = len(x) for index in range(1, len(bts)): - if bts[index-1] < x_len <= bts[index]: + if bts[index - 1] < x_len <= bts[index]: return bts[index] return bts[0] @@ -310,7 +311,6 @@ if __name__ == '__main__': print("Writing test data to MindRecord file.....") for k in args.test_bucket: - write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1) print("All done.....") diff --git a/model_zoo/official/nlp/fasttext/src/load_dataset.py b/model_zoo/official/nlp/fasttext/src/load_dataset.py index 07dc4a7692c..179f5abc26a 100644 --- a/model_zoo/official/nlp/fasttext/src/load_dataset.py +++ b/model_zoo/official/nlp/fasttext/src/load_dataset.py @@ -14,9 +14,10 @@ # ============================================================================ """FastText data loader""" import mindspore.common.dtype as mstype -import 
mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC + def load_dataset(dataset_path, batch_size, epoch_count=1, @@ -25,38 +26,40 @@ def load_dataset(dataset_path, bucket=None, shuffle=True): """dataset loader""" + def batch_per_bucket(bucket_length, input_file): - input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord' + input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord' if not input_file: raise FileNotFoundError("input file parameter must not be empty.") - ds = de.MindDataset(input_file, - columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], - shuffle=shuffle, - num_shards=rank_size, - shard_id=rank_id, - num_parallel_workers=8) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.MindDataset(input_file, + columns_list=['src_tokens', 'src_tokens_length', 'label_idx'], + shuffle=shuffle, + num_shards=rank_size, + shard_id=rank_id, + num_parallel_workers=8) + ori_dataset_size = data_set.get_dataset_size() print(f"Dataset size: {ori_dataset_size}") repeat_count = epoch_count type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="src_tokens") - ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length") - ds = ds.map(operations=type_cast_op, input_columns="label_idx") + data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens") + data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length") + data_set = data_set.map(operations=type_cast_op, input_columns="label_idx") + + data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], + output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) + data_set = data_set.batch(batch_size, drop_remainder=False) + data_set = data_set.repeat(repeat_count) + return data_set - ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'], - output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag']) - ds = ds.batch(batch_size, drop_remainder=False) - ds = ds.repeat(repeat_count) - return ds for i, _ in enumerate(bucket): bucket_len = bucket[i] ds_per = batch_per_bucket(bucket_len, dataset_path) if i == 0: - ds = ds_per + data_set = ds_per else: - ds = ds + ds_per - ds = ds.shuffle(ds.get_dataset_size()) - ds.channel_name = 'fasttext' + data_set = data_set + ds_per + data_set = data_set.shuffle(data_set.get_dataset_size()) + data_set.channel_name = 'fasttext' - return ds + return data_set diff --git a/model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py b/model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py index a74ef18e940..681f9040e5a 100644 --- a/model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py +++ b/model_zoo/official/nlp/gnmt_v2/src/dataset/load_dataset.py @@ -15,7 +15,7 @@ """Dataset loader to feed into model.""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC @@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, print(f" | Loading {datafile}.") if not is_translate: - ds = de.MindDataset( + data_set = ds.MindDataset( input_files, columns_list=[ "src", "src_padding", "prev_opt", @@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, num_parallel_workers=8 ) - ori_dataset_size = ds.get_dataset_size() + 
ori_dataset_size = data_set.get_dataset_size() print(f" | Dataset size: {ori_dataset_size}.") if shuffle: - ds = ds.shuffle(buffer_size=ori_dataset_size // 20) + data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) - ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) - ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) - ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) - ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8) - ds = ds.rename( + data_set = data_set.rename( input_columns=["src", "src_padding", "prev_opt", @@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, "target_eos_ids", "target_eos_mask"] ) - ds = ds.batch(batch_size, drop_remainder=drop_remainder) + data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) else: - ds = de.MindDataset( + data_set = ds.MindDataset( input_files, columns_list=[ "src", "src_padding" ], @@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False, num_parallel_workers=8 ) - ori_dataset_size = ds.get_dataset_size() + ori_dataset_size = data_set.get_dataset_size() print(f" | Dataset size: {ori_dataset_size}.") if shuffle: - ds = ds.shuffle(buffer_size=ori_dataset_size // 20) + data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20) type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) - ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8) - ds = ds.rename( + data_set = data_set.rename( input_columns=["src", "src_padding"], output_columns=["source_eos_ids", "source_eos_mask"] ) - ds = ds.batch(batch_size, drop_remainder=drop_remainder) + data_set = data_set.batch(batch_size, drop_remainder=drop_remainder) - return ds + return data_set def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool, diff --git a/model_zoo/official/nlp/mass/src/dataset/load_dataset.py b/model_zoo/official/nlp/mass/src/dataset/load_dataset.py index d24ce6c49a1..879ccf41c9f 100644 --- a/model_zoo/official/nlp/mass/src/dataset/load_dataset.py +++ b/model_zoo/official/nlp/mass/src/dataset/load_dataset.py @@ -14,7 +14,7 @@ # ============================================================================ """Dataset loader to feed into model.""" import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, for 
datafile in input_files: print(f" | Loading {datafile}.") - ds = de.TFRecordDataset( + data_set = ds.TFRecordDataset( input_files, columns_list=[ "src", "src_padding", @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True, num_parallel_workers=8) - ori_dataset_size = ds.get_dataset_size() + ori_dataset_size = data_set.get_dataset_size() print(f" | Dataset size: {ori_dataset_size}.") repeat_count = epoch_count type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="src") - ds = ds.map(operations=type_cast_op, input_columns="src_padding") - ds = ds.map(operations=type_cast_op, input_columns="prev_opt") - ds = ds.map(operations=type_cast_op, input_columns="prev_padding") - ds = ds.map(operations=type_cast_op, input_columns="target") - ds = ds.map(operations=type_cast_op, input_columns="tgt_padding") + data_set = data_set.map(operations=type_cast_op, input_columns="src") + data_set = data_set.map(operations=type_cast_op, input_columns="src_padding") + data_set = data_set.map(operations=type_cast_op, input_columns="prev_opt") + data_set = data_set.map(operations=type_cast_op, input_columns="prev_padding") + data_set = data_set.map(operations=type_cast_op, input_columns="target") + data_set = data_set.map(operations=type_cast_op, input_columns="tgt_padding") - ds = ds.rename( + data_set = data_set.rename( input_columns=["src", "src_padding", "prev_opt", @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, "target_eos_mask"] ) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.repeat(repeat_count) + data_set = data_set.batch(batch_size, drop_remainder=True) + data_set = data_set.repeat(repeat_count) - ds.channel_name = 'transformer' - return ds + data_set.channel_name = 'transformer' + return data_set def load_dataset(data_files: list, batch_size: int, epoch_count: int, diff --git a/model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py b/model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py index be599413745..e585f50b999 100644 --- a/model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py +++ b/model_zoo/official/nlp/prophetnet/src/dataset/load_dataset.py @@ -14,7 +14,7 @@ # ============================================================================ """Dataset loader to feed into model.""" import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC @@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1, for datafile in input_files: print(f" | Loading {datafile}.") - ds = de.TFRecordDataset( + data_set = ds.TFRecordDataset( input_files, columns_list=[ "src", "src_padding", @@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1, shuffle=shuffle, num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True, num_parallel_workers=8) - ori_dataset_size = ds.get_dataset_size() + ori_dataset_size = data_set.get_dataset_size() print(f" | Dataset size: {ori_dataset_size}.") repeat_count = epoch_count type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(input_columns="src", operations=type_cast_op) - ds = ds.map(input_columns="src_padding", operations=type_cast_op) - ds = ds.map(input_columns="prev_opt", operations=type_cast_op) - ds = ds.map(input_columns="prev_padding", operations=type_cast_op) - ds = ds.map(input_columns="target", operations=type_cast_op) - 
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op) + data_set = data_set.map(input_columns="src", operations=type_cast_op) + data_set = data_set.map(input_columns="src_padding", operations=type_cast_op) + data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op) + data_set = data_set.map(input_columns="prev_padding", operations=type_cast_op) + data_set = data_set.map(input_columns="target", operations=type_cast_op) + data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op) - ds = ds.rename( + data_set = data_set.rename( input_columns=["src", "src_padding", "prev_opt", @@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1, "target_eos_mask"] ) - ds = ds.batch(batch_size, drop_remainder=True) - ds = ds.repeat(repeat_count) + data_set = data_set.batch(batch_size, drop_remainder=True) + data_set = data_set.repeat(repeat_count) - ds.channel_name = 'transformer' - return ds + data_set.channel_name = 'transformer' + return data_set def load_dataset(data_files: list, batch_size: int, epoch_count: int, diff --git a/model_zoo/official/nlp/tinybert/src/dataset.py b/model_zoo/official/nlp/tinybert/src/dataset.py index e4e065a9f1e..2d0c1861b41 100644 --- a/model_zoo/official/nlp/tinybert/src/dataset.py +++ b/model_zoo/official/nlp/tinybert/src/dataset.py @@ -18,14 +18,16 @@ import os from enum import Enum import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C + class DataType(Enum): """Enumerate supported dataset format""" TFRECORD = 1 MINDRECORD = 2 + def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None, data_type=DataType.TFRECORD): @@ -47,22 +49,22 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0, shuffle = False if data_type == DataType.MINDRECORD: - ds = de.MindDataset(data_files, columns_list=columns_list, - shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) + data_set = ds.MindDataset(data_files, columns_list=columns_list, + shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank) else: - ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, - shuffle=shuffle, num_shards=device_num, shard_id=rank, - shard_equal_rows=shard_equal_rows) + data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns_list, + shuffle=shuffle, num_shards=device_num, shard_id=rank, + shard_equal_rows=shard_equal_rows) if device_num == 1 and shuffle is True: - ds = ds.shuffle(10000) + data_set = data_set.shuffle(10000) type_cast_op = C.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") if task == "td": - ds = ds.map(operations=type_cast_op, input_columns="label_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="label_ids") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) - return ds + return data_set diff --git a/model_zoo/official/nlp/transformer/eval.py 
b/model_zoo/official/nlp/transformer/eval.py index 6d4b6632f9b..825f3c67653 100644 --- a/model_zoo/official/nlp/transformer/eval.py +++ b/model_zoo/official/nlp/transformer/eval.py @@ -23,38 +23,41 @@ from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC from mindspore import context from src.transformer_model import TransformerModel from src.eval_config import cfg, transformer_net_cfg + def load_test_data(batch_size=1, data_file=None): """ Load test dataset """ - ds = de.MindDataset(data_file, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - shuffle=False) + data_set = ds.MindDataset(data_file, + columns_list=["source_eos_ids", "source_eos_mask", + "target_sos_ids", "target_sos_mask", + "target_eos_ids", "target_eos_mask"], + shuffle=False) type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - ds.channel_name = 'transformer' - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + data_set.channel_name = 'transformer' + return data_set + class TransformerInferCell(nn.Cell): """ Encapsulation class of transformer network infer. """ + def __init__(self, network): super(TransformerInferCell, self).__init__(auto_prefix=False) self.network = network @@ -65,6 +68,7 @@ class TransformerInferCell(nn.Cell): predicted_ids = self.network(source_ids, source_mask) return predicted_ids + def load_weights(model_path): """ Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file. @@ -93,6 +97,7 @@ def load_weights(model_path): parameter_dict[name] = Parameter(Tensor(weights[name]), name=name) return parameter_dict + def run_transformer_eval(): """ Transformer evaluation. 
@@ -136,5 +141,6 @@ def run_transformer_eval(): f.write(" ".join(token_ids) + "\n") f.close() + if __name__ == "__main__": run_transformer_eval() diff --git a/model_zoo/official/recommend/deepfm/src/dataset.py b/model_zoo/official/recommend/deepfm/src/dataset.py index a61cf2c8c73..01f53a647d0 100644 --- a/model_zoo/official/recommend/deepfm/src/dataset.py +++ b/model_zoo/official/recommend/deepfm/src/dataset.py @@ -21,7 +21,7 @@ from enum import Enum import numpy as np import pandas as pd -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype from .config import DataConfig @@ -142,8 +142,8 @@ class H5Dataset(): X_id = X[:, 0:self.max_length] X_va = X[:, self.max_length:] yield np.array(X_id.astype(dtype=np.int32)), \ - np.array(X_va.astype(dtype=np.float32)), \ - np.array(y.astype(dtype=np.float32)) + np.array(X_va.astype(dtype=np.float32)), \ + np.array(y.astype(dtype=np.float32)) def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): for _ in range(0, numbers_of_batch, 1): yield train_eval_gen.__next__() - ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) - ds = ds.repeat(epochs) - return ds + data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) + data_set = data_set.repeat(epochs) + return data_set def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 shuffle = train_mode if rank_size is not None and rank_id is not None: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, - num_parallel_workers=8) + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) else: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - shuffle=shuffle, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), - np.array(y).flatten().reshape(batch_size, 39), - np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), + np.array(y).flatten().reshape(batch_size, 39), + np.array(z).flatten().reshape(batch_size, 1))), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, 
epochs=1, batch_size=1000, for filename in filenames: if file_prefixt_name in filename and 'tfrecord' in filename: dataset_files.append(os.path.join(dir_path, filename)) - schema = de.Schema() + schema = ds.Schema() schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8, - num_shards=rank_size, shard_id=rank_id, - shard_equal_rows=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8, + num_shards=rank_size, shard_id=rank_id, + shard_equal_rows=True) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: ( + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: ( np.array(x).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(batch_size, 39), np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, diff --git a/model_zoo/official/recommend/wide_and_deep/src/datasets.py b/model_zoo/official/recommend/wide_and_deep/src/datasets.py index a44717ed526..5d9eee3fbd9 100644 --- a/model_zoo/official/recommend/wide_and_deep/src/datasets.py +++ b/model_zoo/official/recommend/wide_and_deep/src/datasets.py @@ -14,13 +14,12 @@ # ============================================================================ """train_dataset.""" - import os import math from enum import Enum import numpy as np import pandas as pd -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype @@ -84,9 +83,9 @@ class H5Dataset(): yield os.path.join(self._hdf_data_dir, self._file_prefix + '_input_part_' + str( p) + '.h5'), \ - os.path.join(self._hdf_data_dir, - self._file_prefix + '_output_part_' + str( - p) + '.h5'), i + 1 == len(parts) + os.path.join(self._hdf_data_dir, + self._file_prefix + '_output_part_' + str( + p) + '.h5'), i + 1 == len(parts) def _generator(self, X, y, batch_size, shuffle=True): """ @@ -106,8 +105,7 @@ class H5Dataset(): np.random.shuffle(sample_index) assert X.shape[0] > 0 while True: - batch_index = sample_index[ - batch_size * counter: batch_size * (counter + 1)] + batch_index = sample_index[batch_size * counter: batch_size * (counter + 1)] X_batch = X[batch_index] y_batch = y[batch_index] counter += 1 @@ -140,9 +138,8 @@ class H5Dataset(): X, y, finished = data_gen.__next__() X_id = X[:, 0:self.input_length] X_va = X[:, self.input_length:] - yield np.array(X_id.astype(dtype=np.int32)), np.array( - X_va.astype(dtype=np.float32)), np.array( - y.astype(dtype=np.float32)) + yield np.array(X_id.astype(dtype=np.int32)), 
np.array(X_va.astype(dtype=np.float32)), np.array( + y.astype(dtype=np.float32)) def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): @@ -164,9 +161,9 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): for _ in range(0, numbers_of_batch, 1): yield train_eval_gen.__next__() - ds = de.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) - ds = ds.repeat(epochs) - return ds + data_set = ds.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"]) + data_set = data_set.repeat(epochs) + return data_set def _padding_func(batch_size, manual_shape, target_column, field_size=39): @@ -174,11 +171,11 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): get padding_func """ if manual_shape: - generate_concat_offset = [item[0]+item[1] for item in manual_shape] + generate_concat_offset = [item[0] + item[1] for item in manual_shape] part_size = int(target_column / len(generate_concat_offset)) filled_value = [] for i in range(field_size, target_column): - filled_value.append(generate_concat_offset[i//part_size]-1) + filled_value.append(generate_concat_offset[i // part_size] - 1) print("Filed Value:", filled_value) def padding_func(x, y, z): @@ -190,7 +187,7 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39): dtype=np.int32) * filled_value x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1) mask = np.concatenate( - [y, np.zeros((batch_size, target_column-39), dtype=np.float32)], axis=1) + [y, np.zeros((batch_size, target_column - 39), dtype=np.float32)], axis=1) return (x_id, mask, z) else: def padding_func(x, y, z): @@ -214,24 +211,25 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, for filename in filenames: if file_prefix_name in filename and "tfrecord" in filename: dataset_files.append(os.path.join(dirpath, filename)) - schema = de.Schema() + schema = ds.Schema() schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, - num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, + num_parallel_workers=8, + num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, - shuffle=shuffle, schema=schema, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), - drop_remainder=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, + shuffle=shuffle, schema=schema, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), + drop_remainder=True) - ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -257,21 +255,21 @@ def 
_get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 shuffle = train_mode if rank_size is not None and rank_id is not None: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, - num_parallel_workers=8) + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) else: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - shuffle=shuffle, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(_padding_func(batch_size, manual_shape, target_column), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(_padding_func(batch_size, manual_shape, target_column), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None): @@ -284,7 +282,7 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl 5, 21762, 14, 15, 15030, 61, 12220] new_vocabs = inidival_vocabs + [1] * \ - (target_column_number - len(inidival_vocabs)) + (target_column_number - len(inidival_vocabs)) part_size = int(target_column_number / worker_size) # According to the workers, we merge some fields into the same part @@ -304,21 +302,21 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl # Expands the vocabulary of each field by the multiplier if multiply is True: cur_sum = sum(new_vocab_size) - k = total_vocab_size/cur_sum + k = total_vocab_size / cur_sum new_vocab_size = [ - math.ceil(int(item*k)/worker_size)*worker_size for item in new_vocab_size] - new_vocab_size = [(item // 8 + 1)*8 for item in new_vocab_size] + math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size] + new_vocab_size = [(item // 8 + 1) * 8 for item in new_vocab_size] else: if total_vocab_size > sum(new_vocab_size): new_vocab_size[-1] = total_vocab_size - \ - sum(new_vocab_size[:-1]) + sum(new_vocab_size[:-1]) new_vocab_size = [item for item in new_vocab_size] else: raise ValueError( "Please providede the correct vocab size, now is {}".format(total_vocab_size)) - for i in range(worker_size-1): + for i in range(worker_size - 1): off = index_offsets[i] + features[i] index_offsets.append(off) diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py index dee2a95ff67..683e98e4978 100644 --- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py +++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_auto_parallel.py @@ -17,7 +17,7 @@ import os import sys -import 
mindspore.dataset.engine as de +import mindspore.dataset as ds from mindspore import Model, context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.context import ParallelMode @@ -88,7 +88,7 @@ def train_and_eval(config): print("epochs is {}".format(epochs)) if config.full_batch: context.set_auto_parallel_context(full_batch=True) - de.config.set_seed(1) + ds.config.set_seed(1) if config.field_slice: compute_manual_shape(config, get_group_size()) ds_train = create_dataset(data_path, train_mode=True, epochs=1, diff --git a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py index 19a93d2ee03..03840efaac5 100644 --- a/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py +++ b/model_zoo/official/recommend/wide_and_deep/train_and_eval_parameter_server_distribute.py @@ -17,7 +17,7 @@ import os import sys -import mindspore.dataset.engine as de +import mindspore.dataset as ds from mindspore import Model, context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.context import ParallelMode @@ -92,7 +92,7 @@ def train_and_eval(config): print("epochs is {}".format(epochs)) if config.full_batch: context.set_auto_parallel_context(full_batch=True) - de.config.set_seed(1) + ds.config.set_seed(1) ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size*get_group_size(), data_type=dataset_type) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, diff --git a/model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py b/model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py index fa09b67e2a6..bd024b76b2a 100644 --- a/model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py +++ b/model_zoo/official/recommend/wide_and_deep_multitable/src/datasets.py @@ -18,7 +18,7 @@ import math import pickle import numpy as np import pandas as pd -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype @@ -97,8 +97,7 @@ class H5Dataset(): np.random.shuffle(sample_index) assert X.shape[0] > 0 while True: - batch_index = sample_index[batch_size * counter:batch_size * - (counter + 1)] + batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] X_batch = X[batch_index] y_batch = y[batch_index] counter += 1 @@ -135,9 +134,8 @@ class H5Dataset(): X, y, finished = data_gen.__next__() X_id = X[:, 0:self.input_length] X_va = X[:, self.input_length:] - yield np.array(X_id.astype(dtype=np.int32)), np.array( - X_va.astype(dtype=np.float32)), np.array( - y.astype(dtype=np.float32)) + yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array( + y.astype(dtype=np.float32)) def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): @@ -159,10 +157,10 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000): for _ in range(0, numbers_of_batch, 1): yield train_eval_gen.__next__() - ds = de.GeneratorDataset(_iter_h5_data(), - ["ids", "weights", "labels"]) - ds = ds.repeat(epochs) - return ds + data_set = ds.GeneratorDataset(_iter_h5_data(), + ["ids", "weights", "labels"]) + data_set = data_set.repeat(epochs) + return data_set def _get_tf_dataset(data_dir, @@ -184,7 +182,7 @@ def _get_tf_dataset(data_dir, for filename in filenames: if file_prefix_name in filename 
and "tfrecord" in filename: dataset_files.append(os.path.join(dirpath, filename)) - schema = de.Schema() + schema = ds.Schema() float_key_list = ["label", "continue_val"] @@ -199,19 +197,19 @@ def _get_tf_dataset(data_dir, schema.add_column(key, de_type=ms_dtype) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, - shuffle=shuffle, - schema=schema, - num_parallel_workers=8, - num_shards=rank_size, - shard_id=rank_id, - shard_equal_rows=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, + shuffle=shuffle, + schema=schema, + num_parallel_workers=8, + num_shards=rank_size, + shard_id=rank_id, + shard_equal_rows=True) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, - shuffle=shuffle, - schema=schema, - num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, + shuffle=shuffle, + schema=schema, + num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) operations_list = [] for key in columns_list: @@ -249,7 +247,7 @@ def _get_tf_dataset(data_dir, u = np.array(u).flatten().reshape(batch_size, -1) return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u - ds = ds.map( + data_set = data_set.map( operations=mixup, input_columns=[ 'label', 'continue_val', 'indicator_id', 'emb_128_id', @@ -275,8 +273,8 @@ def _get_tf_dataset(data_dir, ], num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = data_set.repeat(epochs) + return data_set def compute_emb_dim(config): diff --git a/model_zoo/research/cv/centernet/src/dataset.py b/model_zoo/research/cv/centernet/src/dataset.py index 463fd9b5c9b..0b27397d47b 100644 --- a/model_zoo/research/cv/centernet/src/dataset.py +++ b/model_zoo/research/cv/centernet/src/dataset.py @@ -24,16 +24,17 @@ import cv2 import numpy as np import pycocotools.coco as coco -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds from mindspore import log as logger from mindspore.mindrecord import FileWriter from src.image import color_aug, get_affine_transform, affine_transform from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg from src.visual import visual_image + _current_dir = os.path.dirname(os.path.realpath(__file__)) -class COCOHP(de.Dataset): +class COCOHP(ds.Dataset): """ Encapsulation class of COCO person keypoints datast. Initilize and preprocess of image for training and testing. @@ -47,6 +48,7 @@ class COCOHP(de.Dataset): Returns: Prepocessed training or testing dataset for CenterNet network. 
""" + def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None): super(COCOHP, self).__init__() self._data_rng = np.random.RandomState(123) @@ -64,7 +66,6 @@ class COCOHP(de.Dataset): if not os.path.exists(self.save_path): os.makedirs(self.save_path) - def init(self, data_dir, keep_res=False, flip_test=False): """initailize additional info""" logger.info('Initializing coco 2017 {} data.'.format(self.run_mode)) @@ -124,7 +125,7 @@ class COCOHP(de.Dataset): for img_id in self.images: image_info = self.coco.loadImgs([img_id]) annos = self.coco.loadAnns(self.anns[img_id]) - #get image + # get image img_name = image_info[0]['file_name'] img_name = os.path.join(self.image_path, img_name) with open(img_name, 'rb') as f: @@ -147,19 +148,16 @@ class COCOHP(de.Dataset): writer.commit() logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir)) - def _coco_box_to_bbox(self, box): bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32) return bbox - def _get_border(self, border, size): i = 1 while size - border // i <= border // i: i *= 2 return border // i - def __getitem__(self, index): img_id = self.images[index] file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name'] @@ -169,7 +167,6 @@ class COCOHP(de.Dataset): ret = (img, image_id) return ret - def pre_process_for_test(self, image, img_id, scale, meta=None): """image pre-process for evaluation""" b, h, w, ch = image.shape @@ -249,7 +246,6 @@ class COCOHP(de.Dataset): return images, meta - def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id): """image pre-process and augmentation""" num_objs = min(num_objects, self.data_opt.max_objs) @@ -269,12 +265,12 @@ class COCOHP(de.Dataset): else: sf = self.data_opt.scale cf = self.data_opt.shift - c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) - c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf) - s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf) + c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) + c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) + s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) if np.random.random() < self.data_opt.aug_rot: rf = self.data_opt.rotate - rot = np.clip(np.random.randn()*rf, -rf*2, rf*2) + rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) if np.random.random() < self.data_opt.flip_prop: flipped = True @@ -323,7 +319,7 @@ class COCOHP(de.Dataset): cls_id = int(category_id[k]) - 1 pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3) if flipped: - bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero + bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero pts[:, 0] = width - pts[:, 0] - 1 for e in self.data_opt.flip_idx: pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy() @@ -360,7 +356,7 @@ class COCOHP(de.Dataset): if pts[j, 2] > 0: pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot) if pts[j, 0] >= 0 and pts[j, 0] < output_res and \ - pts[j, 1] >= 0 and pts[j, 1] < output_res: + pts[j, 1] >= 0 and pts[j, 1] < output_res: kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int kps_mask[k, j * 2: j * 2 + 2] = 1 pt_int = pts[j, :2].astype(np.int32) @@ -399,7 +395,6 @@ class COCOHP(de.Dataset): visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res) return ret - def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1, device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True): """create train 
dataset based on mindrecord file""" @@ -415,41 +410,43 @@ class COCOHP(de.Dataset): raise ValueError('data_dir {} have no data files'.format(mindrecord_dir)) columns = ["image", "num_objects", "keypoints", "bbox", "category_id"] - ds = de.MindDataset(data_files, - columns_list=columns, - num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, - num_shards=device_num, shard_id=rank) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.MindDataset(data_files, + columns_list=columns, + num_parallel_workers=num_parallel_workers, shuffle=do_shuffle, + num_shards=device_num, shard_id=rank) + ori_dataset_size = data_set.get_dataset_size() logger.info('origin dataset size: {}'.format(ori_dataset_size)) - ds = ds.map(operations=self.preprocess_fn, - input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], - output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", - "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], - column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", - "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], - num_parallel_workers=num_parallel_workers, - python_multiprocessing=True) - ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat count: {}".format(ds.get_repeat_count())) - return ds - + data_set = data_set.map(operations=self.preprocess_fn, + input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"], + output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", + "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], + column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask", + "reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"], + num_parallel_workers=num_parallel_workers, + python_multiprocessing=True) + data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat count: {}".format(data_set.get_repeat_count())) + return data_set def create_eval_dataset(self, batch_size=1, num_parallel_workers=1): """create testing dataset based on coco format""" + def generator(): for i in range(self.num_samples): yield self.__getitem__(i) + column = ["image", "image_id"] - ds = de.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) - ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8) - return ds + data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers) + data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8) + return data_set if __name__ == '__main__': # Convert coco2017 dataset to mindrecord to improve performance on host from src.config import dataset_config + parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset') parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.") parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.") diff --git a/model_zoo/research/cv/ghostnet/src/dataset.py b/model_zoo/research/cv/ghostnet/src/dataset.py index 1da50b2bf11..e5ec9f9710f 100644 --- a/model_zoo/research/cv/ghostnet/src/dataset.py +++ b/model_zoo/research/cv/ghostnet/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
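The create_train_dataset hunk above is the one place in this patch where map fans a few input columns out into twelve training targets, which is why it needs both output_columns and column_order. A reduced sketch of that shape, with a hypothetical per-sample function and toy data:

import numpy as np
import mindspore.dataset as ds

def split_fn(image, label):
    # Hypothetical preprocess function: one (image, label) pair becomes three columns.
    image = image.astype(np.float32) / 255.0
    onehot = np.zeros(10, dtype=np.float32)
    onehot[int(label)] = 1.0
    mask = np.ones_like(image, dtype=np.float32)
    return image, onehot, mask

data = [(np.random.randint(0, 255, (8, 8), dtype=np.uint8), np.array(i % 10))
        for i in range(16)]
data_set = ds.GeneratorDataset(data, ["image", "label"], shuffle=False)
data_set = data_set.map(operations=split_fn,
                        input_columns=["image", "label"],
                        output_columns=["image", "onehot", "mask"],
                        column_order=["image", "onehot", "mask"],
                        num_parallel_workers=1)
data_set = data_set.batch(4, drop_remainder=True)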
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.vision.py_transforms as P import mindspore.dataset.transforms.c_transforms as C2 @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch rank_size = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) if rank_size == 1: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) elif platform == "GPU": if do_train: from mindspore.communication.management import get_rank, get_group_size - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=True) else: raise ValueError("Unsupport platform.") @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch color_op = C.RandomColorAdjust( brightness=0.4, contrast=0.4, saturation=0.4) - rescale_op = C.Rescale(1/255.0, 0) + rescale_op = C.Rescale(1 / 255.0, 0) normalize_op = C.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) change_swap_op = C.HWC2CHW() @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch trans = composeop() type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="image", operations=trans, - num_parallel_workers=8) - ds = ds.map(input_columns="label_list", - operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="image", operations=trans, + num_parallel_workers=8) + data_set = data_set.map(input_columns="label_list", + operations=type_cast_op, num_parallel_workers=8) # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/model_zoo/research/cv/ghostnet_quant/src/dataset.py b/model_zoo/research/cv/ghostnet_quant/src/dataset.py index 1da50b2bf11..e5ec9f9710f 100644 --- a/model_zoo/research/cv/ghostnet_quant/src/dataset.py +++ b/model_zoo/research/cv/ghostnet_quant/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.vision.py_transforms as P import mindspore.dataset.transforms.c_transforms as C2 @@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch rank_size = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) if rank_size == 1: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) elif platform == "GPU": if do_train: from mindspore.communication.management import get_rank, get_group_size - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=True) else: raise ValueError("Unsupport platform.") @@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch color_op = C.RandomColorAdjust( brightness=0.4, contrast=0.4, saturation=0.4) - rescale_op = C.Rescale(1/255.0, 0) + rescale_op = C.Rescale(1 / 255.0, 0) normalize_op = C.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) change_swap_op = C.HWC2CHW() @@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch trans = composeop() type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="image", operations=trans, - num_parallel_workers=8) - ds = ds.map(input_columns="label_list", - operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="image", operations=trans, + num_parallel_workers=8) + data_set = data_set.map(input_columns="label_list", + operations=type_cast_op, num_parallel_workers=8) # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py b/model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py index e72ea607d96..6f506a65058 100644 --- a/model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py +++ b/model_zoo/research/cv/resnet50_adv_pruning/src/pet_dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.py_transforms as P import mindspore.dataset.transforms.c_transforms as C2 @@ -42,18 +42,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch rank_size = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) if rank_size == 1: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=rank_size, shard_id=rank_id) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=rank_size, shard_id=rank_id) elif platform == "GPU": if do_train: from mindspore.communication.management import get_rank, get_group_size - ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=get_group_size(), shard_id=get_rank()) + data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=get_group_size(), shard_id=get_rank()) else: - ds = de.MindDataset( + data_set = ds.MindDataset( dataset_path, num_parallel_workers=8, shuffle=False) else: raise ValueError("Unsupport platform.") @@ -68,7 +68,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch color_op = C.RandomColorAdjust( brightness=0.4, contrast=0.4, saturation=0.4) - rescale_op = C.Rescale(1/255.0, 0) + rescale_op = C.Rescale(1 / 255.0, 0) normalize_op = C.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) change_swap_op = C.HWC2CHW() @@ -88,18 +88,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch trans = composeop type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="image", operations=trans, - num_parallel_workers=8) - ds = ds.map(input_columns="label_list", - operations=type_cast_op, num_parallel_workers=8) + data_set = data_set.map(input_columns="image", operations=trans, + num_parallel_workers=8) + data_set = data_set.map(input_columns="label_list", + operations=type_cast_op, num_parallel_workers=8) # apply shuffle operations - ds = ds.shuffle(buffer_size=buffer_size) + data_set = data_set.shuffle(buffer_size=buffer_size) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/model_zoo/research/cv/squeezenet/src/dataset.py b/model_zoo/research/cv/squeezenet/src/dataset.py index e1d9c7745cb..38eaef30f30 100755 --- a/model_zoo/research/cv/squeezenet/src/dataset.py +++ b/model_zoo/research/cv/squeezenet/src/dataset.py @@ -17,7 +17,7 @@ create train or eval dataset. 
""" import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 from mindspore.communication.management import init, get_rank, get_group_size @@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path, device_num = get_group_size() if device_num == 1: - ds = de.Cifar10Dataset(dataset_path, - num_parallel_workers=8, - shuffle=True) + data_set = ds.Cifar10Dataset(dataset_path, + num_parallel_workers=8, + shuffle=True) else: - ds = de.Cifar10Dataset(dataset_path, - num_parallel_workers=8, - shuffle=True, - num_shards=device_num, - shard_id=rank_id) + data_set = ds.Cifar10Dataset(dataset_path, + num_parallel_workers=8, + shuffle=True, + num_shards=device_num, + shard_id=rank_id) # define map operations if do_train: @@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, - input_columns="label", - num_parallel_workers=8) - ds = ds.map(operations=trans, - input_columns="image", - num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, + input_columns="label", + num_parallel_workers=8) + data_set = data_set.map(operations=trans, + input_columns="image", + num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def create_dataset_imagenet(dataset_path, @@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path, device_num = get_group_size() if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, - num_parallel_workers=8, - shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, + num_parallel_workers=8, + shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, - num_parallel_workers=8, - shuffle=True, - num_shards=device_num, - shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, + num_parallel_workers=8, + shuffle=True, + num_shards=device_num, + shard_id=rank_id) image_size = 227 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path, type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, - input_columns="label", - num_parallel_workers=8) - ds = ds.map(operations=trans, - input_columns="image", - num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, + input_columns="label", + num_parallel_workers=8) + data_set = data_set.map(operations=trans, + input_columns="image", + num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set def _get_rank_info(): diff --git a/model_zoo/research/recommend/autodis/src/dataset.py b/model_zoo/research/recommend/autodis/src/dataset.py index a61cf2c8c73..01f53a647d0 100644 --- a/model_zoo/research/recommend/autodis/src/dataset.py +++ b/model_zoo/research/recommend/autodis/src/dataset.py @@ -21,7 +21,7 @@ from enum import Enum import numpy as np import pandas as pd -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype from .config import DataConfig @@ 
-142,8 +142,8 @@ class H5Dataset(): X_id = X[:, 0:self.max_length] X_va = X[:, self.max_length:] yield np.array(X_id.astype(dtype=np.int32)), \ - np.array(X_va.astype(dtype=np.float32)), \ - np.array(y.astype(dtype=np.float32)) + np.array(X_va.astype(dtype=np.float32)), \ + np.array(y.astype(dtype=np.float32)) def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): for _ in range(0, numbers_of_batch, 1): yield train_eval_gen.__next__() - ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) - ds = ds.repeat(epochs) - return ds + data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"]) + data_set = data_set.repeat(epochs) + return data_set def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 shuffle = train_mode if rank_size is not None and rank_id is not None: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, - num_parallel_workers=8) + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) else: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - shuffle=shuffle, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), - np.array(y).flatten().reshape(batch_size, 39), - np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), + np.array(y).flatten().reshape(batch_size, 39), + np.array(z).flatten().reshape(batch_size, 1))), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, for filename in filenames: if file_prefixt_name in filename and 'tfrecord' in filename: dataset_files.append(os.path.join(dir_path, filename)) - schema = de.Schema() + schema = ds.Schema() schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8, - num_shards=rank_size, shard_id=rank_id, - shard_equal_rows=True) + data_set = 
ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8, + num_shards=rank_size, shard_id=rank_id, + shard_equal_rows=True) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: ( + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: ( np.array(x).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(batch_size, 39), np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, diff --git a/tests/st/model_zoo_tests/DeepFM/src/dataset.py b/tests/st/model_zoo_tests/DeepFM/src/dataset.py index 15cc9407f8f..2510c630f34 100644 --- a/tests/st/model_zoo_tests/DeepFM/src/dataset.py +++ b/tests/st/model_zoo_tests/DeepFM/src/dataset.py @@ -21,7 +21,7 @@ from enum import Enum import pandas as pd import numpy as np -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype from .config import DataConfig @@ -142,8 +142,8 @@ class H5Dataset(): X_id = X[:, 0:self.max_length] X_va = X[:, self.max_length:] yield np.array(X_id.astype(dtype=np.int32)), \ - np.array(X_va.astype(dtype=np.float32)), \ - np.array(y.astype(dtype=np.float32)) + np.array(X_va.astype(dtype=np.float32)), \ + np.array(y.astype(dtype=np.float32)) def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): @@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000): for _ in range(0, numbers_of_batch, 1): yield train_eval_gen.__next__() - ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) - ds = ds.repeat(epochs) - return ds + data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000) + data_set = data_set.repeat(epochs) + return data_set def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 shuffle = train_mode if rank_size is not None and rank_id is not None: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, - num_parallel_workers=8) + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) else: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - shuffle=shuffle, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: 
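Both the autodis and DeepFM readers drive TFRecordDataset from an explicit Schema object rather than a schema JSON file. In isolation the pattern looks like this; the file list is a placeholder:

import mindspore.common.dtype as mstype
import mindspore.dataset as ds

dataset_files = ["./train_0.tfrecord"]  # placeholder paths

schema = ds.Schema()
schema.add_column('feat_ids', de_type=mstype.int32)
schema.add_column('feat_vals', de_type=mstype.float32)
schema.add_column('label', de_type=mstype.float32)

data_set = ds.TFRecordDataset(dataset_files=dataset_files, schema=schema,
                              shuffle=True, num_parallel_workers=8)
data_set = data_set.batch(1, drop_remainder=True)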
(np.array(x).flatten().reshape(batch_size, 39), - np.array(y).flatten().reshape(batch_size, 39), - np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), + np.array(y).flatten().reshape(batch_size, 39), + np.array(z).flatten().reshape(batch_size, 1))), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, @@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000, for filename in filenames: if file_prefixt_name in filename and 'tfrecord' in filename: dataset_files.append(os.path.join(dir_path, filename)) - schema = de.Schema() + schema = ds.Schema() schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8, - num_shards=rank_size, shard_id=rank_id, - shard_equal_rows=True, num_samples=3000) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8, + num_shards=rank_size, shard_id=rank_id, + shard_equal_rows=True, num_samples=3000) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, - schema=schema, num_parallel_workers=8, num_samples=3000) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: ( + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, + schema=schema, num_parallel_workers=8, num_samples=3000) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: ( np.array(x).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(batch_size, 39), np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000, diff --git a/tests/st/model_zoo_tests/transformer/test_transformer.py b/tests/st/model_zoo_tests/transformer/test_transformer.py index 327d58fdd42..fff7c095360 100644 --- a/tests/st/model_zoo_tests/transformer/test_transformer.py +++ b/tests/st/model_zoo_tests/transformer/test_transformer.py @@ -24,17 +24,18 @@ from mindspore.nn.optim import Adam from mindspore.train.model import Model from mindspore.train.loss_scale_manager import DynamicLossScaleManager from mindspore.train.callback import Callback -import 
mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as deC from mindspore import context from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \ - TransformerTrainOneStepWithLossScaleCell + TransformerTrainOneStepWithLossScaleCell from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"] + def get_config(version='base', batch_size=1): """get config""" if version == 'large': @@ -75,23 +76,25 @@ def get_config(version='base', batch_size=1): transformer_cfg = TransformerConfig(batch_size=batch_size) return transformer_cfg + def load_test_data(batch_size=1, data_file=None): """Load test dataset.""" - ds = de.MindDataset(data_file, - columns_list=["source_eos_ids", "source_eos_mask", - "target_sos_ids", "target_sos_mask", - "target_eos_ids", "target_eos_mask"], - shuffle=False) + data_set = ds.MindDataset(data_file, + columns_list=["source_eos_ids", "source_eos_mask", + "target_sos_ids", "target_sos_mask", + "target_eos_ids", "target_eos_mask"], + shuffle=False) type_cast_op = deC.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids") - ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + return data_set + class ModelCallback(Callback): def __init__(self): @@ -107,13 +110,16 @@ class ModelCallback(Callback): self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) + class TimeMonitor(Callback): """Time Monitor.""" + def __init__(self, data_size): super(TimeMonitor, self).__init__() self.data_size = data_size self.epoch_mseconds_list = [] self.per_step_mseconds_list = [] + def epoch_begin(self, run_context): self.epoch_time = time.time() @@ -122,6 +128,7 @@ class TimeMonitor(Callback): self.epoch_mseconds_list.append(epoch_mseconds) self.per_step_mseconds_list.append(epoch_mseconds / self.data_size) + @pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training @@ -142,7 +149,7 @@ def test_transformer(): netwithloss = TransformerNetworkWithLoss(config, True) lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay", - training_steps=dataset.get_dataset_size()*epoch_size, + 
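load_test_data above casts its six MindRecord columns one map call at a time; the repeated lines amount to a loop over column names, which the sketch below makes explicit. The MindRecord path is a placeholder:

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC

columns = ["source_eos_ids", "source_eos_mask", "target_sos_ids",
           "target_sos_mask", "target_eos_ids", "target_eos_mask"]
data_set = ds.MindDataset("./test.mindrecord",  # placeholder path
                          columns_list=columns, shuffle=False)
type_cast_op = deC.TypeCast(mstype.int32)
for column in columns:
    data_set = data_set.map(operations=type_cast_op, input_columns=column)
data_set = data_set.batch(1, drop_remainder=True)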
training_steps=dataset.get_dataset_size() * epoch_size, learning_rate=cfg.lr_schedule.learning_rate, warmup_steps=cfg.lr_schedule.warmup_steps, hidden_size=config.hidden_size), mstype.float32) @@ -193,5 +200,6 @@ def test_transformer(): print("per step mseconds: {}".format(per_step_mseconds)) assert per_step_mseconds <= expect_per_step_mseconds + 2 + if __name__ == '__main__': test_transformer() diff --git a/tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py b/tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py index d0a5234d9a6..a2f04b638ca 100644 --- a/tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py +++ b/tests/st/model_zoo_tests/wide_and_deep/python_file_for_ci/datasets.py @@ -14,13 +14,13 @@ # ============================================================================ """train_imagenet.""" - import os from enum import Enum import numpy as np -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.common.dtype as mstype + class DataType(Enum): """ Enumerate supported dataset format. @@ -29,6 +29,7 @@ class DataType(Enum): TFRECORD = 2 H5 = 3 + def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, line_per_sample=1000, rank_size=None, rank_id=None): """ @@ -41,26 +42,29 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, for filename in filenames: if file_prefix_name in filename and "tfrecord" in filename: dataset_files.append(os.path.join(dirpath, filename)) - schema = de.Schema() + schema = ds.Schema() schema.add_column('feat_ids', de_type=mstype.int32) schema.add_column('feat_vals', de_type=mstype.float32) schema.add_column('label', de_type=mstype.float32) if rank_size is not None and rank_id is not None: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8, - num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, + num_parallel_workers=8, + num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True) else: - ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), - drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: ( + data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, + num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), + drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: ( np.array(x).flatten().reshape(batch_size, 39), np.array(y).flatten().reshape(batch_size, 39), np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) - #if train_mode: - ds = ds.repeat(epochs) - return ds + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8) + # if train_mode: + data_set = data_set.repeat(epochs) + return data_set + def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000, line_per_sample=1000, rank_size=None, rank_id=None): @@ -84,23 +88,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100 shuffle = train_mode if rank_size is not None and rank_id is not None: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - 
columns_list=['feat_ids', 'feat_vals', 'label'], - num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, - num_parallel_workers=8) + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + num_shards=rank_size, shard_id=rank_id, shuffle=shuffle, + num_parallel_workers=8) else: - ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), - columns_list=['feat_ids', 'feat_vals', 'label'], - shuffle=shuffle, num_parallel_workers=8) - ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True) - ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), - np.array(y).flatten().reshape(batch_size, 39), - np.array(z).flatten().reshape(batch_size, 1))), - input_columns=['feat_ids', 'feat_vals', 'label'], - column_order=['feat_ids', 'feat_vals', 'label'], - num_parallel_workers=8) - ds = ds.repeat(epochs) - return ds + data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name), + columns_list=['feat_ids', 'feat_vals', 'label'], + shuffle=shuffle, num_parallel_workers=8) + data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True) + data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39), + np.array(y).flatten().reshape(batch_size, 39), + np.array(z).flatten().reshape(batch_size, 1))), + input_columns=['feat_ids', 'feat_vals', 'label'], + column_order=['feat_ids', 'feat_vals', 'label'], + num_parallel_workers=8) + data_set = data_set.repeat(epochs) + return data_set def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000, diff --git a/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py b/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py index adc88dd7e91..5dabf024152 100644 --- a/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py +++ b/tests/st/networks/models/bert/bert_performance/test_bert_tdt_lossscale.py @@ -20,7 +20,7 @@ import time import numpy as np import pytest import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C from mindspore import context from mindspore import log as logger @@ -35,7 +35,6 @@ from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertNetworkWit from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell from model_zoo.official.nlp.bert.src.bert_model import BertConfig - _current_dir = os.path.dirname(os.path.realpath(__file__)) DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"] SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json" @@ -88,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): repeat_count = 1 sink_size = -1 batch_size = 16 - ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", - "next_sentence_labels", "masked_lm_positions", - "masked_lm_ids", "masked_lm_weights"], shuffle=False) + data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", + "next_sentence_labels", "masked_lm_positions", + "masked_lm_ids", "masked_lm_weights"], + shuffle=False) type_cast_op = C.TypeCast(mstype.int32) new_repeat_count = repeat_count if sink_mode: sink_size = 100 new_repeat_count = 3 - ds = ds.map(operations=type_cast_op, 
input_columns="masked_lm_ids") - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat_count: {}".format(ds.get_repeat_count())) - return ds, new_repeat_count, sink_size + data_set = data_set.batch(batch_size, drop_remainder=True) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat_count: {}".format(data_set.get_repeat_count())) + return data_set, new_repeat_count, sink_size def weight_variable(shape): @@ -155,13 +155,16 @@ class ModelCallback(Callback): self.lossscale_list.append(cb_params.net_outputs[2].asnumpy()) print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs))) + class TimeMonitor(Callback): """Time Monitor.""" + def __init__(self, data_size): super(TimeMonitor, self).__init__() self.data_size = data_size self.epoch_mseconds_list = [] self.per_step_mseconds_list = [] + def epoch_begin(self, run_context): self.epoch_time = time.time() @@ -178,7 +181,7 @@ class TimeMonitor(Callback): def test_bert_performance(): """test bert performance""" context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) - ds, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) + data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True) version = os.getenv('VERSION', 'large') config = get_config(version=version) netwithloss = BertNetworkWithLoss(config, True) @@ -221,7 +224,7 @@ def test_bert_performance(): logger.info("***************** BERT param name is 3 {}".format(name)) param.set_data(weight_variable(value.asnumpy().shape)) time_monitor_callback = TimeMonitor(sink_size) - model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback], + model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback], dataset_sink_mode=True, sink_size=sink_size) # assertion occurs while the loss value, overflow state or loss_scale value is wrong @@ -250,5 +253,6 @@ def test_bert_performance(): print("per step mseconds: {}".format(per_step_mseconds)) assert per_step_mseconds <= expect_per_step_mseconds + 1 + if __name__ == '__main__': test_bert_performance() diff --git a/tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py b/tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py index e10962aaf62..b3aa63d227a 100644 --- a/tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py +++ b/tests/st/networks/models/bert/bert_performance/test_bert_thor_mlperf.py @@ -20,7 +20,7 @@ import time from multiprocessing import Process, Queue import pytest import 
numpy as np -import mindspore.dataset as dataset +import mindspore.dataset as ds import mindspore.common.dtype as mstype import mindspore.communication.management as D from mindspore import context @@ -28,7 +28,6 @@ from mindspore import log as logger from mindspore.train.callback import Callback from mindspore.context import ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net -import mindspore.dataset.engine.datasets as de import mindspore.dataset.transforms.c_transforms as C from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg @@ -45,11 +44,13 @@ train_steps = 200 batch_size = 12 np.random.seed(1) -dataset.config.set_seed(1) +ds.config.set_seed(1) os.environ['GLOG_v'] = str(2) + class TimeMonitor(Callback): """Time Monitor.""" + def __init__(self, data_size): super(TimeMonitor, self).__init__() self.data_size = data_size @@ -67,6 +68,7 @@ class TimeMonitor(Callback): self.per_step_mseconds_list.append(per_step_mseconds) print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True) + class LossCallback(Callback): def __init__(self): super(LossCallback, self).__init__() @@ -78,6 +80,7 @@ class LossCallback(Callback): print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num, str(cb_params.net_outputs)), flush=True) + def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None): """create train dataset""" # apply repeat operations @@ -87,25 +90,25 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, if "tfrecord" in file_name: data_files.append(os.path.join(data_dir, file_name)) data_files = sorted(data_files) - ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], - shuffle=de.Shuffle.FILES if do_shuffle == "true" else False, - num_shards=device_num, shard_id=rank, shard_equal_rows=True) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False, + num_shards=device_num, shard_id=rank, shard_equal_rows=True) + ori_dataset_size = data_set.get_dataset_size() print('origin dataset size: ', ori_dataset_size) type_cast_op = C.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = 
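create_bert_dataset above shards the pretraining TFRecords across devices and shuffles at file granularity via ds.Shuffle.FILES. Reduced to its essentials (data file list and schema path are placeholders supplied by the caller):

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

def bert_pretrain_dataset(data_files, schema_file=None, device_num=1, rank=0, batch_size=12):
    columns = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
               "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]
    data_set = ds.TFRecordDataset(data_files, schema_file, columns_list=columns,
                                  shuffle=ds.Shuffle.FILES,   # shuffle file order only
                                  num_shards=device_num, shard_id=rank,
                                  shard_equal_rows=True)
    type_cast_op = C.TypeCast(mstype.int32)
    for column in columns[:-1]:   # every integer column; the lm weights stay float
        data_set = data_set.map(operations=type_cast_op, input_columns=column)
    return data_set.batch(batch_size, drop_remainder=True)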
data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat count: {}".format(ds.get_repeat_count())) - return ds + data_set = data_set.batch(batch_size, drop_remainder=True) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat count: {}".format(data_set.get_repeat_count())) + return data_set def _set_bert_all_reduce_split(): @@ -151,13 +154,13 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): device_num=device_num) bert_net_cfg.num_hidden_layers = 4 - ds = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, schema_dir=None) + data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, + schema_dir=None) net_with_loss = BertNetworkWithLoss(bert_net_cfg, True) - new_repeat_count = epoch_size * ds.get_dataset_size() // data_sink_steps + new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps) - lr = get_bert_lr() damping = get_bert_damping() optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum, @@ -175,7 +178,7 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num): net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer) model = Model(net_with_grads, frequency=cfg.Thor.frequency) - model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) + model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps) loss_list = loss_callback.loss_list per_step_mseconds = time_monitor_callback.per_step_mseconds_list @@ -230,5 +233,6 @@ def test_bert_thor_mlperf_8p(): assert mean_cost < 64.2 assert mean_loss < 7.9 + if __name__ == '__main__': test_bert_thor_mlperf_8p() diff --git a/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py b/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py index b431a554c53..3f6d9e72c5c 100644 --- a/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py +++ b/tests/st/networks/models/bert/bert_precision/test_bert_tdt_lossscale.py @@ -20,7 +20,7 @@ import time import numpy as np import pytest import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C from mindspore import context from mindspore import log as logger @@ -87,25 +87,26 @@ def me_de_train_dataset(sink_mode=False): repeat_count = 1 sink_size = -1 batch_size = 16 - ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", - "next_sentence_labels", "masked_lm_positions", - "masked_lm_ids", "masked_lm_weights"], shuffle=False) + data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids", + "next_sentence_labels", "masked_lm_positions", + "masked_lm_ids", "masked_lm_weights"], + shuffle=False) type_cast_op = C.TypeCast(mstype.int32) new_repeat_count = repeat_count if sink_mode: sink_size = 100 new_repeat_count = 3 - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") - ds = 
ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeat_count: {}".format(ds.get_repeat_count())) - return ds, new_repeat_count, sink_size + data_set = data_set.batch(batch_size, drop_remainder=True) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeat_count: {}".format(data_set.get_repeat_count())) + return data_set, new_repeat_count, sink_size def weight_variable(shape): @@ -178,11 +179,11 @@ def test_bert_percision(enable_graph_kernel=False): context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False) if enable_graph_kernel: context.set_context(enable_graph_kernel=True) - ds, new_repeat_count, _ = me_de_train_dataset() + data_set, new_repeat_count, _ = me_de_train_dataset() version = os.getenv('VERSION', 'large') config = get_config(version=version) netwithloss = BertNetworkWithLoss(config, True) - lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count, + lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count, learning_rate=5e-5, end_learning_rate=1e-9, power=10.0, warmup_steps=0) decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower() @@ -218,7 +219,7 @@ def test_bert_percision(enable_graph_kernel=False): else: logger.info("***************** BERT param name is 3 {}".format(name)) param.set_data(weight_variable(value.asnumpy().shape)) - model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False) + model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False) # assertion occurs while the loss value, overflow state or loss_scale value is wrong loss_value = np.array(callback.loss_list) diff --git a/tests/st/networks/models/bert/src/dataset.py b/tests/st/networks/models/bert/src/dataset.py index 6a33a6b5840..dc3abbbe9c9 100644 --- a/tests/st/networks/models/bert/src/dataset.py +++ b/tests/st/networks/models/bert/src/dataset.py @@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py """ import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine.datasets as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C from mindspore import log as logger from .config import bert_net_cfg @@ -32,24 +32,24 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", d for file_name in files: if "tfrecord" in file_name: data_files.append(os.path.join(data_dir, file_name)) - ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, - columns_list=["input_ids", 
"input_mask", "segment_ids", "next_sentence_labels", - "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], - shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, - shard_equal_rows=True) - ori_dataset_size = ds.get_dataset_size() + data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None, + columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", + "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], + shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, + shard_equal_rows=True) + ori_dataset_size = data_set.get_dataset_size() print('origin dataset size: ', ori_dataset_size) - new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size()) + new_repeat_count = int(repeat_count * ori_dataset_size // data_set.get_dataset_size()) type_cast_op = C.TypeCast(mstype.int32) - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids") - ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions") - ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels") - ds = ds.map(operations=type_cast_op, input_columns="segment_ids") - ds = ds.map(operations=type_cast_op, input_columns="input_mask") - ds = ds.map(operations=type_cast_op, input_columns="input_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions") + data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels") + data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids") + data_set = data_set.map(operations=type_cast_op, input_columns="input_mask") + data_set = data_set.map(operations=type_cast_op, input_columns="input_ids") # apply batch operations - ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True) - ds = ds.repeat(max(new_repeat_count, repeat_count)) - logger.info("data size: {}".format(ds.get_dataset_size())) - logger.info("repeatcount: {}".format(ds.get_repeat_count())) - return ds, new_repeat_count + data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True) + data_set = data_set.repeat(max(new_repeat_count, repeat_count)) + logger.info("data size: {}".format(data_set.get_dataset_size())) + logger.info("repeatcount: {}".format(data_set.get_repeat_count())) + return data_set, new_repeat_count diff --git a/tests/st/networks/models/resnet50/src/dataset.py b/tests/st/networks/models/resnet50/src/dataset.py index c65824166e0..799b1fed748 100755 --- a/tests/st/networks/models/resnet50/src/dataset.py +++ b/tests/st/networks/models/resnet50/src/dataset.py @@ -17,7 +17,7 @@ import os import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.transforms.c_transforms as C2 @@ -39,10 +39,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): device_num = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) 
image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -65,15 +65,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): C.HWC2CHW() ] - type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) - return ds + data_set = data_set.repeat(repeat_num) + return data_set diff --git a/tests/st/networks/models/resnet50/src_thor/dataset.py b/tests/st/networks/models/resnet50/src_thor/dataset.py index 8179bdd1734..d7707f18e4b 100644 --- a/tests/st/networks/models/resnet50/src_thor/dataset.py +++ b/tests/st/networks/models/resnet50/src_thor/dataset.py @@ -18,12 +18,11 @@ import os import mindspore.common.dtype as mstype -import mindspore.dataset as dataset -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C -dataset.config.set_seed(1) +ds.config.set_seed(1) def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): @@ -43,10 +42,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): device_num = int(os.getenv("RANK_SIZE")) rank_id = int(os.getenv("RANK_ID")) if device_num == 1: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) + data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) image_size = 224 mean = [0.485 * 255, 0.456 * 255, 0.406 * 255] @@ -71,12 +70,12 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) - return ds + data_set = data_set.repeat(repeat_num) + return data_set diff --git a/tests/st/quantization/resnet50_quant/dataset.py b/tests/st/quantization/resnet50_quant/dataset.py index fd4df32d9f1..ec59a50d3d2 100755 --- a/tests/st/quantization/resnet50_quant/dataset.py +++ b/tests/st/quantization/resnet50_quant/dataset.py @@ -14,11 +14,10 @@ # ============================================================================ """ create train dataset. 
""" - from functools import partial import mindspore.common.dtype as mstype -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.vision.c_transforms as C @@ -37,8 +36,8 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): dataset """ - load_func = partial(de.Cifar10Dataset, dataset_path) - ds = load_func(num_parallel_workers=8, shuffle=False) + load_func = partial(ds.Cifar10Dataset, dataset_path) + data_set = load_func(num_parallel_workers=8, shuffle=False) resize_height = config.image_height resize_width = config.image_width @@ -54,15 +53,15 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32): type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(operations=c_trans, input_columns="image", - num_parallel_workers=8) - ds = ds.map(operations=type_cast_op, - input_columns="label", num_parallel_workers=8) + data_set = data_set.map(operations=c_trans, input_columns="image", + num_parallel_workers=8) + data_set = data_set.map(operations=type_cast_op, + input_columns="label", num_parallel_workers=8) # apply batch operations - ds = ds.batch(batch_size, drop_remainder=True) + data_set = data_set.batch(batch_size, drop_remainder=True) # apply dataset repeat operation - ds = ds.repeat(repeat_num) + data_set = data_set.repeat(repeat_num) - return ds + return data_set diff --git a/tests/ut/python/dataset/test_autocontrast.py b/tests/ut/python/dataset/test_autocontrast.py index fd8a7d45c3f..550450f45fe 100644 --- a/tests/ut/python/dataset/test_autocontrast.py +++ b/tests/ut/python/dataset/test_autocontrast.py @@ -16,7 +16,7 @@ Testing AutoContrast op in DE """ import numpy as np -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.py_transforms as F import mindspore.dataset.vision.c_transforms as C @@ -36,13 +36,13 @@ def test_auto_contrast_py(plot=False): logger.info("Test AutoContrast Python Op") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.ToTensor()]) - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -55,7 +55,7 @@ def test_auto_contrast_py(plot=False): axis=0) # AutoContrast Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_auto_contrast = \ mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), @@ -63,7 +63,7 @@ def test_auto_contrast_py(plot=False): F.AutoContrast(cutoff=10.0, ignore=[10, 20]), F.ToTensor()]) - ds_auto_contrast = ds.map(operations=transforms_auto_contrast, input_columns="image") + ds_auto_contrast = data_set.map(operations=transforms_auto_contrast, input_columns="image") ds_auto_contrast = ds_auto_contrast.batch(512) @@ -96,15 +96,15 @@ def test_auto_contrast_c(plot=False): logger.info("Test AutoContrast C Op") # AutoContrast Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = 
data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20]) c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20]) transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)), python_op, np.array]) - ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") + ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") ds_auto_contrast_py = ds_auto_contrast_py.batch(512) @@ -116,10 +116,10 @@ def test_auto_contrast_c(plot=False): image.asnumpy(), axis=0) - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) - ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") + ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") ds_auto_contrast_c = ds_auto_contrast_c.batch(512) @@ -153,8 +153,8 @@ def test_auto_contrast_one_channel_c(plot=False): logger.info("Test AutoContrast C Op With One Channel Images") # AutoContrast Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) python_op = F.AutoContrast() c_op = C.AutoContrast() # not using F.ToTensor() since it converts to floats @@ -164,7 +164,7 @@ def test_auto_contrast_one_channel_c(plot=False): python_op, np.array]) - ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image") + ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image") ds_auto_contrast_py = ds_auto_contrast_py.batch(512) @@ -176,11 +176,11 @@ def test_auto_contrast_one_channel_c(plot=False): image.asnumpy(), axis=0) - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], - input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])], + input_columns=["image"]) - ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image") + ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image") ds_auto_contrast_c = ds_auto_contrast_c.batch(512) @@ -208,9 +208,9 @@ def test_auto_contrast_mnist_c(plot=False): Test AutoContrast C op with MNIST dataset (Grayscale images) """ logger.info("Test AutoContrast C Op With MNIST Images") - ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) - ds_auto_contrast_c = ds.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") - ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + ds_auto_contrast_c = data_set.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image") + ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) images = [] images_trans = [] @@ -236,21 +236,21 @@ def 
test_auto_contrast_invalid_ignore_param_c(): """ logger.info("Test AutoContrast C Op with invalid ignore parameter") try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), - C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) # invalid ignore - ds = ds.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") + data_set = data_set.map(operations=C.AutoContrast(ignore=255.5), input_columns="image") except TypeError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Argument ignore with value 255.5 is not of type" in str(error) try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) # invalid ignore - ds = ds.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") + data_set = data_set.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image") except TypeError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Argument ignore with value (10,100) is not of type" in str(error) @@ -262,22 +262,22 @@ def test_auto_contrast_invalid_cutoff_param_c(): """ logger.info("Test AutoContrast C Op with invalid cutoff parameter") try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), - C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) # invalid ignore - ds = ds.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") + data_set = data_set.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image") except ValueError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Input cutoff is not within the required interval of (0 to 100)." in str(error) try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), - C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), + C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) # invalid ignore - ds = ds.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") + data_set = data_set.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image") except ValueError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Input cutoff is not within the required interval of (0 to 100)." 
in str(error) @@ -289,22 +289,24 @@ def test_auto_contrast_invalid_ignore_param_py(): """ logger.info("Test AutoContrast python Op with invalid ignore parameter") try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), - F.Resize((224, 224)), - F.AutoContrast(ignore=255.5), - F.ToTensor()])], - input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), + F.Resize((224, 224)), + F.AutoContrast( + ignore=255.5), + F.ToTensor()])], + input_columns=["image"]) except TypeError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Argument ignore with value 255.5 is not of type" in str(error) try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), - F.Resize((224, 224)), - F.AutoContrast(ignore=(10, 100)), - F.ToTensor()])], - input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), + F.Resize((224, 224)), + F.AutoContrast( + ignore=(10, 100)), + F.ToTensor()])], + input_columns=["image"]) except TypeError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Argument ignore with value (10,100) is not of type" in str(error) @@ -316,18 +318,19 @@ def test_auto_contrast_invalid_cutoff_param_py(): """ logger.info("Test AutoContrast python Op with invalid cutoff parameter") try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), - F.Resize((224, 224)), - F.AutoContrast(cutoff=-10.0), - F.ToTensor()])], - input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), + F.Resize((224, 224)), + F.AutoContrast( + cutoff=-10.0), + F.ToTensor()])], + input_columns=["image"]) except ValueError as error: logger.info("Got an exception in DE: {}".format(str(error))) assert "Input cutoff is not within the required interval of (0 to 100)." 
in str(error) try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map( + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map( operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.AutoContrast(cutoff=120.0), diff --git a/tests/ut/python/dataset/test_equalize.py b/tests/ut/python/dataset/test_equalize.py index 4411942c6ea..12538595d1b 100644 --- a/tests/ut/python/dataset/test_equalize.py +++ b/tests/ut/python/dataset/test_equalize.py @@ -17,7 +17,7 @@ Testing Equalize op in DE """ import numpy as np -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.py_transforms as F @@ -37,13 +37,13 @@ def test_equalize_py(plot=False): logger.info("Test Equalize") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.ToTensor()]) - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -56,14 +56,14 @@ def test_equalize_py(plot=False): axis=0) # Color Equalized Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.Equalize(), F.ToTensor()]) - ds_equalize = ds.map(operations=transforms_equalize, input_columns="image") + ds_equalize = data_set.map(operations=transforms_equalize, input_columns="image") ds_equalize = ds_equalize.batch(512) @@ -92,11 +92,11 @@ def test_equalize_c(plot=False): logger.info("Test Equalize cpp op") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = [C.Decode(), C.Resize(size=[224, 224])] - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -109,12 +109,12 @@ def test_equalize_c(plot=False): axis=0) # Equalize Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transform_equalize = [C.Decode(), C.Resize(size=[224, 224]), C.Equalize()] - ds_equalize = ds.map(operations=transform_equalize, input_columns="image") + ds_equalize = data_set.map(operations=transform_equalize, input_columns="image") ds_equalize = ds_equalize.batch(512) @@ -142,10 +142,10 @@ def test_equalize_py_c(plot=False): logger.info("Test Equalize cpp and python op") # equalize Images in cpp - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) - ds_c_equalize = ds.map(operations=C.Equalize(), input_columns="image") + ds_c_equalize = data_set.map(operations=C.Equalize(), 
input_columns="image") ds_c_equalize = ds_c_equalize.batch(512) @@ -158,15 +158,15 @@ def test_equalize_py_c(plot=False): axis=0) # Equalize images in python - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), F.ToPIL(), F.Equalize(), np.array]) - ds_p_equalize = ds.map(operations=transforms_p_equalize, input_columns="image") + ds_p_equalize = data_set.map(operations=transforms_p_equalize, input_columns="image") ds_p_equalize = ds_p_equalize.batch(512) @@ -197,11 +197,11 @@ def test_equalize_one_channel(): c_op = C.Equalize() try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) - ds.map(operations=c_op, input_columns="image") + data_set.map(operations=c_op, input_columns="image") except RuntimeError as e: logger.info("Got an exception in DE: {}".format(str(e))) @@ -213,9 +213,9 @@ def test_equalize_mnist_c(plot=False): Test Equalize C op with MNIST dataset (Grayscale images) """ logger.info("Test Equalize C Op With MNIST Images") - ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) - ds_equalize_c = ds.map(operations=C.Equalize(), input_columns="image") - ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + ds_equalize_c = data_set.map(operations=C.Equalize(), input_columns="image") + ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) images = [] images_trans = [] @@ -242,7 +242,7 @@ def test_equalize_md5_py(): logger.info("Test Equalize") # First dataset - data1 = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data1 = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Equalize(), F.ToTensor()]) @@ -260,14 +260,14 @@ def test_equalize_md5_c(): logger.info("Test Equalize cpp op with md5 check") # Generate dataset - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_equalize = [C.Decode(), C.Resize(size=[224, 224]), C.Equalize(), F.ToTensor()] - data = ds.map(operations=transforms_equalize, input_columns="image") + data = data_set.map(operations=transforms_equalize, input_columns="image") # Compare with expected md5 from images filename = "equalize_01_result_c.npz" save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) diff --git a/tests/ut/python/dataset/test_invert.py b/tests/ut/python/dataset/test_invert.py index d3d8dc98599..07a4d5bc6f7 100644 --- a/tests/ut/python/dataset/test_invert.py +++ b/tests/ut/python/dataset/test_invert.py @@ -17,7 +17,7 @@ Testing Invert op in DE """ import numpy as np -import mindspore.dataset.engine as de +import mindspore.dataset as ds import 
mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.py_transforms as F import mindspore.dataset.vision.c_transforms as C @@ -36,13 +36,13 @@ def test_invert_py(plot=False): logger.info("Test Invert Python op") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.ToTensor()]) - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -55,14 +55,14 @@ def test_invert_py(plot=False): axis=0) # Color Inverted Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.Invert(), F.ToTensor()]) - ds_invert = ds.map(operations=transforms_invert, input_columns="image") + ds_invert = data_set.map(operations=transforms_invert, input_columns="image") ds_invert = ds_invert.batch(512) @@ -91,11 +91,11 @@ def test_invert_c(plot=False): logger.info("Test Invert cpp op") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = [C.Decode(), C.Resize(size=[224, 224])] - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -108,12 +108,12 @@ def test_invert_c(plot=False): axis=0) # Invert Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transform_invert = [C.Decode(), C.Resize(size=[224, 224]), C.Invert()] - ds_invert = ds.map(operations=transform_invert, input_columns="image") + ds_invert = data_set.map(operations=transform_invert, input_columns="image") ds_invert = ds_invert.batch(512) @@ -141,10 +141,10 @@ def test_invert_py_c(plot=False): logger.info("Test Invert cpp and python op") # Invert Images in cpp - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) - ds_c_invert = ds.map(operations=C.Invert(), input_columns="image") + ds_c_invert = data_set.map(operations=C.Invert(), input_columns="image") ds_c_invert = ds_c_invert.batch(512) @@ -157,15 +157,15 @@ def test_invert_py_c(plot=False): axis=0) # invert images in python - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"]) transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8), F.ToPIL(), F.Invert(), np.array]) - ds_p_invert = ds.map(operations=transforms_p_invert, input_columns="image") + ds_p_invert = data_set.map(operations=transforms_p_invert, 
input_columns="image") ds_p_invert = ds_p_invert.batch(512) @@ -196,11 +196,11 @@ def test_invert_one_channel(): c_op = C.Invert() try: - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) - ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), - lambda img: np.array(img[:, :, 0])], input_columns=["image"]) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), + lambda img: np.array(img[:, :, 0])], input_columns=["image"]) - ds.map(operations=c_op, input_columns="image") + data_set.map(operations=c_op, input_columns="image") except RuntimeError as e: logger.info("Got an exception in DE: {}".format(str(e))) @@ -214,13 +214,13 @@ def test_invert_md5_py(): logger.info("Test Invert python op with md5 check") # Generate dataset - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Invert(), F.ToTensor()]) - data = ds.map(operations=transforms_invert, input_columns="image") + data = data_set.map(operations=transforms_invert, input_columns="image") # Compare with expected md5 from images filename = "invert_01_result_py.npz" save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) @@ -233,14 +233,14 @@ def test_invert_md5_c(): logger.info("Test Invert cpp op with md5 check") # Generate dataset - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_invert = [C.Decode(), C.Resize(size=[224, 224]), C.Invert(), F.ToTensor()] - data = ds.map(operations=transforms_invert, input_columns="image") + data = data_set.map(operations=transforms_invert, input_columns="image") # Compare with expected md5 from images filename = "invert_01_result_c.npz" save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN) diff --git a/tests/ut/python/dataset/test_random_color.py b/tests/ut/python/dataset/test_random_color.py index 4ea0de372cb..7f78ed48525 100644 --- a/tests/ut/python/dataset/test_random_color.py +++ b/tests/ut/python/dataset/test_random_color.py @@ -19,7 +19,6 @@ import numpy as np import pytest import mindspore.dataset as ds -import mindspore.dataset.engine as de import mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.c_transforms as vision import mindspore.dataset.vision.py_transforms as F @@ -44,7 +43,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): logger.info("Test RandomColor") # Original Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), @@ -63,7 +62,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False): axis=0) # Random Color Adjusted Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), @@ -146,7 +145,7 @@ def test_random_color_py_md5(): original_num_parallel_workers = config_get_set_num_parallel_workers(1) # Generate dataset - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms = 
mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.RandomColor((2.0, 2.5)), @@ -234,7 +233,7 @@ def test_random_color_c_errors(): assert "degrees must be a sequence with length 2." in str(error_info.value) # RandomColor Cpp Op will fail with one channel input - mnist_ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + mnist_ds = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image") with pytest.raises(RuntimeError) as error_info: diff --git a/tests/ut/python/dataset/test_random_sharpness.py b/tests/ut/python/dataset/test_random_sharpness.py index 0acf1a2e0fe..6b1db9620f0 100644 --- a/tests/ut/python/dataset/test_random_sharpness.py +++ b/tests/ut/python/dataset/test_random_sharpness.py @@ -17,7 +17,6 @@ Testing RandomSharpness op in DE """ import numpy as np import mindspore.dataset as ds -import mindspore.dataset.engine as de import mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.py_transforms as F import mindspore.dataset.vision.c_transforms as C @@ -38,7 +37,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): logger.info("Test RandomSharpness python op") # Original Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), @@ -57,7 +56,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False): axis=0) # Random Sharpness Adjusted Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) py_op = F.RandomSharpness() if degrees is not None: @@ -108,7 +107,7 @@ def test_random_sharpness_py_md5(): transform = mindspore.dataset.transforms.py_transforms.Compose(transforms) # Generate dataset - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=transform, input_columns=["image"]) # check results with md5 comparison @@ -128,7 +127,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): logger.info("Test RandomSharpness cpp op") # Original Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = [C.Decode(), C.Resize((224, 224))] @@ -146,7 +145,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False): axis=0) # Random Sharpness Adjusted Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) c_op = C.RandomSharpness() if degrees is not None: @@ -194,7 +193,7 @@ def test_random_sharpness_c_md5(): ] # Generate dataset - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=transforms, input_columns=["image"]) # check results with md5 comparison @@ -213,7 +212,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): logger.info("Test RandomSharpness C and python Op") # RandomSharpness Images - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=[C.Decode(), C.Resize((200, 300))], 
input_columns=["image"]) python_op = F.RandomSharpness(degrees) @@ -236,7 +235,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False): image, axis=0) - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"]) ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image") @@ -271,10 +270,10 @@ def test_random_sharpness_one_channel_c(degrees=(1.4, 1.4), plot=False): if degrees is not None: c_op = C.RandomSharpness(degrees) # RandomSharpness Images - data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) ds_random_sharpness_c = data.map(operations=c_op, input_columns="image") # Original images - data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) images = [] images_trans = [] @@ -296,7 +295,7 @@ def test_random_sharpness_invalid_params(): """ logger.info("Test RandomSharpness with invalid input parameters.") try: - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=[C.Decode(), C.Resize((224, 224)), C.RandomSharpness(10)], input_columns=["image"]) except TypeError as error: @@ -304,7 +303,7 @@ def test_random_sharpness_invalid_params(): assert "tuple" in str(error) try: - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=[C.Decode(), C.Resize((224, 224)), C.RandomSharpness((-10, 10))], input_columns=["image"]) except ValueError as error: @@ -312,7 +311,7 @@ def test_random_sharpness_invalid_params(): assert "interval" in str(error) try: - data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) data = data.map(operations=[C.Decode(), C.Resize((224, 224)), C.RandomSharpness((10, 5))], input_columns=["image"]) except ValueError as error: diff --git a/tests/ut/python/dataset/test_random_solarize_op.py b/tests/ut/python/dataset/test_random_solarize_op.py index 10adcc8108f..7e122542f82 100644 --- a/tests/ut/python/dataset/test_random_solarize_op.py +++ b/tests/ut/python/dataset/test_random_solarize_op.py @@ -17,7 +17,6 @@ Testing RandomSolarizeOp op in DE """ import pytest import mindspore.dataset as ds -import mindspore.dataset.engine as de import mindspore.dataset.vision.c_transforms as vision from mindspore import log as logger from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \ @@ -78,8 +77,8 @@ def test_random_solarize_mnist(plot=False, run_golden=True): Test RandomSolarize op with MNIST dataset (Grayscale images) """ - mnist_1 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) - mnist_2 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) + mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False) mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image") images = [] diff --git a/tests/ut/python/dataset/test_uniform_augment.py 
b/tests/ut/python/dataset/test_uniform_augment.py index 8d09cf9df70..06a8338ce27 100644 --- a/tests/ut/python/dataset/test_uniform_augment.py +++ b/tests/ut/python/dataset/test_uniform_augment.py @@ -18,7 +18,7 @@ Testing UniformAugment in DE import numpy as np import pytest -import mindspore.dataset.engine as de +import mindspore.dataset as ds import mindspore.dataset.transforms.py_transforms import mindspore.dataset.vision.c_transforms as C import mindspore.dataset.vision.py_transforms as F @@ -35,13 +35,13 @@ def test_uniform_augment(plot=False, num_ops=2): logger.info("Test UniformAugment") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(), F.Resize((224, 224)), F.ToTensor()]) - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -54,7 +54,7 @@ def test_uniform_augment(plot=False, num_ops=2): axis=0) # UniformAugment Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transform_list = [F.RandomRotation(45), F.RandomColor(), @@ -70,7 +70,7 @@ def test_uniform_augment(plot=False, num_ops=2): num_ops=num_ops), F.ToTensor()]) - ds_ua = ds.map(operations=transforms_ua, input_columns="image") + ds_ua = data_set.map(operations=transforms_ua, input_columns="image") ds_ua = ds_ua.batch(512) @@ -99,12 +99,12 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): logger.info("Test CPP UniformAugment") # Original Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_original = [C.Decode(), C.Resize(size=[224, 224]), F.ToTensor()] - ds_original = ds.map(operations=transforms_original, input_columns="image") + ds_original = data_set.map(operations=transforms_original, input_columns="image") ds_original = ds_original.batch(512) @@ -117,7 +117,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): axis=0) # UniformAugment Images - ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) + data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False) transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]), C.RandomHorizontalFlip(), C.RandomVerticalFlip(), @@ -130,7 +130,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2): uni_aug, F.ToTensor()] - ds_ua = ds.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) + ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1) ds_ua = ds_ua.batch(512) @@ -240,7 +240,7 @@ def test_cpp_uniform_augment_random_crop_badinput(num_ops=1): logger.info("Test CPP UniformAugment with random_crop bad input") batch_size = 2 cifar10_dir = "../data/dataset/testCifar10Data" - ds1 = de.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] + ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3] transforms_ua = [ # Note: crop size [224, 224] > image size [32, 32]