forked from mindspore-Ecosystem/mindspore

change code to import APIs from mindspore.dataset rather than mindspore.dataset.engine

parent 3ba3ffedd4
commit 31fed1a2f6
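The same mechanical change is applied to every file in the hunks below. A minimal sketch of the pattern (the data_path argument here is only an illustrative placeholder, not a path from the repository): the dataset classes are now imported from the public mindspore.dataset package instead of the internal mindspore.dataset.engine module, and local variables previously named ds are renamed to data_set so they no longer collide with the new module alias.

    # Before: dataset classes come from the internal engine module.
    import mindspore.dataset.engine as de

    data_set = de.ImageFolderDataset(data_path, num_parallel_workers=8, shuffle=True)

    # After: the same classes come from the public package; local variables named
    # "ds" are renamed to "data_set" so they do not shadow the module alias "ds".
    import mindspore.dataset as ds

    data_set = ds.ImageFolderDataset(data_path, num_parallel_workers=8, shuffle=True)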
@@ -14,7 +14,7 @@
 # ============================================================================
 """generate dataloader and data processing entry"""

-import mindspore.dataset.engine as de
+import mindspore.dataset as ds

 from src.utils import DistributedSampler

@@ -32,7 +32,7 @@ def GetDataLoader(per_batch_size,
     """
     centerface_gen = CenterfaceDataset(config=config, split=split)
     sampler = DistributedSampler(centerface_gen, rank, group_size, shuffle=(split == 'train'))  # user defined sampling strategy
-    de_dataset = de.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)
+    de_dataset = ds.GeneratorDataset(centerface_gen, ["image", "anns"], sampler=sampler, num_parallel_workers=16)

     if group_size > 1:
         num_parallel_workers = 24
@@ -17,7 +17,7 @@ Data operations, will be used in train.py and eval.py
 """
 import os

-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noise_gaussian, noise_salt_pepper, \
     shift_color, enhance_brightness, enhance_sharpness, enhance_contrast, enhance_color, gaussian_blur, \
@@ -26,6 +26,7 @@ from src.dataset_utils import lucky, noise_blur, noise_speckle, noise_gamma, noi

 import cv2
 import numpy as np

 cv2.setNumThreads(0)

 image_height = None
@@ -179,23 +180,24 @@ def create_dataset_train(mindrecord_file_pos, config):
     rank_id = int(os.getenv("RANK_ID", '0'))
     decode = C.Decode()

-    ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
-                        num_shards=rank_size, shard_id=rank_id, shuffle=True)
-    ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+    data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=4,
+                              num_shards=rank_size, shard_id=rank_id, shuffle=True)
+    data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)

     augmentor = Augmentor(config.augment_severity, config.augment_prob)
     operation = augmentor.process
-    ds = ds.map(operations=operation, input_columns=["image"],
-                num_parallel_workers=1, python_multiprocessing=True)
+    data_set = data_set.map(operations=operation, input_columns=["image"],
+                            num_parallel_workers=1, python_multiprocessing=True)
     ##randomly augment half of samples to be negative samples
-    ds = ds.map(operations=[random_neg_with_rotate, unify_img_label, transform_image], input_columns=["image", "label"],
-                num_parallel_workers=8, python_multiprocessing=True)
-    ##for training double the dataset to accoun for positive and negative
-    ds = ds.repeat(2)
+    data_set = data_set.map(operations=[random_neg_with_rotate, unify_img_label, transform_image],
+                            input_columns=["image", "label"],
+                            num_parallel_workers=8, python_multiprocessing=True)
+    ##for training double the data_set to accoun for positive and negative
+    data_set = data_set.repeat(2)

     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
-    return ds
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
+    return data_set


 def resize_image(img, label):
@@ -230,17 +232,18 @@ def create_dataset_eval(mindrecord_file_pos, config):
     rank_id = int(os.getenv("RANK_ID", '0'))
     decode = C.Decode()

-    ds = de.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
-                        num_shards=rank_size, shard_id=rank_id, shuffle=False)
-    ds = ds.map(operations=decode, input_columns=["image"], num_parallel_workers=8)
+    data_set = ds.MindDataset(mindrecord_file_pos, columns_list=["image", "label"], num_parallel_workers=1,
+                              num_shards=rank_size, shard_id=rank_id, shuffle=False)
+    data_set = data_set.map(operations=decode, input_columns=["image"], num_parallel_workers=8)

     global image_height
     global image_width
     image_height = config.im_size_h
     image_width = config.im_size_w
-    ds = ds.map(operations=resize_image, input_columns=["image", "label"], num_parallel_workers=config.work_nums,
-                python_multiprocessing=False)
+    data_set = data_set.map(operations=resize_image, input_columns=["image", "label"],
+                            num_parallel_workers=config.work_nums,
+                            python_multiprocessing=False)
     # apply batch operations
-    ds = ds.batch(1, drop_remainder=True)
+    data_set = data_set.batch(1, drop_remainder=True)

-    return ds
+    return data_set
@@ -16,7 +16,7 @@
 import os
 import numpy as np
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C
 import mindspore.dataset.vision.c_transforms as vc
 from PIL import Image, ImageFile
@@ -105,7 +105,7 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
         dataset = IIIT5KDataset(dataset_path, "annotation.txt", config)
     else:
         raise ValueError(f"unsupported dataset name: {name}")
-    ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
+    data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
     image_trans = [
         vc.Resize((config.image_height, config.image_width)),
         vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
@@ -114,8 +114,8 @@ def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, i
     label_trans = [
         C.TypeCast(mstype.int32)
     ]
-    ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
-    ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
+    data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
+    data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)

-    ds = ds.batch(batch_size, drop_remainder=True)
-    return ds
+    data_set = data_set.batch(batch_size, drop_remainder=True)
+    return data_set
@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
 from src.config import config_gpu as cfg
@@ -37,33 +37,33 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
         dataset
     """
     if group_size == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
-                                   num_shards=group_size, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
+                                         num_shards=group_size, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
             C.RandomCropDecodeResize(299, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
             C.RandomHorizontalFlip(prob=0.5),
             C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
-            ]
+        ]
     else:
         trans = [
             C.Decode(),
             C.Resize(299),
             C.CenterCrop(299)
-            ]
+        ]
     trans += [
         C.Rescale(1.0 / 255.0, 0.0),
         C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
         C.HWC2CHW()
        ]
     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
     # apply batch operations
-    ds = ds.batch(cfg.batch_size, drop_remainder=True)
+    data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
-    return ds
+    data_set = data_set.repeat(repeat_num)
+    return data_set
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -44,10 +44,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = get_group_size()

     if device_num == 1:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                               num_shards=device_num, shard_id=rank_id)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                     num_shards=device_num, shard_id=rank_id)

     # define map operations
     trans = []
@@ -66,15 +66,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@@ -99,10 +99,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = get_group_size()

     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)

     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -127,16 +127,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def _get_rank_info():
@@ -21,7 +21,7 @@ import numpy as np
 from mindspore import Tensor
 from mindspore.train.model import Model
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2

@@ -43,22 +43,22 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):
         rank_size = int(os.getenv("RANK_SIZE", '1'))
         rank_id = int(os.getenv("RANK_ID", '0'))
         if rank_size == 1:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                       num_shards=rank_size, shard_id=rank_id)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                             num_shards=rank_size, shard_id=rank_id)
     elif config.platform == "GPU":
         if do_train:
             if config.run_distribute:
                 from mindspore.communication.management import get_rank, get_group_size
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=get_group_size(), shard_id=get_rank())
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=get_group_size(), shard_id=get_rank())
             else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     elif config.platform == "CPU":
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)

     resize_height = config.image_height
     resize_width = config.image_width
@@ -83,19 +83,19 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1):

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply shuffle operations
-    ds = ds.shuffle(buffer_size=buffer_size)
+    data_set = data_set.shuffle(buffer_size=buffer_size)

     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def extract_features(net, dataset_path, config):
@@ -121,5 +121,5 @@ def extract_features(net, dataset_path, config):
             features = model.predict(Tensor(image))
            np.save(features_path, features.asnumpy())
            np.save(label_path, label)
-        print(f"Complete the batch {i+1}/{step_size}")
+        print(f"Complete the batch {i + 1}/{step_size}")
     return step_size
@@ -18,7 +18,7 @@ create train or eval dataset.
 import os
 from functools import partial
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.transforms.py_transforms as P2
@@ -43,24 +43,24 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
         rank_id = int(os.getenv("RANK_ID"))
         columns_list = ['image', 'label']
         if config.data_load_mode == "mindrecord":
-            load_func = partial(de.MindDataset, dataset_path, columns_list)
+            load_func = partial(ds.MindDataset, dataset_path, columns_list)
         else:
-            load_func = partial(de.ImageFolderDataset, dataset_path)
+            load_func = partial(ds.ImageFolderDataset, dataset_path)
         if do_train:
             if rank_size == 1:
-                ds = load_func(num_parallel_workers=8, shuffle=True)
+                data_set = load_func(num_parallel_workers=8, shuffle=True)
             else:
-                ds = load_func(num_parallel_workers=8, shuffle=True,
-                               num_shards=rank_size, shard_id=rank_id)
+                data_set = load_func(num_parallel_workers=8, shuffle=True,
+                                     num_shards=rank_size, shard_id=rank_id)
         else:
-            ds = load_func(num_parallel_workers=8, shuffle=False)
+            data_set = load_func(num_parallel_workers=8, shuffle=False)
     elif device_target == "GPU":
         if do_train:
             from mindspore.communication.management import get_rank, get_group_size
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                       num_shards=get_group_size(), shard_id=get_rank())
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                             num_shards=get_group_size(), shard_id=get_rank())
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
         raise ValueError("Unsupported device_target.")

@@ -69,7 +69,7 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
     if do_train:
         buffer_size = 20480
         # apply shuffle operations
-        ds = ds.shuffle(buffer_size=buffer_size)
+        data_set = data_set.shuffle(buffer_size=buffer_size)

     # define map operations
     decode_op = C.Decode()
@@ -89,16 +89,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=16)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=16)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=1, batch_size=32):
@@ -119,12 +119,12 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
         rank_id = int(os.getenv("RANK_ID"))
         if do_train:
             if rank_size == 1:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
             else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=rank_size, shard_id=rank_id)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=rank_size, shard_id=rank_id)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
     else:
         raise ValueError("Unsupported device target.")

@@ -133,7 +133,7 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=
     if do_train:
         buffer_size = 20480
         # apply shuffle operations
-        ds = ds.shuffle(buffer_size=buffer_size)
+        data_set = data_set.shuffle(buffer_size=buffer_size)

     # define map operations
     decode_op = P.Decode()
@@ -152,12 +152,13 @@ def create_dataset_py(dataset_path, do_train, config, device_target, repeat_num=

     compose = P2.Compose(trans)

-    ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
+    data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
+                            python_multiprocessing=True)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set
@@ -16,7 +16,7 @@
 create train or eval dataset.
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2

@@ -38,12 +38,12 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,
         if do_train:
             if run_distribute:
                 from mindspore.communication.management import get_rank, get_group_size
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                           num_shards=get_group_size(), shard_id=get_rank())
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                                 num_shards=get_group_size(), shard_id=get_rank())
             else:
-                ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+                data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
         raise ValueError("Unsupported device_target.")

@@ -70,16 +70,16 @@ def create_dataset(dataset_path, do_train, config, device_target, repeat_num=1,

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply shuffle operations
-    ds = ds.shuffle(buffer_size=buffer_size)
+    data_set = data_set.shuffle(buffer_size=buffer_size)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set
@@ -16,7 +16,7 @@
 Data operations, will be used in train.py and eval.py
 """
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C

@@ -37,10 +37,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
     rank = config.rank
     group_size = config.group_size
     if group_size == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
-                                   num_shards=group_size, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=config.work_nums, shuffle=True,
+                                         num_shards=group_size, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
@@ -60,10 +60,10 @@ def create_dataset(dataset_path, config, do_train, repeat_num=1):
         C.HWC2CHW()
     ]
     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=config.work_nums)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=config.work_nums)
     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
-    return ds
+    data_set = data_set.repeat(repeat_num)
+    return data_set
@@ -25,21 +25,24 @@ import pyclipper
 from PIL import Image
 from src.config import config

-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.py_transforms as py_transforms

 __all__ = ['train_dataset_creator', 'test_dataset_creator']

+
 def get_img(img_path):
     img = cv2.imread(img_path)
     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     return img

+
 def get_imgs_names(root_dir):
     img_paths = [i for i in os.listdir(root_dir)
                  if os.path.splitext(i)[-1].lower() in ['.jpg', '.jpeg', '.png']]
     return img_paths

+
 def get_bboxes(img, gt_path):
     h, w = img.shape[0:2]
     with open(gt_path, 'r', encoding='utf-8-sig') as f:
@@ -58,6 +61,7 @@ def get_bboxes(img, gt_path):
         tags.append(tag)
     return np.array(bboxes), tags

+
 def random_scale(img, min_size):
     h, w = img.shape[0:2]
     if max(h, w) > 1280:
@@ -74,12 +78,14 @@ def random_scale(img, min_size):
         img = cv2.resize(img, dsize=None, fx=scale2, fy=scale2)
     return img

+
 def random_horizontal_flip(imgs):
     if random.random() < 0.5:
         for i, _ in enumerate(imgs):
             imgs[i] = np.flip(imgs[i], axis=1).copy()
     return imgs

+
 def random_rotate(imgs):
     max_angle = 10
     angle = random.random() * 2 * max_angle - max_angle
@@ -91,6 +97,7 @@ def random_rotate(imgs):
         imgs[i] = img_rotation
     return imgs

+
 def random_crop(imgs, img_size):
     h, w = imgs[0].shape[0:2]
     th, tw = img_size
@@ -118,21 +125,25 @@ def random_crop(imgs, img_size):
             imgs[idx] = imgs[idx][i:i + th, j:j + tw]
     return imgs

+
 def scale(img, long_size=2240):
     h, w = img.shape[0:2]
     scale_long = long_size * 1.0 / max(h, w)
     img = cv2.resize(img, dsize=None, fx=scale_long, fy=scale_long)
     return img

+
 def dist(a, b):
     return np.sqrt(np.sum((a - b) ** 2))

+
 def perimeter(bbox):
     peri = 0.0
     for i in range(bbox.shape[0]):
         peri += dist(bbox[i], bbox[(i + 1) % bbox.shape[0]])
     return peri

+
 def shrink(bboxes, rate, max_shr=20):
     rate = rate * rate
     shrinked_bboxes = []
@@ -158,6 +169,7 @@ def shrink(bboxes, rate, max_shr=20):

     return np.array(shrinked_bboxes)

+
 class TrainDataset:
     def __init__(self):
         self.is_transform = True
@@ -260,6 +272,7 @@ class TrainDataset:
     def __len__(self):
         return len(self.all_img_paths)

+
 def IC15_TEST_Generator():
     ic15_test_data_dir = config.TEST_ROOT_DIR + 'ch4_test_images/'
     img_size = config.INFER_LONG_SIZE
@@ -298,6 +311,7 @@ def IC15_TEST_Generator():

     yield img, img_resized, img_name

+
 class DistributedSampler():
     def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
         self.dataset = dataset
@@ -324,18 +338,20 @@ class DistributedSampler():
     def __len__(self):
         return self.num_samplers

+
 def train_dataset_creator(rank, group_size, shuffle=True):
     cv2.setNumThreads(0)
     dataset = TrainDataset()
     sampler = DistributedSampler(dataset, rank, group_size, shuffle)
-    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
-                             sampler=sampler)
-    ds = ds.repeat(1)
-    ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
-    return ds
+    data_set = ds.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+                                   sampler=sampler)
+    data_set = data_set.repeat(1)
+    data_set = data_set.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
+    return data_set

+
 def test_dataset_creator():
-    ds = de.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
-    ds = ds.shuffle(config.TEST_BUFFER_SIZE)
-    ds = ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
-    return ds
+    data_set = ds.GeneratorDataset(IC15_TEST_Generator, ['img', 'img_resized', 'img_name'])
+    data_set = data_set.shuffle(config.TEST_BUFFER_SIZE)
+    data_set = data_set.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)
+    return data_set
@@ -29,7 +29,7 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.common import set_seed
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 from src.resnet_gpu_benchmark import resnet50 as resnet
 from src.CrossEntropySmooth import CrossEntropySmooth
@@ -45,19 +45,22 @@ parser.add_argument('--dataset_path', type=str, default=None, help='Imagenet dat
 parser.add_argument('--ckpt_path', type=str, default="./", help='The path to save ckpt if save_ckpt is True;\
                     Or the ckpt model file when eval is True')
 parser.add_argument('--mode', type=str, default="GRAPH", choices=["GRAPH", "PYNATIVE"], help='Execute mode')
-parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16",\
-                    help='Compute data type fp32 or fp16: default fp16')
+parser.add_argument('--dtype', type=str, choices=["fp32", "fp16", "FP16", "FP32"], default="fp16", \
+                    help='Compute data type fp32 or fp16: default fp16')
 args_opt = parser.parse_args()

 set_seed(1)


 class MyTimeMonitor(Callback):
     def __init__(self, batch_size, sink_size):
         super(MyTimeMonitor, self).__init__()
         self.batch_size = batch_size
         self.size = sink_size

     def step_begin(self, run_context):
         self.step_time = time.time()

     def step_end(self, run_context):
         cb_params = run_context.original_args()
         loss = cb_params.net_outputs
@@ -75,17 +78,18 @@ class MyTimeMonitor(Callback):
             raise ValueError("epoch: {} step: {}. Invalid loss, terminating training.".format(
                 cb_params.cur_epoch_num, cur_step_in_epoch))
         step_mseconds = (time.time() - self.step_time) * 1000
-        fps = self.batch_size / step_mseconds *1000 * self.size
+        fps = self.batch_size / step_mseconds * 1000 * self.size
         print("epoch: %s step: %s, loss is %s" % (cb_params.cur_epoch_num, cur_step_in_epoch, loss),
               "Epoch time: {:5.3f} ms, fps: {:d} img/sec.".format(step_mseconds, int(fps)), flush=True)

+
 def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="GPU", dtype="fp16",
                    device_num=1):
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
-                                   num_shards=device_num, shard_id=get_rank())
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=4, shuffle=True,
+                                         num_shards=device_num, shard_id=get_rank())
     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
     std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
@@ -113,14 +117,15 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
     ]
     if dtype == "fp32":
         trans.append(C.HWC2CHW())
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
     if repeat_num > 1:
-        ds = ds.repeat(repeat_num)
+        data_set = data_set.repeat(repeat_num)

+    return data_set
+
-    return ds

 def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
     lr_each_step = []
@@ -136,6 +141,7 @@ def get_liner_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per
     lr_each_step = np.array(lr_each_step).astype(np.float32)
     return lr_each_step

+
 def train():
     # set args
     dev = "GPU"
@@ -221,6 +227,7 @@ def train():
     else:
         model.train(epoch_size, dataset, callbacks=cb)

+
 def eval_():
     # set args
     dev = "GPU"
@@ -251,6 +258,7 @@ def eval_():
     res = model.eval(dataset)
     print("result:", res, "ckpt=", ckpt_dir)

+
 if __name__ == '__main__':
     if not args_opt.eval:
         train()
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -47,10 +47,10 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     else:
         device_num = 1
     if device_num == 1:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                               num_shards=device_num, shard_id=rank_id)
+        data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                     num_shards=device_num, shard_id=rank_id)

     # define map operations
     trans = []
@@ -69,15 +69,15 @@ def create_dataset1(dataset_path, do_train, repeat_num=1, batch_size=32, target=

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -106,10 +106,10 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = 1

     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)

     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -134,16 +134,16 @@ def create_dataset2(dataset_path, do_train, repeat_num=1, batch_size=32, target=

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -171,10 +171,10 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=
         device_num = 1
         rank_id = 1
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
     image_size = 224
     mean = [0.475 * 255, 0.451 * 255, 0.392 * 255]
     std = [0.275 * 255, 0.267 * 255, 0.278 * 255]
@@ -198,15 +198,15 @@ def create_dataset3(dataset_path, do_train, repeat_num=1, batch_size=32, target=

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)
     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend", distribute=False):
@@ -234,10 +234,10 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     else:
         device_num = 1
     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=12, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)
     image_size = 224
     mean = [123.68, 116.78, 103.94]
     std = [1.0, 1.0, 1.0]
@@ -260,16 +260,16 @@ def create_dataset4(dataset_path, do_train, repeat_num=1, batch_size=32, target=
     ]

     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=12)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=12)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=12)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def _get_rank_info():
@@ -18,7 +18,7 @@ create train or eval dataset.
 import os
 from functools import partial
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.py_transforms as P2
@@ -53,14 +53,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="

     columns_list = ['image', 'label']
     if config.data_load_mode == "mindrecord":
-        load_func = partial(de.MindDataset, dataset_path, columns_list)
+        load_func = partial(ds.MindDataset, dataset_path, columns_list)
     else:
-        load_func = partial(de.ImageFolderDataset, dataset_path)
+        load_func = partial(ds.ImageFolderDataset, dataset_path)
     if device_num == 1:
-        ds = load_func(num_parallel_workers=8, shuffle=True)
+        data_set = load_func(num_parallel_workers=8, shuffle=True)
     else:
-        ds = load_func(num_parallel_workers=8, shuffle=True,
-                       num_shards=device_num, shard_id=rank_id)
+        data_set = load_func(num_parallel_workers=8, shuffle=True,
+                             num_shards=device_num, shard_id=rank_id)

     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -85,16 +85,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
@@ -121,12 +121,12 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe

     if do_train:
         if device_num == 1:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
         else:
-            ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                       num_shards=device_num, shard_id=rank_id)
+            data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                             num_shards=device_num, shard_id=rank_id)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=False)

     image_size = 224

@@ -147,12 +147,13 @@ def create_dataset_py(dataset_path, do_train, repeat_num=1, batch_size=32, targe
     trans = [decode_op, resize_op, center_crop, to_tensor, normalize_op]

     compose = P2.Compose(trans)
-    ds = ds.map(operations=compose, input_columns="image", num_parallel_workers=8, python_multiprocessing=True)
+    data_set = data_set.map(operations=compose, input_columns="image", num_parallel_workers=8,
+                            python_multiprocessing=True)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -47,10 +47,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="
         num_parallels = 4

     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_parallels, shuffle=True,
+                                         num_shards=device_num, shard_id=rank_id)

     image_size = 224
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -75,16 +75,16 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=num_parallels)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallels)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def _get_rank_info():
@@ -15,7 +15,7 @@
 """Data operations, will be used in train.py and eval.py"""
 from src.config import config
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C

@@ -36,10 +36,10 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
     """

     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
+                                         num_shards=device_num, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
@@ -59,8 +59,8 @@ def create_dataset(dataset_path, do_train, device_num=1, rank=0):
     ]

     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
-    ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
+    data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
+    data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
     # apply batch operations
-    ds = ds.batch(config.batch_size, drop_remainder=True)
-    return ds
+    data_set = data_set.batch(config.batch_size, drop_remainder=True)
+    return data_set
@@ -19,7 +19,7 @@ import numpy as np
 from src.config import config_gpu as cfg

 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.transforms.c_transforms as C2
 import mindspore.dataset.vision.c_transforms as C

@@ -46,10 +46,10 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
         dataset
     """
     if group_size == 1:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
-                                   num_shards=group_size, shard_id=rank)
+        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=cfg.work_nums, shuffle=True,
+                                         num_shards=group_size, shard_id=rank)
     # define map operations
     if do_train:
         trans = [
@@ -71,9 +71,9 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
     ]

     type_cast_op = C2.TypeCast(mstype.int32)
-    ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
-    ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=cfg.work_nums)
+    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=cfg.work_nums)
     # apply batch operations
-    ds = ds.batch(cfg.batch_size, drop_remainder=True)
+    data_set = data_set.batch(cfg.batch_size, drop_remainder=True)

-    return ds
+    return data_set
@@ -17,7 +17,7 @@ create train or eval dataset.
 """
 import os
 import mindspore.common.dtype as mstype
-import mindspore.dataset.engine as de
+import mindspore.dataset as ds
 import mindspore.dataset.vision.c_transforms as C
 import mindspore.dataset.transforms.c_transforms as C2
 from mindspore.communication.management import init, get_rank, get_group_size
@@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path,
         device_num = get_group_size()

     if device_num == 1:
-        ds = de.Cifar10Dataset(dataset_path,
-                               num_parallel_workers=8,
-                               shuffle=True)
+        data_set = ds.Cifar10Dataset(dataset_path,
+                                     num_parallel_workers=8,
+                                     shuffle=True)
     else:
-        ds = de.Cifar10Dataset(dataset_path,
-                               num_parallel_workers=8,
-                               shuffle=True,
-                               num_shards=device_num,
-                               shard_id=rank_id)
+        data_set = ds.Cifar10Dataset(dataset_path,
+                                     num_parallel_workers=8,
+                                     shuffle=True,
+                                     num_shards=device_num,
+                                     shard_id=rank_id)

     # define map operations
     if do_train:
@@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path,

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=type_cast_op,
-                input_columns="label",
-                num_parallel_workers=8)
-    ds = ds.map(operations=trans,
-                input_columns="image",
-                num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op,
+                            input_columns="label",
+                            num_parallel_workers=8)
+    data_set = data_set.map(operations=trans,
+                            input_columns="image",
+                            num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def create_dataset_imagenet(dataset_path,
@@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path,
         device_num = get_group_size()

     if device_num == 1:
-        ds = de.ImageFolderDataset(dataset_path,
-                                   num_parallel_workers=8,
-                                   shuffle=True)
+        data_set = ds.ImageFolderDataset(dataset_path,
+                                         num_parallel_workers=8,
+                                         shuffle=True)
     else:
-        ds = de.ImageFolderDataset(dataset_path,
-                                   num_parallel_workers=8,
-                                   shuffle=True,
-                                   num_shards=device_num,
-                                   shard_id=rank_id)
+        data_set = ds.ImageFolderDataset(dataset_path,
+                                         num_parallel_workers=8,
+                                         shuffle=True,
+                                         num_shards=device_num,
+                                         shard_id=rank_id)

     image_size = 227
     mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
@@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path,

     type_cast_op = C2.TypeCast(mstype.int32)

-    ds = ds.map(operations=type_cast_op,
-                input_columns="label",
-                num_parallel_workers=8)
-    ds = ds.map(operations=trans,
-                input_columns="image",
-                num_parallel_workers=8)
+    data_set = data_set.map(operations=type_cast_op,
+                            input_columns="label",
+                            num_parallel_workers=8)
+    data_set = data_set.map(operations=trans,
+                            input_columns="image",
+                            num_parallel_workers=8)

     # apply batch operations
-    ds = ds.batch(batch_size, drop_remainder=True)
+    data_set = data_set.batch(batch_size, drop_remainder=True)

     # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)
+    data_set = data_set.repeat(repeat_num)

-    return ds
+    return data_set


 def _get_rank_info():
@ -17,7 +17,7 @@ import os
import math as m
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as c
import mindspore.dataset.vision.c_transforms as vc
from PIL import Image
@ -86,7 +86,7 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
"""

dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
data_set = ds.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
image_trans = [
vc.Rescale(1.0 / 255.0, 0.0),
vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
@ -96,12 +96,12 @@ def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_
label_trans = [
c.TypeCast(mstype.int32)
]
ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
if device_target == 'Ascend':
ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
else:
ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)
data_set = data_set.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
data_set = data_set.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)

ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
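For context, a minimal runnable sketch of the GeneratorDataset pipeline pattern these hunks rewrite, using the new mindspore.dataset as ds alias. The toy_source generator and its column names are invented for illustration and are not part of the commit.

# Sketch only: toy_source and its columns are hypothetical, not taken from this repository.
import numpy as np
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C

def toy_source():
    # yield eight (image, label) pairs of fixed shape
    for i in range(8):
        yield np.full((2, 2), i, dtype=np.float32), np.array(i, dtype=np.int64)

data_set = ds.GeneratorDataset(toy_source, column_names=["image", "label"], shuffle=False)
data_set = data_set.map(operations=C.TypeCast(mstype.int32), input_columns="label")
data_set = data_set.batch(4, drop_remainder=True)
for row in data_set.create_dict_iterator():
    print(row["image"].shape, row["label"])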
@ -16,10 +16,11 @@
Data operations, will be used in train.py and eval.py
"""
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C


def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
"""
create a train or eval dataset
@ -35,10 +36,10 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
dataset
"""
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank)
# define map operations
if do_train:
trans = [
@ -59,8 +60,8 @@ def create_dataset(dataset_path, do_train, batch_size=16, device_num=1, rank=0):
]

type_cast_op = C2.TypeCast(mstype.int32)
ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
data_set = data_set.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8)
# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
return ds
data_set = data_set.batch(batch_size, drop_remainder=True)
return data_set
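Since the hunks above only show fragments of create_dataset, a consolidated sketch of the same pipeline under the ds alias may help; the transform list and the 224x224 size here are stand-ins rather than values taken from the diff.

# Sketch only: the dataset_path handling mirrors the hunks above, but the
# transform list and the 224x224 size are stand-ins, not values from the diff.
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C

def build_image_dataset(dataset_path, batch_size=16, device_num=1, rank=0):
    if device_num == 1:
        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                                         num_shards=device_num, shard_id=rank)
    # decode, resize and switch to channel-first layout; cast labels to int32
    trans = [C.Decode(), C.Resize((224, 224)), C.HWC2CHW()]
    data_set = data_set.map(input_columns="image", operations=trans, num_parallel_workers=8)
    data_set = data_set.map(input_columns="label", operations=C2.TypeCast(mstype.int32),
                            num_parallel_workers=8)
    return data_set.batch(batch_size, drop_remainder=True)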
@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
from mindspore import log as logger
|
||||
from .config import cfg
|
||||
|
@ -31,65 +31,67 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
|
|||
for file_name in files:
|
||||
if "tfrecord" in file_name:
|
||||
data_files.append(os.path.join(data_dir, file_name))
|
||||
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print('origin dataset size: ', ori_dataset_size)
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(cfg.batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(ds.get_repeat_count()))
|
||||
return ds
|
||||
data_set = data_set.batch(cfg.batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set
|
||||
|
||||
|
||||
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
|
||||
data_file_path=None, schema_file_path=None, do_shuffle=True):
|
||||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
|
||||
shuffle=do_shuffle)
|
||||
if assessment_method == "Spearman_correlation":
|
||||
type_cast_op_float = C.TypeCast(mstype.float32)
|
||||
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
else:
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
|
||||
data_file_path=None, schema_file_path=None, do_shuffle=True):
|
||||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"], shuffle=do_shuffle)
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
|
||||
shuffle=do_shuffle)
|
||||
if assessment_method == "Spearman_correlation":
|
||||
type_cast_op_float = C.TypeCast(mstype.float32)
|
||||
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
else:
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
||||
|
||||
def generator_squad(data_features):
|
||||
|
@ -102,20 +104,20 @@ def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, sche
|
|||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
if is_training:
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
|
||||
"end_positions", "unique_ids", "is_impossible"],
|
||||
shuffle=do_shuffle)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "start_positions",
|
||||
"end_positions", "unique_ids", "is_impossible"],
|
||||
shuffle=do_shuffle)
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
|
||||
else:
|
||||
ds = de.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
|
||||
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="unique_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = ds.GeneratorDataset(generator_squad(data_file_path), shuffle=do_shuffle,
|
||||
column_names=["input_ids", "input_mask", "segment_ids", "unique_ids"])
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="unique_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
from mindspore import log as logger
|
||||
from .bert_net_config import bert_net_cfg
|
||||
|
@ -32,96 +32,96 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
|
|||
if "tfrecord" in file_name:
|
||||
data_files.append(os.path.join(data_dir, file_name))
|
||||
data_files = sorted(data_files)
|
||||
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=False)
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print('origin dataset size: ', ori_dataset_size)
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(ds.get_repeat_count()))
|
||||
return ds
|
||||
data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set
|
||||
|
||||
|
||||
def create_ner_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
|
||||
data_file_path=None, schema_file_path=None):
|
||||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
|
||||
if assessment_method == "Spearman_correlation":
|
||||
type_cast_op_float = C.TypeCast(mstype.float32)
|
||||
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
else:
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply shuffle operation
|
||||
buffer_size = 960
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_classification_dataset(batch_size=1, repeat_count=1, assessment_method="accuracy",
|
||||
data_file_path=None, schema_file_path=None):
|
||||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
|
||||
if assessment_method == "Spearman_correlation":
|
||||
type_cast_op_float = C.TypeCast(mstype.float32)
|
||||
ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op_float, input_columns="label_ids")
|
||||
else:
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply shuffle operation
|
||||
buffer_size = 960
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_squad_dataset(batch_size=1, repeat_count=1, data_file_path=None, schema_file_path=None, is_training=True):
|
||||
"""create finetune or evaluation dataset"""
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
if is_training:
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"start_positions", "end_positions",
|
||||
"unique_ids", "is_impossible"])
|
||||
ds = ds.map(operations=type_cast_op, input_columns="start_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="end_positions")
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"start_positions", "end_positions",
|
||||
"unique_ids", "is_impossible"])
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="start_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="end_positions")
|
||||
else:
|
||||
ds = de.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = ds.TFRecordDataset([data_file_path], schema_file_path if schema_file_path != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "unique_ids"])
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
# apply shuffle operation
|
||||
buffer_size = 960
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
|
|
@ -22,7 +22,7 @@ import mindspore.ops.operations as P
from mindspore.common.tensor import Tensor
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC
from mindspore import context
from src.fasttext_model import FastText
@ -73,15 +73,15 @@ class FastTextInferCell(nn.Cell):

def load_infer_dataset(batch_size, datafile):
"""data loader for infer"""
ds = de.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])
data_set = ds.MindDataset(datafile, columns_list=['src_tokens', 'src_tokens_length', 'label_idx'])

type_cast_op = deC.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
ds = ds.batch(batch_size=batch_size, drop_remainder=True)
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
data_set = data_set.batch(batch_size=batch_size, drop_remainder=True)

return ds
return data_set

def run_fasttext_infer():
"""run infer with FastText"""
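As a usage illustration of the load_infer_dataset loader above (not part of the diff), the same cast-and-batch steps written as a small helper; the .mindrecord path in the commented call is a placeholder.

# Sketch only: the column names mirror the loader above; the file path is a placeholder.
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as deC

def load_infer_data(datafile, batch_size=32):
    data_set = ds.MindDataset(datafile, columns_list=["src_tokens", "src_tokens_length", "label_idx"])
    type_cast_op = deC.TypeCast(mstype.int32)
    for column in ("src_tokens", "src_tokens_length", "label_idx"):
        # cast every integer column to int32, as the network expects
        data_set = data_set.map(operations=type_cast_op, input_columns=[column])
    return data_set.batch(batch_size, drop_remainder=True)

# data_set = load_infer_data("./test_dataset_bs_16.mindrecord")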
@ -25,8 +25,10 @@ import spacy
from sklearn.feature_extraction import FeatureHasher
from mindspore.mindrecord import FileWriter


class FastTextDataPreProcess():
"""FastText data preprocess"""

def __init__(self, train_path,
test_file,
max_length,
@ -194,7 +196,6 @@ class FastTextDataPreProcess():
if self.text_less in sent_describe and self.text_greater in sent_describe:
sent_describe = self.str_html.sub('', sent_describe)


doc = spacy_nlp(sent_describe)
bows_token = [token.text for token in doc]
@ -222,7 +223,7 @@ class FastTextDataPreProcess():
def _get_bucket_length(self, x, bts):
x_len = len(x)
for index in range(1, len(bts)):
if bts[index-1] < x_len <= bts[index]:
if bts[index - 1] < x_len <= bts[index]:
return bts[index]
return bts[0]
@ -310,7 +311,6 @@ if __name__ == '__main__':

print("Writing test data to MindRecord file.....")
for k in args.test_bucket:

write_to_mindrecord(test_data_example[k], './test_dataset_bs_' + str(k) + '.mindrecord', 1)

print("All done.....")
@ -14,9 +14,10 @@
|
|||
# ============================================================================
|
||||
"""FastText data loader"""
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
|
||||
|
||||
def load_dataset(dataset_path,
|
||||
batch_size,
|
||||
epoch_count=1,
|
||||
|
@ -25,38 +26,40 @@ def load_dataset(dataset_path,
|
|||
bucket=None,
|
||||
shuffle=True):
|
||||
"""dataset loader"""
|
||||
|
||||
def batch_per_bucket(bucket_length, input_file):
|
||||
input_file = input_file +'/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
|
||||
input_file = input_file + '/train_dataset_bs_' + str(bucket_length) + '.mindrecord'
|
||||
if not input_file:
|
||||
raise FileNotFoundError("input file parameter must not be empty.")
|
||||
|
||||
ds = de.MindDataset(input_file,
|
||||
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
|
||||
shuffle=shuffle,
|
||||
num_shards=rank_size,
|
||||
shard_id=rank_id,
|
||||
num_parallel_workers=8)
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
data_set = ds.MindDataset(input_file,
|
||||
columns_list=['src_tokens', 'src_tokens_length', 'label_idx'],
|
||||
shuffle=shuffle,
|
||||
num_shards=rank_size,
|
||||
shard_id=rank_id,
|
||||
num_parallel_workers=8)
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print(f"Dataset size: {ori_dataset_size}")
|
||||
repeat_count = epoch_count
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="src_tokens")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="src_tokens_length")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_idx")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="src_tokens_length")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_idx")
|
||||
|
||||
data_set = data_set.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
|
||||
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
|
||||
data_set = data_set.batch(batch_size, drop_remainder=False)
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
return data_set
|
||||
|
||||
ds = ds.rename(input_columns=['src_tokens', 'src_tokens_length', 'label_idx'],
|
||||
output_columns=['src_token_text', 'src_tokens_text_length', 'label_idx_tag'])
|
||||
ds = ds.batch(batch_size, drop_remainder=False)
|
||||
ds = ds.repeat(repeat_count)
|
||||
return ds
|
||||
for i, _ in enumerate(bucket):
|
||||
bucket_len = bucket[i]
|
||||
ds_per = batch_per_bucket(bucket_len, dataset_path)
|
||||
if i == 0:
|
||||
ds = ds_per
|
||||
data_set = ds_per
|
||||
else:
|
||||
ds = ds + ds_per
|
||||
ds = ds.shuffle(ds.get_dataset_size())
|
||||
ds.channel_name = 'fasttext'
|
||||
data_set = data_set + ds_per
|
||||
data_set = data_set.shuffle(data_set.get_dataset_size())
|
||||
data_set.channel_name = 'fasttext'
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
"""Dataset loader to feed into model."""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
|
||||
|
||||
|
@ -55,7 +55,7 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
|
|||
print(f" | Loading {datafile}.")
|
||||
|
||||
if not is_translate:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
input_files, columns_list=[
|
||||
"src", "src_padding",
|
||||
"prev_opt",
|
||||
|
@ -64,18 +64,18 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
|
|||
num_parallel_workers=8
|
||||
)
|
||||
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print(f" | Dataset size: {ori_dataset_size}.")
|
||||
if shuffle:
|
||||
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
|
||||
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
|
||||
ds = ds.rename(
|
||||
data_set = data_set.rename(
|
||||
input_columns=["src",
|
||||
"src_padding",
|
||||
"prev_opt",
|
||||
|
@ -87,9 +87,9 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
|
|||
"target_eos_ids",
|
||||
"target_eos_mask"]
|
||||
)
|
||||
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
|
||||
else:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
input_files, columns_list=[
|
||||
"src", "src_padding"
|
||||
],
|
||||
|
@ -97,23 +97,23 @@ def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
|
|||
num_parallel_workers=8
|
||||
)
|
||||
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print(f" | Dataset size: {ori_dataset_size}.")
|
||||
if shuffle:
|
||||
ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
|
||||
data_set = data_set.shuffle(buffer_size=ori_dataset_size // 20)
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
|
||||
|
||||
ds = ds.rename(
|
||||
data_set = data_set.rename(
|
||||
input_columns=["src",
|
||||
"src_padding"],
|
||||
output_columns=["source_eos_ids",
|
||||
"source_eos_mask"]
|
||||
)
|
||||
ds = ds.batch(batch_size, drop_remainder=drop_remainder)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=drop_remainder)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
||||
|
||||
def load_dataset(data_files: list, schema: str, batch_size: int, sink_mode: bool,
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
# ============================================================================
|
||||
"""Dataset loader to feed into model."""
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
|
||||
|
||||
|
@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
for datafile in input_files:
|
||||
print(f" | Loading {datafile}.")
|
||||
|
||||
ds = de.TFRecordDataset(
|
||||
data_set = ds.TFRecordDataset(
|
||||
input_files,
|
||||
columns_list=[
|
||||
"src", "src_padding",
|
||||
|
@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
shuffle=shuffle, num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True, num_parallel_workers=8)
|
||||
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print(f" | Dataset size: {ori_dataset_size}.")
|
||||
repeat_count = epoch_count
|
||||
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="src")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="src_padding")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="prev_opt")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="prev_padding")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="tgt_padding")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="src")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="src_padding")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="prev_opt")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="prev_padding")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="tgt_padding")
|
||||
|
||||
ds = ds.rename(
|
||||
data_set = data_set.rename(
|
||||
input_columns=["src",
|
||||
"src_padding",
|
||||
"prev_opt",
|
||||
|
@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
"target_eos_mask"]
|
||||
)
|
||||
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
|
||||
ds.channel_name = 'transformer'
|
||||
return ds
|
||||
data_set.channel_name = 'transformer'
|
||||
return data_set
|
||||
|
||||
|
||||
def load_dataset(data_files: list, batch_size: int, epoch_count: int,
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
# ============================================================================
|
||||
"""Dataset loader to feed into model."""
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
|
||||
|
||||
|
@ -45,7 +45,7 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
for datafile in input_files:
|
||||
print(f" | Loading {datafile}.")
|
||||
|
||||
ds = de.TFRecordDataset(
|
||||
data_set = ds.TFRecordDataset(
|
||||
input_files,
|
||||
columns_list=[
|
||||
"src", "src_padding",
|
||||
|
@ -55,19 +55,19 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
shuffle=shuffle, num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True, num_parallel_workers=8)
|
||||
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print(f" | Dataset size: {ori_dataset_size}.")
|
||||
repeat_count = epoch_count
|
||||
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(input_columns="src", operations=type_cast_op)
|
||||
ds = ds.map(input_columns="src_padding", operations=type_cast_op)
|
||||
ds = ds.map(input_columns="prev_opt", operations=type_cast_op)
|
||||
ds = ds.map(input_columns="prev_padding", operations=type_cast_op)
|
||||
ds = ds.map(input_columns="target", operations=type_cast_op)
|
||||
ds = ds.map(input_columns="tgt_padding", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="src", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="src_padding", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="prev_opt", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="prev_padding", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="target", operations=type_cast_op)
|
||||
data_set = data_set.map(input_columns="tgt_padding", operations=type_cast_op)
|
||||
|
||||
ds = ds.rename(
|
||||
data_set = data_set.rename(
|
||||
input_columns=["src",
|
||||
"src_padding",
|
||||
"prev_opt",
|
||||
|
@ -82,11 +82,11 @@ def _load_dataset(input_files, batch_size, epoch_count=1,
|
|||
"target_eos_mask"]
|
||||
)
|
||||
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
ds = ds.repeat(repeat_count)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.repeat(repeat_count)
|
||||
|
||||
ds.channel_name = 'transformer'
|
||||
return ds
|
||||
data_set.channel_name = 'transformer'
|
||||
return data_set
|
||||
|
||||
|
||||
def load_dataset(data_files: list, batch_size: int, epoch_count: int,
|
||||
|
|
|
@ -18,14 +18,16 @@
|
|||
import os
|
||||
from enum import Enum
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
|
||||
|
||||
class DataType(Enum):
|
||||
"""Enumerate supported dataset format"""
|
||||
TFRECORD = 1
|
||||
MINDRECORD = 2
|
||||
|
||||
|
||||
def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
|
||||
do_shuffle="true", data_dir=None, schema_dir=None,
|
||||
data_type=DataType.TFRECORD):
|
||||
|
@ -47,22 +49,22 @@ def create_tinybert_dataset(task='td', batch_size=32, device_num=1, rank=0,
|
|||
shuffle = False
|
||||
|
||||
if data_type == DataType.MINDRECORD:
|
||||
ds = de.MindDataset(data_files, columns_list=columns_list,
|
||||
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank)
|
||||
data_set = ds.MindDataset(data_files, columns_list=columns_list,
|
||||
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank)
|
||||
else:
|
||||
ds = de.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
|
||||
shuffle=shuffle, num_shards=device_num, shard_id=rank,
|
||||
shard_equal_rows=shard_equal_rows)
|
||||
data_set = ds.TFRecordDataset(data_files, schema_dir, columns_list=columns_list,
|
||||
shuffle=shuffle, num_shards=device_num, shard_id=rank,
|
||||
shard_equal_rows=shard_equal_rows)
|
||||
if device_num == 1 and shuffle is True:
|
||||
ds = ds.shuffle(10000)
|
||||
data_set = data_set.shuffle(10000)
|
||||
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
if task == "td":
|
||||
ds = ds.map(operations=type_cast_op, input_columns="label_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="label_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
@ -23,38 +23,41 @@ from mindspore.common.parameter import Parameter
|
|||
from mindspore.common.tensor import Tensor
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
from mindspore import context
|
||||
|
||||
from src.transformer_model import TransformerModel
|
||||
from src.eval_config import cfg, transformer_net_cfg
|
||||
|
||||
|
||||
def load_test_data(batch_size=1, data_file=None):
|
||||
"""
|
||||
Load test dataset
|
||||
"""
|
||||
ds = de.MindDataset(data_file,
|
||||
columns_list=["source_eos_ids", "source_eos_mask",
|
||||
"target_sos_ids", "target_sos_mask",
|
||||
"target_eos_ids", "target_eos_mask"],
|
||||
shuffle=False)
|
||||
data_set = ds.MindDataset(data_file,
|
||||
columns_list=["source_eos_ids", "source_eos_mask",
|
||||
"target_sos_ids", "target_sos_mask",
|
||||
"target_eos_ids", "target_eos_mask"],
|
||||
shuffle=False)
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
ds.channel_name = 'transformer'
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
data_set.channel_name = 'transformer'
|
||||
return data_set
|
||||
|
||||
|
||||
class TransformerInferCell(nn.Cell):
|
||||
"""
|
||||
Encapsulation class of transformer network infer.
|
||||
"""
|
||||
|
||||
def __init__(self, network):
|
||||
super(TransformerInferCell, self).__init__(auto_prefix=False)
|
||||
self.network = network
|
||||
|
@ -65,6 +68,7 @@ class TransformerInferCell(nn.Cell):
|
|||
predicted_ids = self.network(source_ids, source_mask)
|
||||
return predicted_ids
|
||||
|
||||
|
||||
def load_weights(model_path):
|
||||
"""
|
||||
Load checkpoint as parameter dict, support both npz file and mindspore checkpoint file.
|
||||
|
@ -93,6 +97,7 @@ def load_weights(model_path):
|
|||
parameter_dict[name] = Parameter(Tensor(weights[name]), name=name)
|
||||
return parameter_dict
|
||||
|
||||
|
||||
def run_transformer_eval():
|
||||
"""
|
||||
Transformer evaluation.
|
||||
|
@ -136,5 +141,6 @@ def run_transformer_eval():
|
|||
f.write(" ".join(token_ids) + "\n")
|
||||
f.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_transformer_eval()
|
||||
|
|
|
@ -21,7 +21,7 @@ from enum import Enum
|
|||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
from .config import DataConfig
|
||||
|
@ -142,8 +142,8 @@ class H5Dataset():
|
|||
X_id = X[:, 0:self.max_length]
|
||||
X_va = X[:, self.max_length:]
|
||||
yield np.array(X_id.astype(dtype=np.int32)), \
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
|
||||
|
||||
def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
||||
|
@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
|||
for _ in range(0, numbers_of_batch, 1):
|
||||
yield train_eval_gen.__next__()
|
||||
|
||||
ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
|
|||
shuffle = train_mode
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
else:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
|||
for filename in filenames:
|
||||
if file_prefixt_name in filename and 'tfrecord' in filename:
|
||||
dataset_files.append(os.path.join(dir_path, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
schema.add_column('feat_ids', de_type=mstype.int32)
|
||||
schema.add_column('feat_vals', de_type=mstype.float32)
|
||||
schema.add_column('label', de_type=mstype.float32)
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (
|
||||
np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
|
|
@ -14,13 +14,12 @@
|
|||
# ============================================================================
|
||||
"""train_dataset."""
|
||||
|
||||
|
||||
import os
|
||||
import math
|
||||
from enum import Enum
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
|
||||
|
@ -84,9 +83,9 @@ class H5Dataset():
|
|||
yield os.path.join(self._hdf_data_dir,
|
||||
self._file_prefix + '_input_part_' + str(
|
||||
p) + '.h5'), \
|
||||
os.path.join(self._hdf_data_dir,
|
||||
self._file_prefix + '_output_part_' + str(
|
||||
p) + '.h5'), i + 1 == len(parts)
|
||||
os.path.join(self._hdf_data_dir,
|
||||
self._file_prefix + '_output_part_' + str(
|
||||
p) + '.h5'), i + 1 == len(parts)
|
||||
|
||||
def _generator(self, X, y, batch_size, shuffle=True):
|
||||
"""
|
||||
|
@ -106,8 +105,7 @@ class H5Dataset():
|
|||
np.random.shuffle(sample_index)
|
||||
assert X.shape[0] > 0
|
||||
while True:
|
||||
batch_index = sample_index[
|
||||
batch_size * counter: batch_size * (counter + 1)]
|
||||
batch_index = sample_index[batch_size * counter: batch_size * (counter + 1)]
|
||||
X_batch = X[batch_index]
|
||||
y_batch = y[batch_index]
|
||||
counter += 1
|
||||
|
@ -140,9 +138,8 @@ class H5Dataset():
|
|||
X, y, finished = data_gen.__next__()
|
||||
X_id = X[:, 0:self.input_length]
|
||||
X_va = X[:, self.input_length:]
|
||||
yield np.array(X_id.astype(dtype=np.int32)), np.array(
|
||||
X_va.astype(dtype=np.float32)), np.array(
|
||||
y.astype(dtype=np.float32))
|
||||
yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array(
|
||||
y.astype(dtype=np.float32))
|
||||
|
||||
|
||||
def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
|
||||
|
@ -164,9 +161,9 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
|
|||
for _ in range(0, numbers_of_batch, 1):
|
||||
yield train_eval_gen.__next__()
|
||||
|
||||
ds = de.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"])
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(_iter_h5_data(), ["ids", "weights", "labels"])
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _padding_func(batch_size, manual_shape, target_column, field_size=39):
|
||||
|
@ -174,11 +171,11 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39):
|
|||
get padding_func
|
||||
"""
|
||||
if manual_shape:
|
||||
generate_concat_offset = [item[0]+item[1] for item in manual_shape]
|
||||
generate_concat_offset = [item[0] + item[1] for item in manual_shape]
|
||||
part_size = int(target_column / len(generate_concat_offset))
|
||||
filled_value = []
|
||||
for i in range(field_size, target_column):
|
||||
filled_value.append(generate_concat_offset[i//part_size]-1)
|
||||
filled_value.append(generate_concat_offset[i // part_size] - 1)
|
||||
print("Filed Value:", filled_value)
|
||||
|
||||
def padding_func(x, y, z):
|
||||
|
@ -190,7 +187,7 @@ def _padding_func(batch_size, manual_shape, target_column, field_size=39):
|
|||
dtype=np.int32) * filled_value
|
||||
x_id = np.concatenate([x, x_id.astype(dtype=np.int32)], axis=1)
|
||||
mask = np.concatenate(
|
||||
[y, np.zeros((batch_size, target_column-39), dtype=np.float32)], axis=1)
|
||||
[y, np.zeros((batch_size, target_column - 39), dtype=np.float32)], axis=1)
|
||||
return (x_id, mask, z)
|
||||
else:
|
||||
def padding_func(x, y, z):
|
||||
|
@ -214,24 +211,25 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
|
|||
for filename in filenames:
|
||||
if file_prefix_name in filename and "tfrecord" in filename:
|
||||
dataset_files.append(os.path.join(dirpath, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
schema.add_column('feat_ids', de_type=mstype.int32)
|
||||
schema.add_column('feat_vals', de_type=mstype.float32)
|
||||
schema.add_column('label', de_type=mstype.float32)
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
|
||||
num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle, schema=schema, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample),
|
||||
drop_remainder=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle, schema=schema, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample),
|
||||
drop_remainder=True)
|
||||
|
||||
ds = ds.map(operations=_padding_func(batch_size, manual_shape, target_column),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = data_set.map(operations=_padding_func(batch_size, manual_shape, target_column),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@ -257,21 +255,21 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
|
|||
shuffle = train_mode
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
else:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(_padding_func(batch_size, manual_shape, target_column),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(_padding_func(batch_size, manual_shape, target_column),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multiply=False, per_vocab_size=None):
|
||||
|
@@ -284,7 +282,7 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl
|
|||
5, 21762, 14, 15, 15030, 61, 12220]
|
||||
|
||||
new_vocabs = inidival_vocabs + [1] * \
|
||||
(target_column_number - len(inidival_vocabs))
|
||||
(target_column_number - len(inidival_vocabs))
|
||||
part_size = int(target_column_number / worker_size)
|
||||
|
||||
# According to the workers, we merge some fields into the same part
|
||||
|
@@ -304,21 +302,21 @@ def _get_vocab_size(target_column_number, worker_size, total_vocab_size, multipl
|
|||
# Expands the vocabulary of each field by the multiplier
|
||||
if multiply is True:
|
||||
cur_sum = sum(new_vocab_size)
|
||||
k = total_vocab_size/cur_sum
|
||||
k = total_vocab_size / cur_sum
|
||||
new_vocab_size = [
|
||||
math.ceil(int(item*k)/worker_size)*worker_size for item in new_vocab_size]
|
||||
new_vocab_size = [(item // 8 + 1)*8 for item in new_vocab_size]
|
||||
math.ceil(int(item * k) / worker_size) * worker_size for item in new_vocab_size]
|
||||
new_vocab_size = [(item // 8 + 1) * 8 for item in new_vocab_size]
|
||||
|
||||
else:
|
||||
if total_vocab_size > sum(new_vocab_size):
|
||||
new_vocab_size[-1] = total_vocab_size - \
|
||||
sum(new_vocab_size[:-1])
|
||||
sum(new_vocab_size[:-1])
|
||||
new_vocab_size = [item for item in new_vocab_size]
|
||||
else:
|
||||
raise ValueError(
|
||||
"Please providede the correct vocab size, now is {}".format(total_vocab_size))
|
||||
|
||||
for i in range(worker_size-1):
|
||||
for i in range(worker_size - 1):
|
||||
off = index_offsets[i] + features[i]
|
||||
index_offsets.append(off)
|
||||
|
||||
|
|
|
@@ -17,7 +17,7 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import Model, context
|
||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
|
||||
from mindspore.context import ParallelMode
|
||||
|
@@ -88,7 +88,7 @@ def train_and_eval(config):
|
|||
print("epochs is {}".format(epochs))
|
||||
if config.full_batch:
|
||||
context.set_auto_parallel_context(full_batch=True)
|
||||
de.config.set_seed(1)
|
||||
ds.config.set_seed(1)
|
||||
if config.field_slice:
|
||||
compute_manual_shape(config, get_group_size())
|
||||
ds_train = create_dataset(data_path, train_mode=True, epochs=1,
|
||||
|
|
|
@@ -17,7 +17,7 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import Model, context
|
||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
|
||||
from mindspore.context import ParallelMode
|
||||
|
@@ -92,7 +92,7 @@ def train_and_eval(config):
|
|||
print("epochs is {}".format(epochs))
|
||||
if config.full_batch:
|
||||
context.set_auto_parallel_context(full_batch=True)
|
||||
de.config.set_seed(1)
|
||||
ds.config.set_seed(1)
|
||||
ds_train = create_dataset(data_path, train_mode=True, epochs=1,
|
||||
batch_size=batch_size*get_group_size(), data_type=dataset_type)
|
||||
ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
|
||||
|
|
|
@@ -18,7 +18,7 @@ import math
|
|||
import pickle
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
|
||||
|
@@ -97,8 +97,7 @@ class H5Dataset():
|
|||
np.random.shuffle(sample_index)
|
||||
assert X.shape[0] > 0
|
||||
while True:
|
||||
batch_index = sample_index[batch_size * counter:batch_size *
|
||||
(counter + 1)]
|
||||
batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
|
||||
X_batch = X[batch_index]
|
||||
y_batch = y[batch_index]
|
||||
counter += 1
|
||||
|
@@ -135,9 +134,8 @@ class H5Dataset():
|
|||
X, y, finished = data_gen.__next__()
|
||||
X_id = X[:, 0:self.input_length]
|
||||
X_va = X[:, self.input_length:]
|
||||
yield np.array(X_id.astype(dtype=np.int32)), np.array(
|
||||
X_va.astype(dtype=np.float32)), np.array(
|
||||
y.astype(dtype=np.float32))
|
||||
yield np.array(X_id.astype(dtype=np.int32)), np.array(X_va.astype(dtype=np.float32)), np.array(
|
||||
y.astype(dtype=np.float32))
|
||||
|
||||
|
||||
def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
|
||||
|
@@ -159,10 +157,10 @@ def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
|
|||
for _ in range(0, numbers_of_batch, 1):
|
||||
yield train_eval_gen.__next__()
|
||||
|
||||
ds = de.GeneratorDataset(_iter_h5_data(),
|
||||
["ids", "weights", "labels"])
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(_iter_h5_data(),
|
||||
["ids", "weights", "labels"])
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_tf_dataset(data_dir,
|
||||
|
@@ -184,7 +182,7 @@ def _get_tf_dataset(data_dir,
|
|||
for filename in filenames:
|
||||
if file_prefix_name in filename and "tfrecord" in filename:
|
||||
dataset_files.append(os.path.join(dirpath, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
|
||||
float_key_list = ["label", "continue_val"]
|
||||
|
||||
|
@@ -199,19 +197,19 @@ def _get_tf_dataset(data_dir,
|
|||
schema.add_column(key, de_type=ms_dtype)
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle,
|
||||
schema=schema,
|
||||
num_parallel_workers=8,
|
||||
num_shards=rank_size,
|
||||
shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle,
|
||||
schema=schema,
|
||||
num_parallel_workers=8,
|
||||
num_shards=rank_size,
|
||||
shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle,
|
||||
schema=schema,
|
||||
num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files,
|
||||
shuffle=shuffle,
|
||||
schema=schema,
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
|
||||
operations_list = []
|
||||
for key in columns_list:
|
||||
|
@@ -249,7 +247,7 @@ def _get_tf_dataset(data_dir,
|
|||
u = np.array(u).flatten().reshape(batch_size, -1)
|
||||
return a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u
|
||||
|
||||
ds = ds.map(
|
||||
data_set = data_set.map(
|
||||
operations=mixup,
|
||||
input_columns=[
|
||||
'label', 'continue_val', 'indicator_id', 'emb_128_id',
|
||||
|
@@ -275,8 +273,8 @@ def _get_tf_dataset(data_dir,
|
|||
],
|
||||
num_parallel_workers=8)
|
||||
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def compute_emb_dim(config):
|
||||
|
|
|
@@ -24,16 +24,17 @@ import cv2
|
|||
import numpy as np
|
||||
import pycocotools.coco as coco
|
||||
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
from mindspore import log as logger
|
||||
from mindspore.mindrecord import FileWriter
|
||||
from src.image import color_aug, get_affine_transform, affine_transform
|
||||
from src.image import gaussian_radius, draw_umich_gaussian, draw_msra_gaussian, draw_dense_reg
|
||||
from src.visual import visual_image
|
||||
|
||||
_current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
|
||||
class COCOHP(de.Dataset):
|
||||
class COCOHP(ds.Dataset):
|
||||
"""
|
||||
Encapsulation class of the COCO person keypoints dataset.
|
||||
Initialize and preprocess images for training and testing.
|
||||
|
@@ -47,6 +48,7 @@ class COCOHP(de.Dataset):
|
|||
Returns:
|
||||
Preprocessed training or testing dataset for the CenterNet network.
|
||||
"""
|
||||
|
||||
def __init__(self, data_opt, run_mode="train", net_opt=None, enable_visual_image=False, save_path=None):
|
||||
super(COCOHP, self).__init__()
|
||||
self._data_rng = np.random.RandomState(123)
|
||||
|
@@ -64,7 +66,6 @@ class COCOHP(de.Dataset):
|
|||
if not os.path.exists(self.save_path):
|
||||
os.makedirs(self.save_path)
|
||||
|
||||
|
||||
def init(self, data_dir, keep_res=False, flip_test=False):
|
||||
"""initailize additional info"""
|
||||
logger.info('Initializing coco 2017 {} data.'.format(self.run_mode))
|
||||
|
@@ -124,7 +125,7 @@ class COCOHP(de.Dataset):
|
|||
for img_id in self.images:
|
||||
image_info = self.coco.loadImgs([img_id])
|
||||
annos = self.coco.loadAnns(self.anns[img_id])
|
||||
#get image
|
||||
# get image
|
||||
img_name = image_info[0]['file_name']
|
||||
img_name = os.path.join(self.image_path, img_name)
|
||||
with open(img_name, 'rb') as f:
|
||||
|
@@ -147,19 +148,16 @@ class COCOHP(de.Dataset):
|
|||
writer.commit()
|
||||
logger.info("Create Mindrecord Done, at {}".format(mindrecord_dir))
|
||||
|
||||
|
||||
def _coco_box_to_bbox(self, box):
|
||||
bbox = np.array([box[0], box[1], box[0] + box[2], box[1] + box[3]], dtype=np.float32)
|
||||
return bbox
|
||||
|
||||
|
||||
def _get_border(self, border, size):
|
||||
i = 1
|
||||
while size - border // i <= border // i:
|
||||
i *= 2
|
||||
return border // i
|
||||
|
||||
|
||||
def __getitem__(self, index):
|
||||
img_id = self.images[index]
|
||||
file_name = self.coco.loadImgs(ids=[img_id])[0]['file_name']
|
||||
|
@@ -169,7 +167,6 @@ class COCOHP(de.Dataset):
|
|||
ret = (img, image_id)
|
||||
return ret
|
||||
|
||||
|
||||
def pre_process_for_test(self, image, img_id, scale, meta=None):
|
||||
"""image pre-process for evaluation"""
|
||||
b, h, w, ch = image.shape
|
||||
|
@@ -249,7 +246,6 @@ class COCOHP(de.Dataset):
|
|||
|
||||
return images, meta
|
||||
|
||||
|
||||
def preprocess_fn(self, img, num_objects, keypoints, bboxes, category_id):
|
||||
"""image pre-process and augmentation"""
|
||||
num_objs = min(num_objects, self.data_opt.max_objs)
|
||||
|
@@ -269,12 +265,12 @@ class COCOHP(de.Dataset):
|
|||
else:
|
||||
sf = self.data_opt.scale
|
||||
cf = self.data_opt.shift
|
||||
c[0] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
|
||||
c[1] += s * np.clip(np.random.randn()*cf, -2*cf, 2*cf)
|
||||
s = s * np.clip(np.random.randn()*sf + 1, 1 - sf, 1 + sf)
|
||||
c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
|
||||
c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf)
|
||||
s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
|
||||
if np.random.random() < self.data_opt.aug_rot:
|
||||
rf = self.data_opt.rotate
|
||||
rot = np.clip(np.random.randn()*rf, -rf*2, rf*2)
|
||||
rot = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
|
||||
|
||||
if np.random.random() < self.data_opt.flip_prop:
|
||||
flipped = True
|
||||
|
@@ -323,7 +319,7 @@ class COCOHP(de.Dataset):
|
|||
cls_id = int(category_id[k]) - 1
|
||||
pts = np.array(keypoints[k], np.float32).reshape(num_joints, 3)
|
||||
if flipped:
|
||||
bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero
|
||||
bbox[[0, 2]] = width - bbox[[2, 0]] - 1 # index begin from zero
|
||||
pts[:, 0] = width - pts[:, 0] - 1
|
||||
for e in self.data_opt.flip_idx:
|
||||
pts[e[0]], pts[e[1]] = pts[e[1]].copy(), pts[e[0]].copy()
|
||||
|
@@ -360,7 +356,7 @@ class COCOHP(de.Dataset):
|
|||
if pts[j, 2] > 0:
|
||||
pts[j, :2] = affine_transform(pts[j, :2], trans_output_rot)
|
||||
if pts[j, 0] >= 0 and pts[j, 0] < output_res and \
|
||||
pts[j, 1] >= 0 and pts[j, 1] < output_res:
|
||||
pts[j, 1] >= 0 and pts[j, 1] < output_res:
|
||||
kps[k, j * 2: j * 2 + 2] = pts[j, :2] - ct_int
|
||||
kps_mask[k, j * 2: j * 2 + 2] = 1
|
||||
pt_int = pts[j, :2].astype(np.int32)
|
||||
|
@@ -399,7 +395,6 @@ class COCOHP(de.Dataset):
|
|||
visual_image(out_img, ground_truth, self.save_path, ratio=self.data_opt.input_res[0] // output_res)
|
||||
return ret
|
||||
|
||||
|
||||
def create_train_dataset(self, mindrecord_dir, prefix="coco_hp.train.mind", batch_size=1,
|
||||
device_num=1, rank=0, num_parallel_workers=1, do_shuffle=True):
|
||||
"""create train dataset based on mindrecord file"""
|
||||
|
@@ -415,41 +410,43 @@ class COCOHP(de.Dataset):
|
|||
raise ValueError('data_dir {} have no data files'.format(mindrecord_dir))
|
||||
|
||||
columns = ["image", "num_objects", "keypoints", "bbox", "category_id"]
|
||||
ds = de.MindDataset(data_files,
|
||||
columns_list=columns,
|
||||
num_parallel_workers=num_parallel_workers, shuffle=do_shuffle,
|
||||
num_shards=device_num, shard_id=rank)
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
data_set = ds.MindDataset(data_files,
|
||||
columns_list=columns,
|
||||
num_parallel_workers=num_parallel_workers, shuffle=do_shuffle,
|
||||
num_shards=device_num, shard_id=rank)
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
logger.info('origin dataset size: {}'.format(ori_dataset_size))
|
||||
|
||||
ds = ds.map(operations=self.preprocess_fn,
|
||||
input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"],
|
||||
output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
|
||||
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
|
||||
column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
|
||||
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
|
||||
num_parallel_workers=num_parallel_workers,
|
||||
python_multiprocessing=True)
|
||||
ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(ds.get_repeat_count()))
|
||||
return ds
|
||||
|
||||
data_set = data_set.map(operations=self.preprocess_fn,
|
||||
input_columns=["image", "num_objects", "keypoints", "bbox", "category_id"],
|
||||
output_columns=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
|
||||
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
|
||||
column_order=["image", "hm", "reg_mask", "ind", "wh", "kps", "kps_mask",
|
||||
"reg", "hm_hp", "hp_offset", "hp_ind", "hp_mask"],
|
||||
num_parallel_workers=num_parallel_workers,
|
||||
python_multiprocessing=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set
|
||||
|
||||
def create_eval_dataset(self, batch_size=1, num_parallel_workers=1):
|
||||
"""create testing dataset based on coco format"""
|
||||
|
||||
def generator():
|
||||
for i in range(self.num_samples):
|
||||
yield self.__getitem__(i)
|
||||
|
||||
column = ["image", "image_id"]
|
||||
ds = de.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers)
|
||||
ds = ds.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(generator, column, num_parallel_workers=num_parallel_workers)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True, num_parallel_workers=8)
|
||||
return data_set
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# Convert coco2017 dataset to mindrecord to improve performance on host
|
||||
from src.config import dataset_config
|
||||
|
||||
parser = argparse.ArgumentParser(description='CenterNet MindRecord dataset')
|
||||
parser.add_argument("--coco_data_dir", type=str, default="", help="Coco dataset directory.")
|
||||
parser.add_argument("--mindrecord_dir", type=str, default="", help="MindRecord dataset dir.")
|
||||
|
|
|
@@ -17,7 +17,7 @@ create train or eval dataset.
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.vision.c_transforms as C
|
||||
import mindspore.dataset.transforms.vision.py_transforms as P
|
||||
import mindspore.dataset.transforms.c_transforms as C2
|
||||
|
@@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
rank_size = int(os.getenv("RANK_SIZE"))
|
||||
rank_id = int(os.getenv("RANK_ID"))
|
||||
if rank_size == 1:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=True)
|
||||
else:
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
elif platform == "GPU":
|
||||
if do_train:
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
else:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=True)
|
||||
else:
|
||||
raise ValueError("Unsupport platform.")
|
||||
|
@@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
|
||||
color_op = C.RandomColorAdjust(
|
||||
brightness=0.4, contrast=0.4, saturation=0.4)
|
||||
rescale_op = C.Rescale(1/255.0, 0)
|
||||
rescale_op = C.Rescale(1 / 255.0, 0)
|
||||
normalize_op = C.Normalize(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
change_swap_op = C.HWC2CHW()
|
||||
|
@@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
trans = composeop()
|
||||
type_cast_op = C2.TypeCast(mstype.int32)
|
||||
|
||||
ds = ds.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
|
||||
# apply shuffle operations
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
# apply dataset repeat operation
|
||||
ds = ds.repeat(repeat_num)
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
|
|
@@ -17,7 +17,7 @@ create train or eval dataset.
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.vision.c_transforms as C
|
||||
import mindspore.dataset.transforms.vision.py_transforms as P
|
||||
import mindspore.dataset.transforms.c_transforms as C2
|
||||
|
@@ -41,18 +41,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
rank_size = int(os.getenv("RANK_SIZE"))
|
||||
rank_id = int(os.getenv("RANK_ID"))
|
||||
if rank_size == 1:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=True)
|
||||
else:
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
elif platform == "GPU":
|
||||
if do_train:
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
else:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=True)
|
||||
else:
|
||||
raise ValueError("Unsupport platform.")
|
||||
|
@@ -67,7 +67,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
|
||||
color_op = C.RandomColorAdjust(
|
||||
brightness=0.4, contrast=0.4, saturation=0.4)
|
||||
rescale_op = C.Rescale(1/255.0, 0)
|
||||
rescale_op = C.Rescale(1 / 255.0, 0)
|
||||
normalize_op = C.Normalize(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
change_swap_op = C.HWC2CHW()
|
||||
|
@@ -93,18 +93,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
trans = composeop()
|
||||
type_cast_op = C2.TypeCast(mstype.int32)
|
||||
|
||||
ds = ds.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
|
||||
# apply shuffle operations
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
# apply dataset repeat operation
|
||||
ds = ds.repeat(repeat_num)
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
|
|
@@ -17,7 +17,7 @@ create train or eval dataset.
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
import mindspore.dataset.vision.py_transforms as P
|
||||
import mindspore.dataset.transforms.c_transforms as C2
|
||||
|
@@ -42,18 +42,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
rank_size = int(os.getenv("RANK_SIZE"))
|
||||
rank_id = int(os.getenv("RANK_ID"))
|
||||
if rank_size == 1:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=True)
|
||||
else:
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id)
|
||||
elif platform == "GPU":
|
||||
if do_train:
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
data_set = ds.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank())
|
||||
else:
|
||||
ds = de.MindDataset(
|
||||
data_set = ds.MindDataset(
|
||||
dataset_path, num_parallel_workers=8, shuffle=False)
|
||||
else:
|
||||
raise ValueError("Unsupport platform.")
|
||||
|
@@ -68,7 +68,7 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
|
||||
color_op = C.RandomColorAdjust(
|
||||
brightness=0.4, contrast=0.4, saturation=0.4)
|
||||
rescale_op = C.Rescale(1/255.0, 0)
|
||||
rescale_op = C.Rescale(1 / 255.0, 0)
|
||||
normalize_op = C.Normalize(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
|
||||
change_swap_op = C.HWC2CHW()
|
||||
|
@@ -88,18 +88,18 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
|
|||
trans = composeop
|
||||
type_cast_op = C2.TypeCast(mstype.int32)
|
||||
|
||||
ds = ds.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
ds = ds.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="image", operations=trans,
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(input_columns="label_list",
|
||||
operations=type_cast_op, num_parallel_workers=8)
|
||||
|
||||
# apply shuffle operations
|
||||
ds = ds.shuffle(buffer_size=buffer_size)
|
||||
data_set = data_set.shuffle(buffer_size=buffer_size)
|
||||
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
# apply dataset repeat operation
|
||||
ds = ds.repeat(repeat_num)
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
|
|
@@ -17,7 +17,7 @@ create train or eval dataset.
|
|||
"""
|
||||
import os
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
import mindspore.dataset.transforms.c_transforms as C2
|
||||
from mindspore.communication.management import init, get_rank, get_group_size
|
||||
|
@@ -48,15 +48,15 @@ def create_dataset_cifar(dataset_path,
|
|||
device_num = get_group_size()
|
||||
|
||||
if device_num == 1:
|
||||
ds = de.Cifar10Dataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True)
|
||||
data_set = ds.Cifar10Dataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True)
|
||||
else:
|
||||
ds = de.Cifar10Dataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True,
|
||||
num_shards=device_num,
|
||||
shard_id=rank_id)
|
||||
data_set = ds.Cifar10Dataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True,
|
||||
num_shards=device_num,
|
||||
shard_id=rank_id)
|
||||
|
||||
# define map operations
|
||||
if do_train:
|
||||
|
@@ -80,20 +80,20 @@ def create_dataset_cifar(dataset_path,
|
|||
|
||||
type_cast_op = C2.TypeCast(mstype.int32)
|
||||
|
||||
ds = ds.map(operations=type_cast_op,
|
||||
input_columns="label",
|
||||
num_parallel_workers=8)
|
||||
ds = ds.map(operations=trans,
|
||||
input_columns="image",
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(operations=type_cast_op,
|
||||
input_columns="label",
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(operations=trans,
|
||||
input_columns="image",
|
||||
num_parallel_workers=8)
|
||||
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
# apply dataset repeat operation
|
||||
ds = ds.repeat(repeat_num)
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
||||
|
||||
def create_dataset_imagenet(dataset_path,
|
||||
|
@@ -122,15 +122,15 @@ def create_dataset_imagenet(dataset_path,
|
|||
device_num = get_group_size()
|
||||
|
||||
if device_num == 1:
|
||||
ds = de.ImageFolderDataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True)
|
||||
data_set = ds.ImageFolderDataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True)
|
||||
else:
|
||||
ds = de.ImageFolderDataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True,
|
||||
num_shards=device_num,
|
||||
shard_id=rank_id)
|
||||
data_set = ds.ImageFolderDataset(dataset_path,
|
||||
num_parallel_workers=8,
|
||||
shuffle=True,
|
||||
num_shards=device_num,
|
||||
shard_id=rank_id)
|
||||
|
||||
image_size = 227
|
||||
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
|
||||
|
@@ -159,20 +159,20 @@ def create_dataset_imagenet(dataset_path,
|
|||
|
||||
type_cast_op = C2.TypeCast(mstype.int32)
|
||||
|
||||
ds = ds.map(operations=type_cast_op,
|
||||
input_columns="label",
|
||||
num_parallel_workers=8)
|
||||
ds = ds.map(operations=trans,
|
||||
input_columns="image",
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(operations=type_cast_op,
|
||||
input_columns="label",
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.map(operations=trans,
|
||||
input_columns="image",
|
||||
num_parallel_workers=8)
|
||||
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
|
||||
# apply dataset repeat operation
|
||||
ds = ds.repeat(repeat_num)
|
||||
data_set = data_set.repeat(repeat_num)
|
||||
|
||||
return ds
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_rank_info():
|
||||
|
|
|
@@ -21,7 +21,7 @@ from enum import Enum
|
|||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
from .config import DataConfig
|
||||
|
@@ -142,8 +142,8 @@ class H5Dataset():
|
|||
X_id = X[:, 0:self.max_length]
|
||||
X_va = X[:, self.max_length:]
|
||||
yield np.array(X_id.astype(dtype=np.int32)), \
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
|
||||
|
||||
def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
||||
|
@@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
|||
for _ in range(0, numbers_of_batch, 1):
|
||||
yield train_eval_gen.__next__()
|
||||
|
||||
ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
|
|||
shuffle = train_mode
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
else:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
|||
for filename in filenames:
|
||||
if file_prefixt_name in filename and 'tfrecord' in filename:
|
||||
dataset_files.append(os.path.join(dir_path, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
schema.add_column('feat_ids', de_type=mstype.int32)
|
||||
schema.add_column('feat_vals', de_type=mstype.float32)
|
||||
schema.add_column('label', de_type=mstype.float32)
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (
|
||||
np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
|
|
@@ -21,7 +21,7 @@ from enum import Enum
|
|||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
from .config import DataConfig
|
||||
|
@@ -142,8 +142,8 @@ class H5Dataset():
|
|||
X_id = X[:, 0:self.max_length]
|
||||
X_va = X[:, self.max_length:]
|
||||
yield np.array(X_id.astype(dtype=np.int32)), \
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
np.array(X_va.astype(dtype=np.float32)), \
|
||||
np.array(y.astype(dtype=np.float32))
|
||||
|
||||
|
||||
def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
||||
|
@@ -172,9 +172,9 @@ def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
|
|||
for _ in range(0, numbers_of_batch, 1):
|
||||
yield train_eval_gen.__next__()
|
||||
|
||||
ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"], num_samples=3000)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@@ -199,23 +199,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
|
|||
shuffle = train_mode
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
else:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
@@ -242,28 +242,28 @@ def _get_tf_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
|||
for filename in filenames:
|
||||
if file_prefixt_name in filename and 'tfrecord' in filename:
|
||||
dataset_files.append(os.path.join(dir_path, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
schema.add_column('feat_ids', de_type=mstype.int32)
|
||||
schema.add_column('feat_vals', de_type=mstype.float32)
|
||||
schema.add_column('label', de_type=mstype.float32)
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True, num_samples=3000)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id,
|
||||
shard_equal_rows=True, num_samples=3000)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8, num_samples=3000)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle,
|
||||
schema=schema, num_parallel_workers=8, num_samples=3000)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (
|
||||
np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
|
|
@@ -24,17 +24,18 @@ from mindspore.nn.optim import Adam
|
|||
from mindspore.train.model import Model
|
||||
from mindspore.train.loss_scale_manager import DynamicLossScaleManager
|
||||
from mindspore.train.callback import Callback
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as deC
|
||||
from mindspore import context
|
||||
from model_zoo.official.nlp.transformer.src.transformer_model import TransformerConfig
|
||||
from model_zoo.official.nlp.transformer.src.transformer_for_train import TransformerNetworkWithLoss, \
|
||||
TransformerTrainOneStepWithLossScaleCell
|
||||
TransformerTrainOneStepWithLossScaleCell
|
||||
from model_zoo.official.nlp.transformer.src.config import cfg, transformer_net_cfg
|
||||
from model_zoo.official.nlp.transformer.src.lr_schedule import create_dynamic_lr
|
||||
|
||||
DATA_DIR = ["/home/workspace/mindspore_dataset/transformer/test-mindrecord"]
|
||||
|
||||
|
||||
def get_config(version='base', batch_size=1):
|
||||
"""get config"""
|
||||
if version == 'large':
|
||||
|
@@ -75,23 +76,25 @@ def get_config(version='base', batch_size=1):
|
|||
transformer_cfg = TransformerConfig(batch_size=batch_size)
|
||||
return transformer_cfg
|
||||
|
||||
|
||||
def load_test_data(batch_size=1, data_file=None):
|
||||
"""Load test dataset."""
|
||||
ds = de.MindDataset(data_file,
|
||||
columns_list=["source_eos_ids", "source_eos_mask",
|
||||
"target_sos_ids", "target_sos_mask",
|
||||
"target_eos_ids", "target_eos_mask"],
|
||||
shuffle=False)
|
||||
data_set = ds.MindDataset(data_file,
|
||||
columns_list=["source_eos_ids", "source_eos_mask",
|
||||
"target_sos_ids", "target_sos_mask",
|
||||
"target_eos_ids", "target_eos_mask"],
|
||||
shuffle=False)
|
||||
type_cast_op = deC.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="source_eos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_sos_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="target_eos_mask")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
return data_set
|
||||
|
||||
|
||||
class ModelCallback(Callback):
|
||||
def __init__(self):
|
||||
|
@@ -107,13 +110,16 @@ class ModelCallback(Callback):
|
|||
self.lossscale_list.append(cb_params.net_outputs[2].asnumpy())
|
||||
print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))
|
||||
|
||||
|
||||
class TimeMonitor(Callback):
|
||||
"""Time Monitor."""
|
||||
|
||||
def __init__(self, data_size):
|
||||
super(TimeMonitor, self).__init__()
|
||||
self.data_size = data_size
|
||||
self.epoch_mseconds_list = []
|
||||
self.per_step_mseconds_list = []
|
||||
|
||||
def epoch_begin(self, run_context):
|
||||
self.epoch_time = time.time()
|
||||
|
||||
|
@@ -122,6 +128,7 @@ class TimeMonitor(Callback):
|
|||
self.epoch_mseconds_list.append(epoch_mseconds)
|
||||
self.per_step_mseconds_list.append(epoch_mseconds / self.data_size)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_arm_ascend_training
|
||||
@pytest.mark.platform_x86_ascend_training
|
||||
|
@@ -142,7 +149,7 @@ def test_transformer():
|
|||
netwithloss = TransformerNetworkWithLoss(config, True)
|
||||
|
||||
lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
|
||||
training_steps=dataset.get_dataset_size()*epoch_size,
|
||||
training_steps=dataset.get_dataset_size() * epoch_size,
|
||||
learning_rate=cfg.lr_schedule.learning_rate,
|
||||
warmup_steps=cfg.lr_schedule.warmup_steps,
|
||||
hidden_size=config.hidden_size), mstype.float32)
|
||||
|
@@ -193,5 +200,6 @@ def test_transformer():
|
|||
print("per step mseconds: {}".format(per_step_mseconds))
|
||||
assert per_step_mseconds <= expect_per_step_mseconds + 2
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_transformer()
|
||||
|
|
|
@@ -14,13 +14,13 @@
|
|||
# ============================================================================
|
||||
"""train_imagenet."""
|
||||
|
||||
|
||||
import os
|
||||
from enum import Enum
|
||||
import numpy as np
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
|
||||
|
||||
class DataType(Enum):
|
||||
"""
|
||||
Enumerate supported dataset format.
|
||||
|
@@ -29,6 +29,7 @@ class DataType(Enum):
|
|||
TFRECORD = 2
|
||||
H5 = 3
|
||||
|
||||
|
||||
def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
|
||||
line_per_sample=1000, rank_size=None, rank_id=None):
|
||||
"""
|
||||
|
@@ -41,26 +42,29 @@ def _get_tf_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
|
|||
for filename in filenames:
|
||||
if file_prefix_name in filename and "tfrecord" in filename:
|
||||
dataset_files.append(os.path.join(dirpath, filename))
|
||||
schema = de.Schema()
|
||||
schema = ds.Schema()
|
||||
schema.add_column('feat_ids', de_type=mstype.int32)
|
||||
schema.add_column('feat_vals', de_type=mstype.float32)
|
||||
schema.add_column('label', de_type=mstype.float32)
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
|
||||
num_parallel_workers=8,
|
||||
num_shards=rank_size, shard_id=rank_id, shard_equal_rows=True)
|
||||
else:
|
||||
ds = de.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample),
|
||||
drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (
|
||||
data_set = ds.TFRecordDataset(dataset_files=dataset_files, shuffle=shuffle, schema=schema,
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample),
|
||||
drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (
|
||||
np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
|
||||
#if train_mode:
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'], num_parallel_workers=8)
|
||||
# if train_mode:
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
|
||||
line_per_sample=1000, rank_size=None, rank_id=None):
|
||||
|
@@ -84,23 +88,23 @@ def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=100
|
|||
shuffle = train_mode
|
||||
|
||||
if rank_size is not None and rank_id is not None:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
num_shards=rank_size, shard_id=rank_id, shuffle=shuffle,
|
||||
num_parallel_workers=8)
|
||||
else:
|
||||
ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
ds = ds.repeat(epochs)
|
||||
return ds
|
||||
data_set = ds.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
|
||||
columns_list=['feat_ids', 'feat_vals', 'label'],
|
||||
shuffle=shuffle, num_parallel_workers=8)
|
||||
data_set = data_set.batch(int(batch_size / line_per_sample), drop_remainder=True)
|
||||
data_set = data_set.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
|
||||
np.array(y).flatten().reshape(batch_size, 39),
|
||||
np.array(z).flatten().reshape(batch_size, 1))),
|
||||
input_columns=['feat_ids', 'feat_vals', 'label'],
|
||||
column_order=['feat_ids', 'feat_vals', 'label'],
|
||||
num_parallel_workers=8)
|
||||
data_set = data_set.repeat(epochs)
|
||||
return data_set
|
||||
|
||||
|
||||
def create_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000,
|
||||
|
|
|
@@ -20,7 +20,7 @@ import time
|
|||
import numpy as np
|
||||
import pytest
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
from mindspore import context
|
||||
from mindspore import log as logger
|
||||
|
@@ -35,7 +35,6 @@ from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertNetworkWit
|
|||
from model_zoo.official.nlp.bert.src.bert_for_pre_training import BertTrainOneStepWithLossScaleCell
|
||||
from model_zoo.official.nlp.bert.src.bert_model import BertConfig
|
||||
|
||||
|
||||
_current_dir = os.path.dirname(os.path.realpath(__file__))
|
||||
DATA_DIR = ["/home/workspace/mindspore_dataset/bert/example/examples.tfrecord"]
|
||||
SCHEMA_DIR = "/home/workspace/mindspore_dataset/bert/example/datasetSchema.json"
|
||||
|
@@ -88,25 +87,26 @@ def me_de_train_dataset(sink_mode=False):
|
|||
repeat_count = 1
|
||||
sink_size = -1
|
||||
batch_size = 16
|
||||
ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"next_sentence_labels", "masked_lm_positions",
|
||||
"masked_lm_ids", "masked_lm_weights"], shuffle=False)
|
||||
data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"next_sentence_labels", "masked_lm_positions",
|
||||
"masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=False)
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
new_repeat_count = repeat_count
|
||||
if sink_mode:
|
||||
sink_size = 100
|
||||
new_repeat_count = 3
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat_count: {}".format(ds.get_repeat_count()))
|
||||
return ds, new_repeat_count, sink_size
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat_count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set, new_repeat_count, sink_size
|
||||
|
||||
|
||||
def weight_variable(shape):
|
||||
|
@@ -155,13 +155,16 @@ class ModelCallback(Callback):
|
|||
self.lossscale_list.append(cb_params.net_outputs[2].asnumpy())
|
||||
print("epoch: {}, outputs are: {}".format(cb_params.cur_epoch_num, str(cb_params.net_outputs)))
|
||||
|
||||
|
||||
class TimeMonitor(Callback):
|
||||
"""Time Monitor."""
|
||||
|
||||
def __init__(self, data_size):
|
||||
super(TimeMonitor, self).__init__()
|
||||
self.data_size = data_size
|
||||
self.epoch_mseconds_list = []
|
||||
self.per_step_mseconds_list = []
|
||||
|
||||
def epoch_begin(self, run_context):
|
||||
self.epoch_time = time.time()
|
||||
|
||||
|
@@ -178,7 +181,7 @@ class TimeMonitor(Callback):
|
|||
def test_bert_performance():
|
||||
"""test bert performance"""
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
|
||||
ds, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
|
||||
data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
|
||||
version = os.getenv('VERSION', 'large')
|
||||
config = get_config(version=version)
|
||||
netwithloss = BertNetworkWithLoss(config, True)
|
||||
|
@@ -221,7 +224,7 @@ def test_bert_performance():
|
|||
logger.info("***************** BERT param name is 3 {}".format(name))
|
||||
param.set_data(weight_variable(value.asnumpy().shape))
|
||||
time_monitor_callback = TimeMonitor(sink_size)
|
||||
model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback],
|
||||
model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback],
|
||||
dataset_sink_mode=True, sink_size=sink_size)
|
||||
|
||||
# assertion occurs while the loss value, overflow state or loss_scale value is wrong
|
||||
|
@@ -250,5 +253,6 @@ def test_bert_performance():
|
|||
print("per step mseconds: {}".format(per_step_mseconds))
|
||||
assert per_step_mseconds <= expect_per_step_mseconds + 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_bert_performance()
|
||||
|
|
|
@@ -20,7 +20,7 @@ import time
|
|||
from multiprocessing import Process, Queue
|
||||
import pytest
|
||||
import numpy as np
|
||||
import mindspore.dataset as dataset
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.communication.management as D
|
||||
from mindspore import context
|
||||
|
@@ -28,7 +28,6 @@ from mindspore import log as logger
|
|||
from mindspore.train.callback import Callback
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
from model_zoo.official.nlp.bert_thor.src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepCell
|
||||
from model_zoo.official.nlp.bert_thor.src.bert_net_config import bert_net_cfg
|
||||
|
@ -45,11 +44,13 @@ train_steps = 200
|
|||
batch_size = 12
|
||||
|
||||
np.random.seed(1)
|
||||
dataset.config.set_seed(1)
|
||||
ds.config.set_seed(1)
|
||||
os.environ['GLOG_v'] = str(2)
|
||||
|
||||
|
||||
class TimeMonitor(Callback):
|
||||
"""Time Monitor."""
|
||||
|
||||
def __init__(self, data_size):
|
||||
super(TimeMonitor, self).__init__()
|
||||
self.data_size = data_size
|
||||
|
@ -67,6 +68,7 @@ class TimeMonitor(Callback):
|
|||
self.per_step_mseconds_list.append(per_step_mseconds)
|
||||
print("epoch: {}, per_step_mseconds are {}".format(cb_params.cur_epoch_num, str(per_step_mseconds)), flush=True)
|
||||
|
||||
|
||||
class LossCallback(Callback):
|
||||
def __init__(self):
|
||||
super(LossCallback, self).__init__()
|
||||
|
@ -78,6 +80,7 @@ class LossCallback(Callback):
|
|||
print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
|
||||
str(cb_params.net_outputs)), flush=True)
|
||||
|
||||
|
||||
def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None):
|
||||
"""create train dataset"""
|
||||
# apply repeat operations
|
||||
|
@ -87,25 +90,25 @@ def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None,
|
|||
if "tfrecord" in file_name:
|
||||
data_files.append(os.path.join(data_dir, file_name))
|
||||
data_files = sorted(data_files)
|
||||
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
|
||||
ori_dataset_size = ds.get_dataset_size()
|
||||
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
|
||||
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
|
||||
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=ds.Shuffle.FILES if do_shuffle == "true" else False,
|
||||
num_shards=device_num, shard_id=rank, shard_equal_rows=True)
|
||||
ori_dataset_size = data_set.get_dataset_size()
|
||||
print('origin dataset size: ', ori_dataset_size)
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(ds.get_repeat_count()))
|
||||
return ds
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set
|
||||
|
||||
|
||||
def _set_bert_all_reduce_split():
|
||||
|
@ -151,13 +154,13 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
|
|||
device_num=device_num)
|
||||
|
||||
bert_net_cfg.num_hidden_layers = 4
|
||||
ds = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH, schema_dir=None)
|
||||
data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH,
|
||||
schema_dir=None)
|
||||
net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)
|
||||
|
||||
new_repeat_count = epoch_size * ds.get_dataset_size() // data_sink_steps
|
||||
new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps
|
||||
new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)
|
||||
|
||||
|
||||
lr = get_bert_lr()
|
||||
damping = get_bert_damping()
|
||||
optimizer = THOR(filter(lambda x: x.requires_grad, net_with_loss.get_parameters()), lr, cfg.Thor.momentum,
|
||||
|
@ -175,7 +178,7 @@ def train_process_bert_thor(q, device_id, epoch_size, device_num):
|
|||
|
||||
net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
|
||||
model = Model(net_with_grads, frequency=cfg.Thor.frequency)
|
||||
model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)
|
||||
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)
|
||||
|
||||
loss_list = loss_callback.loss_list
|
||||
per_step_mseconds = time_monitor_callback.per_step_mseconds_list
|
||||
|
@ -230,5 +233,6 @@ def test_bert_thor_mlperf_8p():
|
|||
assert mean_cost < 64.2
|
||||
assert mean_loss < 7.9
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test_bert_thor_mlperf_8p()
|
||||
|
|
|
@ -20,7 +20,7 @@ import time
|
|||
import numpy as np
|
||||
import pytest
|
||||
import mindspore.common.dtype as mstype
|
||||
import mindspore.dataset.engine.datasets as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
from mindspore import context
|
||||
from mindspore import log as logger
|
||||
|
@ -87,25 +87,26 @@ def me_de_train_dataset(sink_mode=False):
|
|||
repeat_count = 1
|
||||
sink_size = -1
|
||||
batch_size = 16
|
||||
ds = de.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"next_sentence_labels", "masked_lm_positions",
|
||||
"masked_lm_ids", "masked_lm_weights"], shuffle=False)
|
||||
data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
|
||||
"next_sentence_labels", "masked_lm_positions",
|
||||
"masked_lm_ids", "masked_lm_weights"],
|
||||
shuffle=False)
|
||||
type_cast_op = C.TypeCast(mstype.int32)
|
||||
new_repeat_count = repeat_count
|
||||
if sink_mode:
|
||||
sink_size = 100
|
||||
new_repeat_count = 3
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
|
||||
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
|
||||
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
|
||||
# apply batch operations
|
||||
ds = ds.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(ds.get_dataset_size()))
|
||||
logger.info("repeat_count: {}".format(ds.get_repeat_count()))
|
||||
return ds, new_repeat_count, sink_size
|
||||
data_set = data_set.batch(batch_size, drop_remainder=True)
|
||||
logger.info("data size: {}".format(data_set.get_dataset_size()))
|
||||
logger.info("repeat_count: {}".format(data_set.get_repeat_count()))
|
||||
return data_set, new_repeat_count, sink_size
|
||||
|
||||
|
||||
def weight_variable(shape):
|
||||
|
@ -178,11 +179,11 @@ def test_bert_percision(enable_graph_kernel=False):
|
|||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
|
||||
if enable_graph_kernel:
|
||||
context.set_context(enable_graph_kernel=True)
|
||||
ds, new_repeat_count, _ = me_de_train_dataset()
|
||||
data_set, new_repeat_count, _ = me_de_train_dataset()
|
||||
version = os.getenv('VERSION', 'large')
|
||||
config = get_config(version=version)
|
||||
netwithloss = BertNetworkWithLoss(config, True)
|
||||
lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
|
||||
lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
|
||||
learning_rate=5e-5, end_learning_rate=1e-9,
|
||||
power=10.0, warmup_steps=0)
|
||||
decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
|
||||
|
@ -218,7 +219,7 @@ def test_bert_percision(enable_graph_kernel=False):
|
|||
else:
|
||||
logger.info("***************** BERT param name is 3 {}".format(name))
|
||||
param.set_data(weight_variable(value.asnumpy().shape))
|
||||
model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False)
|
||||
model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)
|
||||
|
||||
# assertion occurs while the loss value, overflow state or loss_scale value is wrong
|
||||
loss_value = np.array(callback.loss_list)
|
||||
|
|
|
@ -17,7 +17,7 @@ Data operations, will be used in run_pretrain.py
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from .config import bert_net_cfg

@ -32,24 +32,24 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", d
for file_name in files:
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name))
ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
shard_equal_rows=True)
ori_dataset_size = ds.get_dataset_size()
data_set = ds.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
shard_equal_rows=True)
ori_dataset_size = data_set.get_dataset_size()
print('origin dataset size: ', ori_dataset_size)
new_repeat_count = int(repeat_count * ori_dataset_size // ds.get_dataset_size())
new_repeat_count = int(repeat_count * ori_dataset_size // data_set.get_dataset_size())
type_cast_op = C.TypeCast(mstype.int32)
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
ds = ds.map(operations=type_cast_op, input_columns="input_mask")
ds = ds.map(operations=type_cast_op, input_columns="input_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
# apply batch operations
ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
ds = ds.repeat(max(new_repeat_count, repeat_count))
logger.info("data size: {}".format(ds.get_dataset_size()))
logger.info("repeatcount: {}".format(ds.get_repeat_count()))
return ds, new_repeat_count
data_set = data_set.batch(bert_net_cfg.batch_size, drop_remainder=True)
data_set = data_set.repeat(max(new_repeat_count, repeat_count))
logger.info("data size: {}".format(data_set.get_dataset_size()))
logger.info("repeatcount: {}".format(data_set.get_repeat_count()))
return data_set, new_repeat_count
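The hunks above also show why the local variable is renamed along with the import: once the module itself is bound to ds, a pipeline variable named ds would shadow it and break any later call such as ds.TFRecordDataset. A short sketch of the convention, with a placeholder file path and batch size:

import mindspore.dataset as ds  # module alias

# Bind the pipeline to a different name so the module alias stays usable.
data_set = ds.TFRecordDataset(["/path/to/part-0.tfrecord"], shuffle=False)  # placeholder path
data_set = data_set.batch(16, drop_remainder=True)
print("dataset size:", data_set.get_dataset_size())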
@ -17,7 +17,7 @@

import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2

@ -39,10 +39,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
device_num = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID"))
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)

image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]

@ -65,15 +65,14 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
C.HWC2CHW()
]

type_cast_op = C2.TypeCast(mstype.int32)

ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)

# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set
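The image-classification loader above and the one that follows have the same shape, so a single condensed sketch of the new-style version is enough. It uses only transforms that appear elsewhere in this diff, abbreviates the full training transform list, and treats dataset_path, batch size and resize shape as placeholders rather than model_zoo values.

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C

def create_imagefolder_dataset(dataset_path, batch_size=32, repeat_num=1):
    """Sketch of an ImageFolder pipeline using the top-level mindspore.dataset import."""
    data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)

    # Abbreviated transform chain; the real scripts add crops, flips and normalization.
    trans = [C.Decode(), C.Resize((224, 224)), C.HWC2CHW()]
    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set.repeat(repeat_num)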
@ -18,12 +18,11 @@
import os

import mindspore.common.dtype as mstype
import mindspore.dataset as dataset
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C

dataset.config.set_seed(1)
ds.config.set_seed(1)


def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):

@ -43,10 +42,10 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
device_num = int(os.getenv("RANK_SIZE"))
rank_id = int(os.getenv("RANK_ID"))
if device_num == 1:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True)
else:
ds = de.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)

image_size = 224
mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]

@ -71,12 +70,12 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):

type_cast_op = C2.TypeCast(mstype.int32)

ds = ds.map(operations=trans, input_columns="image", num_parallel_workers=8)
ds = ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)

# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)

# apply dataset repeat operation
ds = ds.repeat(repeat_num)
return ds
data_set = data_set.repeat(repeat_num)
return data_set
@ -14,11 +14,10 @@
# ============================================================================
""" create train dataset. """

from functools import partial

import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C

@ -37,8 +36,8 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32):
dataset
"""

load_func = partial(de.Cifar10Dataset, dataset_path)
ds = load_func(num_parallel_workers=8, shuffle=False)
load_func = partial(ds.Cifar10Dataset, dataset_path)
data_set = load_func(num_parallel_workers=8, shuffle=False)

resize_height = config.image_height
resize_width = config.image_width

@ -54,15 +53,15 @@ def create_dataset(dataset_path, config, repeat_num=1, batch_size=32):

type_cast_op = C2.TypeCast(mstype.int32)

ds = ds.map(operations=c_trans, input_columns="image",
num_parallel_workers=8)
ds = ds.map(operations=type_cast_op,
input_columns="label", num_parallel_workers=8)
data_set = data_set.map(operations=c_trans, input_columns="image",
num_parallel_workers=8)
data_set = data_set.map(operations=type_cast_op,
input_columns="label", num_parallel_workers=8)

# apply batch operations
ds = ds.batch(batch_size, drop_remainder=True)
data_set = data_set.batch(batch_size, drop_remainder=True)

# apply dataset repeat operation
ds = ds.repeat(repeat_num)
data_set = data_set.repeat(repeat_num)

return ds
return data_set
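The CIFAR-10 variant above differs only in the source op and the partial() indirection. A condensed new-style sketch follows; the 224x224 resize stands in for config.image_height/image_width, and the transform chain is abbreviated, so treat it as an illustration of the alias pattern rather than the model_zoo script.

from functools import partial

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C

def create_cifar10_dataset(dataset_path, batch_size=32, repeat_num=1):
    """Sketch of the CIFAR-10 pipeline after the alias migration."""
    load_func = partial(ds.Cifar10Dataset, dataset_path)  # dataset_path is a placeholder
    data_set = load_func(num_parallel_workers=8, shuffle=False)

    c_trans = [C.Resize((224, 224)), C.HWC2CHW()]  # stand-in for the configured transforms
    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=c_trans, input_columns="image", num_parallel_workers=8)
    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set.repeat(repeat_num)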
@ -16,7 +16,7 @@
|
|||
Testing AutoContrast op in DE
|
||||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
|
@ -36,13 +36,13 @@ def test_auto_contrast_py(plot=False):
|
|||
logger.info("Test AutoContrast Python Op")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -55,7 +55,7 @@ def test_auto_contrast_py(plot=False):
|
|||
axis=0)
|
||||
|
||||
# AutoContrast Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_auto_contrast = \
|
||||
mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
|
@ -63,7 +63,7 @@ def test_auto_contrast_py(plot=False):
|
|||
F.AutoContrast(cutoff=10.0, ignore=[10, 20]),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_auto_contrast = ds.map(operations=transforms_auto_contrast, input_columns="image")
|
||||
ds_auto_contrast = data_set.map(operations=transforms_auto_contrast, input_columns="image")
|
||||
|
||||
ds_auto_contrast = ds_auto_contrast.batch(512)
|
||||
|
||||
|
@ -96,15 +96,15 @@ def test_auto_contrast_c(plot=False):
|
|||
logger.info("Test AutoContrast C Op")
|
||||
|
||||
# AutoContrast Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
python_op = F.AutoContrast(cutoff=10.0, ignore=[10, 20])
|
||||
c_op = C.AutoContrast(cutoff=10.0, ignore=[10, 20])
|
||||
transforms_op = mindspore.dataset.transforms.py_transforms.Compose([lambda img: F.ToPIL()(img.astype(np.uint8)),
|
||||
python_op,
|
||||
np.array])
|
||||
|
||||
ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image")
|
||||
ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image")
|
||||
|
||||
ds_auto_contrast_py = ds_auto_contrast_py.batch(512)
|
||||
|
||||
|
@ -116,10 +116,10 @@ def test_auto_contrast_c(plot=False):
|
|||
image.asnumpy(),
|
||||
axis=0)
|
||||
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
|
||||
ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image")
|
||||
ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image")
|
||||
|
||||
ds_auto_contrast_c = ds_auto_contrast_c.batch(512)
|
||||
|
||||
|
@ -153,8 +153,8 @@ def test_auto_contrast_one_channel_c(plot=False):
|
|||
logger.info("Test AutoContrast C Op With One Channel Images")
|
||||
|
||||
# AutoContrast Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
python_op = F.AutoContrast()
|
||||
c_op = C.AutoContrast()
|
||||
# not using F.ToTensor() since it converts to floats
|
||||
|
@ -164,7 +164,7 @@ def test_auto_contrast_one_channel_c(plot=False):
|
|||
python_op,
|
||||
np.array])
|
||||
|
||||
ds_auto_contrast_py = ds.map(operations=transforms_op, input_columns="image")
|
||||
ds_auto_contrast_py = data_set.map(operations=transforms_op, input_columns="image")
|
||||
|
||||
ds_auto_contrast_py = ds_auto_contrast_py.batch(512)
|
||||
|
||||
|
@ -176,11 +176,11 @@ def test_auto_contrast_one_channel_c(plot=False):
|
|||
image.asnumpy(),
|
||||
axis=0)
|
||||
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
|
||||
input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)), lambda img: np.array(img[:, :, 0])],
|
||||
input_columns=["image"])
|
||||
|
||||
ds_auto_contrast_c = ds.map(operations=c_op, input_columns="image")
|
||||
ds_auto_contrast_c = data_set.map(operations=c_op, input_columns="image")
|
||||
|
||||
ds_auto_contrast_c = ds_auto_contrast_c.batch(512)
|
||||
|
||||
|
@ -208,9 +208,9 @@ def test_auto_contrast_mnist_c(plot=False):
|
|||
Test AutoContrast C op with MNIST dataset (Grayscale images)
|
||||
"""
|
||||
logger.info("Test AutoContrast C Op With MNIST Images")
|
||||
ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
ds_auto_contrast_c = ds.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image")
|
||||
ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
ds_auto_contrast_c = data_set.map(operations=C.AutoContrast(cutoff=1, ignore=(0, 255)), input_columns="image")
|
||||
ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
|
||||
images = []
|
||||
images_trans = []
|
||||
|
@ -236,21 +236,21 @@ def test_auto_contrast_invalid_ignore_param_c():
|
|||
"""
|
||||
logger.info("Test AutoContrast C Op with invalid ignore parameter")
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
# invalid ignore
|
||||
ds = ds.map(operations=C.AutoContrast(ignore=255.5), input_columns="image")
|
||||
data_set = data_set.map(operations=C.AutoContrast(ignore=255.5), input_columns="image")
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Argument ignore with value 255.5 is not of type" in str(error)
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
# invalid ignore
|
||||
ds = ds.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image")
|
||||
data_set = data_set.map(operations=C.AutoContrast(ignore=(10, 100)), input_columns="image")
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Argument ignore with value (10,100) is not of type" in str(error)
|
||||
|
@ -262,22 +262,22 @@ def test_auto_contrast_invalid_cutoff_param_c():
|
|||
"""
|
||||
logger.info("Test AutoContrast C Op with invalid cutoff parameter")
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
# invalid ignore
|
||||
ds = ds.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image")
|
||||
data_set = data_set.map(operations=C.AutoContrast(cutoff=-10.0), input_columns="image")
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(),
|
||||
C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
# invalid ignore
|
||||
ds = ds.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image")
|
||||
data_set = data_set.map(operations=C.AutoContrast(cutoff=120.0), input_columns="image")
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
|
||||
|
@ -289,22 +289,24 @@ def test_auto_contrast_invalid_ignore_param_py():
|
|||
"""
|
||||
logger.info("Test AutoContrast python Op with invalid ignore parameter")
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(ignore=255.5),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(
|
||||
ignore=255.5),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Argument ignore with value 255.5 is not of type" in str(error)
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(ignore=(10, 100)),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(
|
||||
ignore=(10, 100)),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
except TypeError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Argument ignore with value (10,100) is not of type" in str(error)
|
||||
|
@ -316,18 +318,19 @@ def test_auto_contrast_invalid_cutoff_param_py():
|
|||
"""
|
||||
logger.info("Test AutoContrast python Op with invalid cutoff parameter")
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(cutoff=-10.0),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(
|
||||
cutoff=-10.0),
|
||||
F.ToTensor()])],
|
||||
input_columns=["image"])
|
||||
except ValueError as error:
|
||||
logger.info("Got an exception in DE: {}".format(str(error)))
|
||||
assert "Input cutoff is not within the required interval of (0 to 100)." in str(error)
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(
|
||||
operations=[mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.AutoContrast(cutoff=120.0),
|
||||
|
|
|
@ -17,7 +17,7 @@ Testing Equalize op in DE
|
|||
"""
|
||||
import numpy as np
|
||||
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
|
@ -37,13 +37,13 @@ def test_equalize_py(plot=False):
|
|||
logger.info("Test Equalize")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -56,14 +56,14 @@ def test_equalize_py(plot=False):
|
|||
axis=0)
|
||||
|
||||
# Color Equalized Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_equalize = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.Equalize(),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_equalize = ds.map(operations=transforms_equalize, input_columns="image")
|
||||
ds_equalize = data_set.map(operations=transforms_equalize, input_columns="image")
|
||||
|
||||
ds_equalize = ds_equalize.batch(512)
|
||||
|
||||
|
@ -92,11 +92,11 @@ def test_equalize_c(plot=False):
|
|||
logger.info("Test Equalize cpp op")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = [C.Decode(), C.Resize(size=[224, 224])]
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -109,12 +109,12 @@ def test_equalize_c(plot=False):
|
|||
axis=0)
|
||||
|
||||
# Equalize Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transform_equalize = [C.Decode(), C.Resize(size=[224, 224]),
|
||||
C.Equalize()]
|
||||
|
||||
ds_equalize = ds.map(operations=transform_equalize, input_columns="image")
|
||||
ds_equalize = data_set.map(operations=transform_equalize, input_columns="image")
|
||||
|
||||
ds_equalize = ds_equalize.batch(512)
|
||||
|
||||
|
@ -142,10 +142,10 @@ def test_equalize_py_c(plot=False):
|
|||
logger.info("Test Equalize cpp and python op")
|
||||
|
||||
# equalize Images in cpp
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
|
||||
ds_c_equalize = ds.map(operations=C.Equalize(), input_columns="image")
|
||||
ds_c_equalize = data_set.map(operations=C.Equalize(), input_columns="image")
|
||||
|
||||
ds_c_equalize = ds_c_equalize.batch(512)
|
||||
|
||||
|
@ -158,15 +158,15 @@ def test_equalize_py_c(plot=False):
|
|||
axis=0)
|
||||
|
||||
# Equalize images in python
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
|
||||
transforms_p_equalize = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8),
|
||||
F.ToPIL(),
|
||||
F.Equalize(),
|
||||
np.array])
|
||||
|
||||
ds_p_equalize = ds.map(operations=transforms_p_equalize, input_columns="image")
|
||||
ds_p_equalize = data_set.map(operations=transforms_p_equalize, input_columns="image")
|
||||
|
||||
ds_p_equalize = ds_p_equalize.batch(512)
|
||||
|
||||
|
@ -197,11 +197,11 @@ def test_equalize_one_channel():
|
|||
c_op = C.Equalize()
|
||||
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
|
||||
ds.map(operations=c_op, input_columns="image")
|
||||
data_set.map(operations=c_op, input_columns="image")
|
||||
|
||||
except RuntimeError as e:
|
||||
logger.info("Got an exception in DE: {}".format(str(e)))
|
||||
|
@ -213,9 +213,9 @@ def test_equalize_mnist_c(plot=False):
|
|||
Test Equalize C op with MNIST dataset (Grayscale images)
|
||||
"""
|
||||
logger.info("Test Equalize C Op With MNIST Images")
|
||||
ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
ds_equalize_c = ds.map(operations=C.Equalize(), input_columns="image")
|
||||
ds_orig = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
data_set = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
ds_equalize_c = data_set.map(operations=C.Equalize(), input_columns="image")
|
||||
ds_orig = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
|
||||
images = []
|
||||
images_trans = []
|
||||
|
@ -242,7 +242,7 @@ def test_equalize_md5_py():
|
|||
logger.info("Test Equalize")
|
||||
|
||||
# First dataset
|
||||
data1 = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data1 = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Equalize(),
|
||||
F.ToTensor()])
|
||||
|
@ -260,14 +260,14 @@ def test_equalize_md5_c():
|
|||
logger.info("Test Equalize cpp op with md5 check")
|
||||
|
||||
# Generate dataset
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_equalize = [C.Decode(),
|
||||
C.Resize(size=[224, 224]),
|
||||
C.Equalize(),
|
||||
F.ToTensor()]
|
||||
|
||||
data = ds.map(operations=transforms_equalize, input_columns="image")
|
||||
data = data_set.map(operations=transforms_equalize, input_columns="image")
|
||||
# Compare with expected md5 from images
|
||||
filename = "equalize_01_result_c.npz"
|
||||
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)
|
||||
|
|
|
@ -17,7 +17,7 @@ Testing Invert op in DE
|
|||
"""
|
||||
import numpy as np
|
||||
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
|
@ -36,13 +36,13 @@ def test_invert_py(plot=False):
|
|||
logger.info("Test Invert Python op")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -55,14 +55,14 @@ def test_invert_py(plot=False):
|
|||
axis=0)
|
||||
|
||||
# Color Inverted Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.Invert(),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_invert = ds.map(operations=transforms_invert, input_columns="image")
|
||||
ds_invert = data_set.map(operations=transforms_invert, input_columns="image")
|
||||
|
||||
ds_invert = ds_invert.batch(512)
|
||||
|
||||
|
@ -91,11 +91,11 @@ def test_invert_c(plot=False):
|
|||
logger.info("Test Invert cpp op")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = [C.Decode(), C.Resize(size=[224, 224])]
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -108,12 +108,12 @@ def test_invert_c(plot=False):
|
|||
axis=0)
|
||||
|
||||
# Invert Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transform_invert = [C.Decode(), C.Resize(size=[224, 224]),
|
||||
C.Invert()]
|
||||
|
||||
ds_invert = ds.map(operations=transform_invert, input_columns="image")
|
||||
ds_invert = data_set.map(operations=transform_invert, input_columns="image")
|
||||
|
||||
ds_invert = ds_invert.batch(512)
|
||||
|
||||
|
@ -141,10 +141,10 @@ def test_invert_py_c(plot=False):
|
|||
logger.info("Test Invert cpp and python op")
|
||||
|
||||
# Invert Images in cpp
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
|
||||
ds_c_invert = ds.map(operations=C.Invert(), input_columns="image")
|
||||
ds_c_invert = data_set.map(operations=C.Invert(), input_columns="image")
|
||||
|
||||
ds_c_invert = ds_c_invert.batch(512)
|
||||
|
||||
|
@ -157,15 +157,15 @@ def test_invert_py_c(plot=False):
|
|||
axis=0)
|
||||
|
||||
# invert images in python
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224))], input_columns=["image"])
|
||||
|
||||
transforms_p_invert = mindspore.dataset.transforms.py_transforms.Compose([lambda img: img.astype(np.uint8),
|
||||
F.ToPIL(),
|
||||
F.Invert(),
|
||||
np.array])
|
||||
|
||||
ds_p_invert = ds.map(operations=transforms_p_invert, input_columns="image")
|
||||
ds_p_invert = data_set.map(operations=transforms_p_invert, input_columns="image")
|
||||
|
||||
ds_p_invert = ds_p_invert.batch(512)
|
||||
|
||||
|
@ -196,11 +196,11 @@ def test_invert_one_channel():
|
|||
c_op = C.Invert()
|
||||
|
||||
try:
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
ds = ds.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = data_set.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
lambda img: np.array(img[:, :, 0])], input_columns=["image"])
|
||||
|
||||
ds.map(operations=c_op, input_columns="image")
|
||||
data_set.map(operations=c_op, input_columns="image")
|
||||
|
||||
except RuntimeError as e:
|
||||
logger.info("Got an exception in DE: {}".format(str(e)))
|
||||
|
@ -214,13 +214,13 @@ def test_invert_md5_py():
|
|||
logger.info("Test Invert python op with md5 check")
|
||||
|
||||
# Generate dataset
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_invert = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Invert(),
|
||||
F.ToTensor()])
|
||||
|
||||
data = ds.map(operations=transforms_invert, input_columns="image")
|
||||
data = data_set.map(operations=transforms_invert, input_columns="image")
|
||||
# Compare with expected md5 from images
|
||||
filename = "invert_01_result_py.npz"
|
||||
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)
|
||||
|
@ -233,14 +233,14 @@ def test_invert_md5_c():
|
|||
logger.info("Test Invert cpp op with md5 check")
|
||||
|
||||
# Generate dataset
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_invert = [C.Decode(),
|
||||
C.Resize(size=[224, 224]),
|
||||
C.Invert(),
|
||||
F.ToTensor()]
|
||||
|
||||
data = ds.map(operations=transforms_invert, input_columns="image")
|
||||
data = data_set.map(operations=transforms_invert, input_columns="image")
|
||||
# Compare with expected md5 from images
|
||||
filename = "invert_01_result_c.npz"
|
||||
save_and_check_md5(data, filename, generate_golden=GENERATE_GOLDEN)
|
||||
|
|
|
@ -19,7 +19,6 @@ import numpy as np
|
|||
import pytest
|
||||
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.c_transforms as vision
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
|
@ -44,7 +43,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False):
|
|||
logger.info("Test RandomColor")
|
||||
|
||||
# Original Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
|
@ -63,7 +62,7 @@ def test_random_color_py(degrees=(0.1, 1.9), plot=False):
|
|||
axis=0)
|
||||
|
||||
# Random Color Adjusted Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_random_color = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
|
@ -146,7 +145,7 @@ def test_random_color_py_md5():
|
|||
original_num_parallel_workers = config_get_set_num_parallel_workers(1)
|
||||
|
||||
# Generate dataset
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.RandomColor((2.0, 2.5)),
|
||||
|
@ -234,7 +233,7 @@ def test_random_color_c_errors():
|
|||
assert "degrees must be a sequence with length 2." in str(error_info.value)
|
||||
|
||||
# RandomColor Cpp Op will fail with one channel input
|
||||
mnist_ds = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_ds = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_ds = mnist_ds.map(operations=vision.RandomColor(), input_columns="image")
|
||||
|
||||
with pytest.raises(RuntimeError) as error_info:
|
||||
|
|
|
@ -17,7 +17,6 @@ Testing RandomSharpness op in DE
|
|||
"""
|
||||
import numpy as np
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
|
@ -38,7 +37,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False):
|
|||
logger.info("Test RandomSharpness python op")
|
||||
|
||||
# Original Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
|
@ -57,7 +56,7 @@ def test_random_sharpness_py(degrees=(0.7, 0.7), plot=False):
|
|||
axis=0)
|
||||
|
||||
# Random Sharpness Adjusted Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
py_op = F.RandomSharpness()
|
||||
if degrees is not None:
|
||||
|
@ -108,7 +107,7 @@ def test_random_sharpness_py_md5():
|
|||
transform = mindspore.dataset.transforms.py_transforms.Compose(transforms)
|
||||
|
||||
# Generate dataset
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=transform, input_columns=["image"])
|
||||
|
||||
# check results with md5 comparison
|
||||
|
@ -128,7 +127,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False):
|
|||
logger.info("Test RandomSharpness cpp op")
|
||||
|
||||
# Original Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = [C.Decode(),
|
||||
C.Resize((224, 224))]
|
||||
|
@ -146,7 +145,7 @@ def test_random_sharpness_c(degrees=(1.6, 1.6), plot=False):
|
|||
axis=0)
|
||||
|
||||
# Random Sharpness Adjusted Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
c_op = C.RandomSharpness()
|
||||
if degrees is not None:
|
||||
|
@ -194,7 +193,7 @@ def test_random_sharpness_c_md5():
|
|||
]
|
||||
|
||||
# Generate dataset
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=transforms, input_columns=["image"])
|
||||
|
||||
# check results with md5 comparison
|
||||
|
@ -213,7 +212,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False):
|
|||
logger.info("Test RandomSharpness C and python Op")
|
||||
|
||||
# RandomSharpness Images
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"])
|
||||
|
||||
python_op = F.RandomSharpness(degrees)
|
||||
|
@ -236,7 +235,7 @@ def test_random_sharpness_c_py(degrees=(1.0, 1.0), plot=False):
|
|||
image,
|
||||
axis=0)
|
||||
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=[C.Decode(), C.Resize((200, 300))], input_columns=["image"])
|
||||
|
||||
ds_images_random_sharpness_c = data.map(operations=c_op, input_columns="image")
|
||||
|
@ -271,10 +270,10 @@ def test_random_sharpness_one_channel_c(degrees=(1.4, 1.4), plot=False):
|
|||
if degrees is not None:
|
||||
c_op = C.RandomSharpness(degrees)
|
||||
# RandomSharpness Images
|
||||
data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
ds_random_sharpness_c = data.map(operations=c_op, input_columns="image")
|
||||
# Original images
|
||||
data = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
data = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
|
||||
images = []
|
||||
images_trans = []
|
||||
|
@ -296,7 +295,7 @@ def test_random_sharpness_invalid_params():
|
|||
"""
|
||||
logger.info("Test RandomSharpness with invalid input parameters.")
|
||||
try:
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
C.RandomSharpness(10)], input_columns=["image"])
|
||||
except TypeError as error:
|
||||
|
@ -304,7 +303,7 @@ def test_random_sharpness_invalid_params():
|
|||
assert "tuple" in str(error)
|
||||
|
||||
try:
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
C.RandomSharpness((-10, 10))], input_columns=["image"])
|
||||
except ValueError as error:
|
||||
|
@ -312,7 +311,7 @@ def test_random_sharpness_invalid_params():
|
|||
assert "interval" in str(error)
|
||||
|
||||
try:
|
||||
data = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data = data.map(operations=[C.Decode(), C.Resize((224, 224)),
|
||||
C.RandomSharpness((10, 5))], input_columns=["image"])
|
||||
except ValueError as error:
|
||||
|
|
|
@ -17,7 +17,6 @@ Testing RandomSolarizeOp op in DE
|
|||
"""
|
||||
import pytest
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset.vision.c_transforms as vision
|
||||
from mindspore import log as logger
|
||||
from util import visualize_list, save_and_check_md5, config_get_set_seed, config_get_set_num_parallel_workers, \
|
||||
|
@ -78,8 +77,8 @@ def test_random_solarize_mnist(plot=False, run_golden=True):
|
|||
Test RandomSolarize op with MNIST dataset (Grayscale images)
|
||||
"""
|
||||
|
||||
mnist_1 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_2 = de.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=2, shuffle=False)
|
||||
mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image")
|
||||
|
||||
images = []
|
||||
|
|
|
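The vision-op tests in this part of the diff all follow the same recipe: build two small datasets from the same source, apply the op under test to one of them, and compare the outputs. A hedged sketch of that pattern with the new-style import is shown below; the MNIST directory is a placeholder (the tests use their own fixture path), and create_dict_iterator is used here only as an assumed convenience for pulling the images out in plain Python.

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as vision

MNIST_DATA_DIR = "/path/to/mnist"  # placeholder, not the repository fixture path

def solarized_pairs(num_samples=2):
    """Yield (original, solarized) image pairs, mirroring the test pattern above."""
    mnist_1 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=num_samples, shuffle=False)
    mnist_2 = ds.MnistDataset(dataset_dir=MNIST_DATA_DIR, num_samples=num_samples, shuffle=False)
    # Apply the op under test to the second copy only.
    mnist_2 = mnist_2.map(operations=vision.RandomSolarize((0, 255)), input_columns="image")
    for original, solarized in zip(mnist_1.create_dict_iterator(), mnist_2.create_dict_iterator()):
        yield original["image"], solarized["image"]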
@ -18,7 +18,7 @@ Testing UniformAugment in DE
|
|||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import mindspore.dataset.engine as de
|
||||
import mindspore.dataset as ds
|
||||
import mindspore.dataset.transforms.py_transforms
|
||||
import mindspore.dataset.vision.c_transforms as C
|
||||
import mindspore.dataset.vision.py_transforms as F
|
||||
|
@ -35,13 +35,13 @@ def test_uniform_augment(plot=False, num_ops=2):
|
|||
logger.info("Test UniformAugment")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = mindspore.dataset.transforms.py_transforms.Compose([F.Decode(),
|
||||
F.Resize((224, 224)),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -54,7 +54,7 @@ def test_uniform_augment(plot=False, num_ops=2):
|
|||
axis=0)
|
||||
|
||||
# UniformAugment Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transform_list = [F.RandomRotation(45),
|
||||
F.RandomColor(),
|
||||
|
@ -70,7 +70,7 @@ def test_uniform_augment(plot=False, num_ops=2):
|
|||
num_ops=num_ops),
|
||||
F.ToTensor()])
|
||||
|
||||
ds_ua = ds.map(operations=transforms_ua, input_columns="image")
|
||||
ds_ua = data_set.map(operations=transforms_ua, input_columns="image")
|
||||
|
||||
ds_ua = ds_ua.batch(512)
|
||||
|
||||
|
@ -99,12 +99,12 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
|
|||
logger.info("Test CPP UniformAugment")
|
||||
|
||||
# Original Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
|
||||
transforms_original = [C.Decode(), C.Resize(size=[224, 224]),
|
||||
F.ToTensor()]
|
||||
|
||||
ds_original = ds.map(operations=transforms_original, input_columns="image")
|
||||
ds_original = data_set.map(operations=transforms_original, input_columns="image")
|
||||
|
||||
ds_original = ds_original.batch(512)
|
||||
|
||||
|
@ -117,7 +117,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
|
|||
axis=0)
|
||||
|
||||
# UniformAugment Images
|
||||
ds = de.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
data_set = ds.ImageFolderDataset(dataset_dir=DATA_DIR, shuffle=False)
|
||||
transforms_ua = [C.RandomCrop(size=[224, 224], padding=[32, 32, 32, 32]),
|
||||
C.RandomHorizontalFlip(),
|
||||
C.RandomVerticalFlip(),
|
||||
|
@ -130,7 +130,7 @@ def test_cpp_uniform_augment(plot=False, num_ops=2):
|
|||
uni_aug,
|
||||
F.ToTensor()]
|
||||
|
||||
ds_ua = ds.map(operations=transforms_all, input_columns="image", num_parallel_workers=1)
|
||||
ds_ua = data_set.map(operations=transforms_all, input_columns="image", num_parallel_workers=1)
|
||||
|
||||
ds_ua = ds_ua.batch(512)
|
||||
|
||||
|
@ -240,7 +240,7 @@ def test_cpp_uniform_augment_random_crop_badinput(num_ops=1):
|
|||
logger.info("Test CPP UniformAugment with random_crop bad input")
|
||||
batch_size = 2
|
||||
cifar10_dir = "../data/dataset/testCifar10Data"
|
||||
ds1 = de.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3]
|
||||
ds1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False) # shape = [32,32,3]
|
||||
|
||||
transforms_ua = [
|
||||
# Note: crop size [224, 224] > image size [32, 32]
|
||||
|
|