Fix error info bug and multiprocessing shared memory error

Zhenglong Li 2021-06-30 18:15:46 +08:00
parent bde38a582c
commit 7e26677534
7 changed files with 41 additions and 30 deletions

View File

@ -39,7 +39,7 @@ namespace vision {
class DvppDecodeResizeJpeg final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] resize A vector of int value for each dimension, with respect to H,W order.
/// \param[in] resize A vector of two integers, one for each dimension, with respect to H,W order.
explicit DvppDecodeResizeJpeg(std::vector<uint32_t> resize);
/// \brief Destructor.
@ -62,8 +62,8 @@ class DvppDecodeResizeJpeg final : public TensorTransform {
class DvppDecodeResizeCropJpeg final : public TensorTransform {
public:
/// \brief Constructor.
/// \param[in] crop A vector of int value for each dimension after final cropping, with respect to H,W order.
/// \param[in] resize A vector of int value for each dimension after resizing, with respect to H,W order.
/// \param[in] crop A vector of two integers, one for each dimension after the final crop, with respect to H,W order.
/// \param[in] resize A vector of two integers, one for each dimension after resizing, with respect to H,W order.
explicit DvppDecodeResizeCropJpeg(std::vector<uint32_t> crop, std::vector<uint32_t> resize);
/// \brief Destructor.
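
Both constructors above take two-element vectors in H, W order. A hypothetical Python sketch only, assuming the bindings in mindspore.dataset.vision.c_transforms expose the same constructors (module path and size values are assumptions, not part of this change):

import mindspore.dataset.vision.c_transforms as vision

# Each argument is a two-element vector in H, W order (sizes are illustrative).
resize_op = vision.DvppDecodeResizeJpeg([256, 256])
# First the final crop size, then the intermediate resize size.
crop_resize_op = vision.DvppDecodeResizeCropJpeg([224, 224], [256, 256])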

View File

@ -18,6 +18,7 @@ General Validators.
import inspect
from multiprocessing import cpu_count
import os
from pickle import dumps
import numpy as np
import mindspore._c_dataengine as cde
@ -62,6 +63,23 @@ def is_iterable(obj):
    return True


def is_serializable(obj):
    """
    Helper function to check if object is serializable.

    Args:
        obj (any): object to check if serializable

    Returns:
        bool, true if object is serializable
    """
    try:
        dumps(obj)
    except TypeError:
        return False
    return True


def pad_arg_name(arg_name):
    """
    Appends a space to the arg_name (if not empty)
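
Stripped of its surrounding module, the new helper simply round-trips the object through pickle. A minimal standalone sketch of the same check and the behaviour it relies on:

from pickle import dumps

import numpy as np


def is_serializable(obj):
    """Return False only when pickling obj raises TypeError."""
    try:
        dumps(obj)
    except TypeError:
        return False
    return True


print(is_serializable(np.zeros((2, 2))))       # True: ndarrays pickle cleanly
print(is_serializable((i for i in range(3))))  # False: generators raise TypeError when pickled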

View File

@ -448,13 +448,9 @@ class Dataset:
len(output_columns). The size of this list must match the number of output
columns of the last operation. (default=None, output columns will have the same
name as the input columns, i.e., the columns will be replaced).
column_order (Union[str, list[str]], optional): List of all the desired columns to propagate to
the child node. This list must be a permutation of all the columns in the dataset after
all operations are applied. The order of the columns in each row propagated to the
child node follow the order they appear in this list. The parameter is mandatory
if the len(input_columns) != len(output_columns). (default=None, all columns
will be propagated to the child node, the order of the columns will remain the
same).
column_order (Union[str, list[str]], optional): Specifies the list of all the columns you need in the whole
dataset. The parameter is required when len(input_columns) != len(output_columns). Caution: the list here
is not limited to the columns specified in the input_columns and output_columns parameters.
pad_info (dict, optional): Whether to perform padding on selected columns. pad_info={"col1":([224,224],0)}
would pad column with name "col1" to a tensor of size [224,224] and fill the missing with 0.
python_multiprocessing (bool, optional): Parallelize Python function per_batch_map with multi-processing.
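
For readers of the new wording: whenever an operation changes the number of columns (per_batch_map here, or map() in the next hunk), column_order must list every column the downstream node should receive, not only the ones named in input_columns and output_columns. A small illustrative sketch using map (dataset and column names are assumptions, not taken from this commit):

import mindspore.dataset as ds


def split_col(col1):
    # One input column becomes two output columns.
    return col1, col1 * 2


data = ds.NumpySlicesDataset({"col1": [1, 2, 3]}, shuffle=False)
# len(input_columns) != len(output_columns), so column_order is required and
# must name every column to keep in the whole dataset.
data = data.map(operations=split_col,
                input_columns=["col1"],
                output_columns=["col1", "col2"],
                column_order=["col1", "col2"])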
@ -645,13 +641,9 @@ class Dataset:
len(output_columns). The size of this list must match the number of output
columns of the last operation. (default=None, output columns will have the same
name as the input columns, i.e., the columns will be replaced).
column_order (list[str], optional): List of all the desired columns to propagate to the
child node. This list must be a subset of all the columns in the dataset after
all operations are applied. The order of the columns in each row propagated to the
child node follow the order they appear in this list. The parameter is mandatory
if the len(input_columns) != len(output_columns). (default=None, all columns
will be propagated to the child node, the order of the columns will remain the
same).
column_order (list[str], optional): Specifies the list of all the columns you need in the whole
dataset. The parameter is required when len(input_columns) != len(output_columns). Caution: the list here
is not limited to the columns specified in the input_columns and output_columns parameters.
num_parallel_workers (int, optional): Number of threads used to process the dataset in
parallel (default=None, the value from the configuration will be used).
python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker processes. This
@ -782,7 +774,7 @@ class Dataset:
@check_repeat
def repeat(self, count=None):
"""
Repeat this dataset N times where N = count. Repeat stochastically if the count is None or -1.
Repeat this dataset N times where N = count. Repeat infinitely if the count is None or -1.
Note:
The order of using repeat and batch reflects the number of batches. It is recommended that
@ -2069,13 +2061,9 @@ class BatchDataset(Dataset):
len(output_columns). The size of this list must match the number of output
columns of the last operation. (default=None, output columns will have the same
name as the input columns, i.e., the columns will be replaced).
column_order (Union[str, list[str]], optional): List of all the desired columns to propagate to the
child node. This list must be a subset of all the columns in the dataset after
all operations are applied. The order of the columns in each row propagated to the
child node follow the order they appear in this list. The parameter is mandatory
if the len(input_columns) != len(output_columns). (default=None, all columns
will be propagated to the child node, the order of the columns will remain the
same).
column_order (Union[str, list[str]], optional): Specifies the list of all the columns you need in the whole
dataset. The parameter is required when len(input_columns) != len(output_columns). Caution: the list here
is not limited to the columns specified in the input_columns and output_columns parameters.
pad_info (dict, optional): Whether to perform padding on selected columns. pad_info={"col1":([224,224],0)}
will pad column with name "col1" to a tensor of size [224,224] and fill the missing with 0.
max_rowsize(int, optional): Maximum size of row in MB that is used for shared memory allocation to copy
@ -2558,8 +2546,9 @@ class MapDataset(Dataset):
The size of the list should match the number of outputs of the last operator
(default=None, output columns will be the input columns, i.e., the columns will
be replaced).
column_order (list[str], optional): List of all the desired columns of the dataset (default=None).
The argument is mandatory if len(input_columns) != len(output_columns).
column_order (list[str], optional): Specifies the list of all the columns you need in the whole
dataset. The parameter is required when len(input_columns) != len(output_columns). Caution: the list here
is not limited to the columns specified in the input_columns and output_columns parameters.
num_parallel_workers (int, optional): Number of workers to process the dataset
in parallel (default=None).
python_multiprocessing (bool, optional): Parallelize Python operations with multiple worker process. This

View File

@ -293,7 +293,7 @@ class GraphData:
node_list (Union[list, numpy.ndarray]): The given list of nodes.
neighbor_type (int): Specify the type of neighbor.
output_format (OutputFormat, optional): Output storage format (default=OutputFormat.NORMAL)
It can be any of [OutputFormat.NORMAL, OutputFormat.COO, OutputFormat.CSR].
It can be any of [OutputFormat.NORMAL, OutputFormat.COO, OutputFormat.CSR].
Returns:
For NORMAL format or COO format

View File

@ -22,6 +22,7 @@ import multiprocessing.queues
import multiprocessing
import numpy as np
from mindspore import log as logger
from ..core.validator_helpers import is_serializable
from ..transforms.py_transforms_util import ExceptionHandler
@ -75,6 +76,9 @@ class _SharedQueue(multiprocessing.queues.Queue):
        count = 0
        start_bytes = 0
        for r in data:
            if not is_serializable(obj=r):
                raise TypeError("Cannot pickle {} object, please verify that the pyfunc "
                                "returns numpy arrays.".format(type(r)))
            if (isinstance(r, np.ndarray) and r.size > self.min_shared_mem
                    and start_bytes + r.nbytes < self.seg_size):
                # need to convert start_bytes to offset in array
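
With this guard, a pyfunc that returns something pickle cannot handle now fails with an explicit TypeError instead of an opaque shared-memory error inside the worker process. A hedged sketch of the failure mode (dataset and pyfunc are illustrative, not from the commit):

import mindspore.dataset as ds


def bad_pyfunc(col1):
    # Returning a generator instead of a numpy array: pickle raises TypeError,
    # so the result cannot cross the worker's shared queue.
    return (x for x in range(3))


data = ds.NumpySlicesDataset({"col1": [1, 2, 3]}, shuffle=False)
data = data.map(operations=bad_pyfunc, input_columns=["col1"],
                num_parallel_workers=2, python_multiprocessing=True)

for _ in data.create_dict_iterator():  # surfaces the TypeError added above
    pass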

View File

@ -40,7 +40,7 @@ def serialize(dataset, json_filepath=""):
dict containing the serialized dataset graph.
Raises:
OSError cannot open a file
OSError: Cannot open a file.
Examples:
>>> dataset = ds.MnistDataset(mnist_dataset_dir, 100)
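
For context, the documented OSError comes from writing the JSON file. A brief illustrative round trip (paths are placeholders):

import mindspore.dataset as ds

mnist_dataset_dir = "/path/to/mnist_dataset_directory"  # placeholder path
dataset = ds.MnistDataset(mnist_dataset_dir, num_samples=100)
# An unwritable json_filepath is what raises the OSError documented above.
ds.serialize(dataset, json_filepath="./mnist_pipeline.json")
restored = ds.deserialize(json_filepath="./mnist_pipeline.json")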

View File

@ -682,7 +682,7 @@ def check_repeat(method):
        type_check(count, (int, type(None)), "repeat")
        if isinstance(count, int):
            if (count <= 0 and count != -1) or count > INT32_MAX:
                raise ValueError("count should be either -1 or positive integer.")
                raise ValueError("count should be either -1 or a positive integer within the range [1, INT32_MAX].")
        return method(self, *args, **kwargs)

    return new_method
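
A quick sketch of what the validator accepts and rejects after this change (dataset construction is illustrative):

import mindspore.dataset as ds

data = ds.NumpySlicesDataset({"col1": [1, 2, 3]}, shuffle=False)

data.repeat(3)      # OK: positive count within [1, INT32_MAX]
data.repeat(-1)     # OK: same as repeat(None), repeat infinitely
data.repeat(0)      # ValueError from the check above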