forked from mindspore-Ecosystem/mindspore
Adding offload hardware accelerator.
This commit is contained in:
parent 14efcd5a1c
commit 22e03ab12e
@@ -38,6 +38,7 @@ PYBIND_REGISTER(PythonIteratorConsumer, 1, ([](const py::module *m) {
THROW_IF_ERROR(self.GetNextAsDict(&output));
return output;
})
.def("GetOffload", [](PythonIteratorConsumer &self) { return self.GetOffload(); })
.def("GetNextAsList", [](PythonIteratorConsumer &self) {
py::list output;
THROW_IF_ERROR(self.GetNextAsList(&output));

@@ -123,6 +124,7 @@ PYBIND_REGISTER(ToDevice, 1, ([](const py::module *m) {
.def("Send", [](ToDevice &self) { THROW_IF_ERROR(self.Send()); })
.def("ContinueSend", [](ToDevice &self) { THROW_IF_ERROR(self.Continue()); })
.def("StopSend", [](ToDevice &self) { THROW_IF_ERROR(self.Stop()); })
.def("GetOffload", [](ToDevice &self) { return self.GetOffload(); })
.def("GetDataInfo",
[](ToDevice &self) {
std::vector<DataType> types_c;

@@ -170,6 +172,5 @@ PYBIND_REGISTER(PythonDatasetSizeGetter, 1, ([](const py::module *m) {
return size;
});
}));

} // namespace dataset
} // namespace mindspore
@@ -51,7 +51,6 @@

namespace mindspore {
namespace dataset {

PYBIND_REGISTER(DatasetNode, 1, ([](const py::module *m) {
(void)py::class_<DatasetNode, std::shared_ptr<DatasetNode>>(*m, "Dataset")
.def("set_num_workers",

@@ -193,11 +192,12 @@ PYBIND_REGISTER(MapNode, 2, ([](const py::module *m) {
(void)py::class_<MapNode, DatasetNode, std::shared_ptr<MapNode>>(*m, "MapNode", "to create a MapNode")
.def(py::init([](std::shared_ptr<DatasetNode> self, py::list operations, py::list input_columns,
py::list output_columns, py::list project_columns,
std::vector<std::shared_ptr<PyDSCallback>> py_callbacks) {
std::vector<std::shared_ptr<PyDSCallback>> py_callbacks, int64_t max_rowsize,
bool offload) {
auto map = std::make_shared<MapNode>(
self, std::move(toTensorOperations(operations)), toStringVector(input_columns),
toStringVector(output_columns), toStringVector(project_columns), nullptr,
std::vector<std::shared_ptr<DSCallback>>(py_callbacks.begin(), py_callbacks.end()));
std::vector<std::shared_ptr<DSCallback>>(py_callbacks.begin(), py_callbacks.end()), offload);
THROW_IF_ERROR(map->ValidateParams());
return map;
}));

@@ -297,6 +297,5 @@ PYBIND_REGISTER(ZipNode, 2, ([](const py::module *m) {
return zip;
}));
}));

} // namespace dataset
} // namespace mindspore
@@ -98,6 +98,8 @@ Status TreeConsumer::RegisterProfilingManager() {
}
#endif

std::string TreeConsumer::GetOffload() { return (tree_adapter_->GetOffloadJson()).dump(); }

// IteratorConsumer
Status IteratorConsumer::Init(std::shared_ptr<DatasetNode> d) {
RETURN_IF_NOT_OK(tree_adapter_->Compile(std::move(d), num_epochs_));

@@ -50,6 +50,10 @@ class TreeConsumer {
/// \return Status error code
virtual Status Terminate();

/// Function for all consumers to get the offload JSON string.
/// \return Offload JSON string.
std::string GetOffload();

#ifndef ENABLE_SECURITY
virtual Status RegisterProfilingManager();
@@ -35,20 +35,21 @@ namespace dataset {
MapNode::MapNode(std::shared_ptr<DatasetNode> child, std::vector<std::shared_ptr<TensorOperation>> operations,
std::vector<std::string> input_columns, std::vector<std::string> output_columns,
const std::vector<std::string> &project_columns, std::shared_ptr<DatasetCache> cache,
std::vector<std::shared_ptr<DSCallback>> callbacks)
std::vector<std::shared_ptr<DSCallback>> callbacks, bool offload)
: operations_(operations),
input_columns_(input_columns),
output_columns_(output_columns),
project_columns_(project_columns),
DatasetNode(std::move(cache)),
callbacks_(callbacks) {
callbacks_(callbacks),
offload_(offload) {
this->AddChild(child);
}

std::shared_ptr<DatasetNode> MapNode::Copy() {
std::vector<std::shared_ptr<TensorOperation>> operations = operations_;
auto node = std::make_shared<MapNode>(nullptr, operations, input_columns_, output_columns_, project_columns_, cache_,
callbacks_);
callbacks_, offload_);
return node;
}

@@ -151,6 +152,8 @@ void MapNode::setOperations(const std::vector<std::shared_ptr<TensorOperation>>
}
std::vector<std::shared_ptr<TensorOperation>> MapNode::operations() { return operations_; }

void MapNode::SetOffload(bool offload) { offload_ = offload; }

Status MapNode::to_json(nlohmann::json *out_json) {
RETURN_UNEXPECTED_IF_NULL(out_json);
nlohmann::json args;

@@ -182,6 +185,7 @@ Status MapNode::to_json(nlohmann::json *out_json) {
(void)std::transform(callbacks_.begin(), callbacks_.end(), std::back_inserter(cbs),
[](std::shared_ptr<DSCallback> cb) -> int32_t { return cb != nullptr ? cb->step_size() : 0; });
args["callback"] = cbs;

*out_json = args;
return Status::OK();
}
@@ -32,7 +32,7 @@ class MapNode : public DatasetNode {
MapNode(std::shared_ptr<DatasetNode> child, std::vector<std::shared_ptr<TensorOperation>> operations,
std::vector<std::string> input_columns = {}, std::vector<std::string> output_columns = {},
const std::vector<std::string> &columns = {}, std::shared_ptr<DatasetCache> cache = nullptr,
std::vector<std::shared_ptr<DSCallback>> callbacks = {});
std::vector<std::shared_ptr<DSCallback>> callbacks = {}, bool offload = false);

/// \brief Destructor
~MapNode() = default;

@@ -87,6 +87,10 @@ class MapNode : public DatasetNode {
const std::vector<std::string> &OutputColumns() const { return output_columns_; }
const std::vector<std::string> &ProjectColumns() const { return project_columns_; }
const std::vector<std::shared_ptr<DSCallback>> &Callbacks() const { return callbacks_; }
bool GetOffload() const { return offload_; }

/// \brief Setter to set the offload flag of the node
void SetOffload(bool offload);

/// \brief Get the arguments of node
/// \param[out] out_json JSON string of all attributes

@@ -118,6 +122,9 @@ class MapNode : public DatasetNode {
std::vector<std::string> output_columns_;
std::vector<std::string> project_columns_;
std::vector<std::shared_ptr<DSCallback>> callbacks_;

/// \brief Flag to indicate whether offload is set for the Map node.
bool offload_;
};

} // namespace dataset
@@ -12,6 +12,7 @@ set(DATASET_ENGINE_OPT_SRC_FILES
pre/epoch_ctrl_pass.cc
pre/getter_pass.cc
pre/input_validation_pass.cc
pre/node_offload_pass.cc
pre/node_removal_pass.cc
)
@@ -0,0 +1,73 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "minddata/dataset/engine/opt/pre/node_offload_pass.h"
#include "minddata/dataset/engine/ir/datasetops/map_node.h"
#include "minddata/dataset/engine/ir/datasetops/batch_node.h"

namespace mindspore {
namespace dataset {
NodeOffloadPass::OffloadNodes::OffloadNodes() : prev_map_offloaded_(true) {}

// Perform MapNode offload check.
Status NodeOffloadPass::OffloadNodes::Visit(std::shared_ptr<MapNode> node, bool *const modified) {
  *modified = false;
  // Check if this node is set to offload and add to nodes_to_offload_.
  if (node->GetOffload() == true) {
    MS_LOG(INFO) << "Pre pass: node offload of map class is true.";
    if (prev_map_offloaded_) {
      nodes_to_offload_.push_back(std::static_pointer_cast<DatasetNode>(node));
    } else {
      MS_LOG(WARNING) << "Invalid use of offload in map, ignoring offload flag. Ops will be run in CPU pipeline";
      node->SetOffload(false);
      *modified = true;
    }
  } else {
    // Since map nodes are visited in reverse order, no other map ops can be offloaded after this.
    prev_map_offloaded_ = false;
  }
  return Status::OK();
}

// constructor
NodeOffloadPass::NodeOffloadPass() {}

// Walk the tree to collect the nodes to offload, fill the offload_json object, then remove the node.
Status NodeOffloadPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *const modified) {
  MS_LOG(INFO) << "Pre pass: node offload pass started.";
  // Create the offload node pass which can identify which nodes need to be offloaded.
  std::unique_ptr<NodeOffloadPass::OffloadNodes> offload_nodes = std::make_unique<NodeOffloadPass::OffloadNodes>();
  RETURN_IF_NOT_OK(offload_nodes->Run(root_ir, modified));

  // Update modified flag if there were any nodes identified to be offloaded
  if (offload_nodes->nodes_to_offload().empty() == false) {
    *modified = true;
  }

  // Then, execute the offloading of any nodes that were set up to be offloaded
  for (auto node : offload_nodes->nodes_to_offload()) {
    RETURN_IF_NOT_OK(node->to_json(&offload_json_));
    offload_json_["op_type"] = node->Name();

    // Add the single offloaded node to the list of offloaded nodes and remove the node from the ir tree
    offload_json_list_.push_back(offload_json_);
    RETURN_IF_NOT_OK(node->Drop());
  }
  MS_LOG(INFO) << "Pre pass: offload node removal pass complete.";
  return Status::OK();
}
} // namespace dataset
} // namespace mindspore
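A consequence of the check above: because OffloadNodes visits map nodes in reverse pipeline order with prev_map_offloaded_ initialized to true, only a contiguous run of offload-enabled map ops at the very end of the pipeline is accepted. Once a non-offload map is seen, any earlier map that requested offload is reset with the warning above and stays in the CPU pipeline. The test added at the bottom of this commit follows that rule: the Decode map runs on CPU and only the final HWC2CHW map carries offload=True.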
@@ -0,0 +1,82 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_OPT_PRE_NODE_OFFLOAD_PASS_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_OPT_PRE_NODE_OFFLOAD_PASS_H_

#include <memory>
#include <vector>
#include "minddata/dataset/engine/opt/pass.h"

namespace mindspore {
namespace dataset {
class DatasetOp;

/// \class NodeOffloadPass
/// \brief This is a tree pass that will offload nodes. It uses offload_nodes to first identify which
///     nodes should be offloaded, adds the nodes' names to the offload list, then removes the nodes from the ir tree.
class NodeOffloadPass : public IRTreePass {
  /// \class OffloadNodes
  /// \brief This is a NodePass whose job is to identify which nodes should be offloaded.
  class OffloadNodes : public IRNodePass {
   public:
    /// \brief Constructor
    OffloadNodes();
    /// \brief Destructor
    ~OffloadNodes() = default;

    /// \brief Perform MapNode offload check
    /// \param[in] node The node being visited
    /// \param[in, out] modified Indicator if the node was changed at all
    /// \return Status The status code returned
    Status Visit(std::shared_ptr<MapNode> node, bool *const modified) override;

    /// \brief Access selected offload nodes for removal.
    /// \return All the nodes to be removed by offload.
    std::vector<std::shared_ptr<DatasetNode>> nodes_to_offload() { return nodes_to_offload_; }

   private:
    std::vector<std::shared_ptr<DatasetNode>> nodes_to_offload_;
    bool prev_map_offloaded_;
  };

 public:
  /// \brief Constructor
  NodeOffloadPass();

  /// \brief Destructor
  ~NodeOffloadPass() = default;

  /// \brief Runs an offload_nodes pass first to find out which nodes to offload, then offloads them.
  /// \param[in, out] root_ir The tree to operate on.
  /// \param[in, out] modified Indicates if the tree was modified.
  /// \return Status The status code returned
  Status RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *const modified) override;
  /// \brief Getter
  /// \return JSON of offload
  nlohmann::json GetOffloadJson() { return offload_json_list_; }

 private:
  /// \brief JSON instance containing single offload op.
  nlohmann::json offload_json_;

  /// \brief JSON instance containing all offload ops.
  nlohmann::json offload_json_list_;
};
} // namespace dataset
} // namespace mindspore

#endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_OPT_PRE_NODE_OFFLOAD_PASS_H_
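To make the data flow concrete, here is a rough sketch of what offload_json_list_ might hold after the pass drops one offloaded map node. Only "op_type", "operations", "tensor_op_name" and "tensor_op_params" are actually relied on by the Python side of this commit; the other fields and the parameter values shown are illustrative guesses about what MapNode::to_json emits.

# Hypothetical contents of offload_json_list_ for a pipeline whose last map
# applied RandomHorizontalFlip(prob=0.5) followed by HWC2CHW (illustrative only).
offload_json_list = [
    {
        "op_type": "Map",
        "input_columns": ["image"],          # assumed field from MapNode::to_json
        "operations": [
            {"tensor_op_name": "RandomHorizontalFlip", "tensor_op_params": {"prob": 0.5}},
            {"tensor_op_name": "HWC2CHW", "tensor_op_params": {}},
        ],
    },
]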
@@ -61,6 +61,11 @@ Status NodeRemovalPass::RunOnTree(std::shared_ptr<DatasetNode> root_ir, bool *co
std::unique_ptr<NodeRemovalPass::RemovalNodes> removal_nodes = std::make_unique<NodeRemovalPass::RemovalNodes>();
RETURN_IF_NOT_OK(removal_nodes->Run(root_ir, modified));

// Update modified flag if there were any nodes identified to be removed
if (removal_nodes->nodes_to_remove().empty() == false) {
*modified = true;
}

// Then, execute the removal of any nodes that were set up for removal
for (auto node : removal_nodes->nodes_to_remove()) {
RETURN_IF_NOT_OK(node->Drop());

@@ -36,7 +36,6 @@ class NodeRemovalPass : public IRTreePass {
class RemovalNodes : public IRNodePass {
public:
/// \brief Constructor
/// \param[in] removal_pass Raw pointer back to controlling tree pass
RemovalNodes();

/// \brief Destructor
@@ -21,6 +21,7 @@
#ifndef ENABLE_ANDROID
#include "minddata/dataset/engine/opt/optional/tensor_op_fusion_pass.h"
#include "minddata/dataset/engine/opt/pre/cache_transform_pass.h"
#include "minddata/dataset/engine/opt/pre/node_offload_pass.h"
#include "minddata/dataset/engine/opt/post/repeat_pass.h"
#endif
#include "minddata/dataset/engine/opt/pass.h"

@@ -60,6 +61,14 @@ Status TreeAdapter::PrePass(std::shared_ptr<DatasetNode> ir) {
if (usage_ == kDeGetter) actions.emplace_back(std::make_unique<GetterPass>());
#ifndef ENABLE_ANDROID
actions.emplace_back(std::make_unique<CacheTransformPass>());

std::unique_ptr<NodeOffloadPass> offload = std::make_unique<NodeOffloadPass>();
// Checks nodes for offload removal
bool offload_mod = false;
// Checks ir_tree nodes for offload removal
offload->Run(ir, &offload_mod);
// Creates JSON object of offload nodes.
offload_json_ = offload->GetOffloadJson();
#endif
// Vector of flags for each action
std::vector<bool> modified(actions.size(), false);

@@ -69,7 +78,8 @@ Status TreeAdapter::PrePass(std::shared_ptr<DatasetNode> ir) {
RETURN_IF_NOT_OK(actions[i]->Run(ir, &m));
modified[i] = m;
}
MS_LOG(INFO) << "Pre pass complete.";

MS_LOG(INFO) << "Pre pass offload complete.";
return Status::OK();
}

@@ -260,5 +270,7 @@ Status TreeAdapter::Launch() {
return Status::OK();
}

nlohmann::json TreeAdapter::GetOffloadJson() { return offload_json_; }

} // namespace dataset
} // namespace mindspore
@@ -78,6 +78,9 @@ class TreeAdapter {
// Optional optimizations status
bool OptimizationEnabled() const { return optimize_; }

// Return Offload Json
nlohmann::json GetOffloadJson();

#ifndef ENABLE_SECURITY
/// \brief Setter for Profiling Manager
Status SetProfilingManagerPtr(std::shared_ptr<ProfilingManager> profiling_manager,

@@ -129,6 +132,7 @@ class TreeAdapter {
kCompileStateReady // Execution tree is generated from the optimized IR
};
CompileState tree_state_;
nlohmann::json offload_json_;
};
} // namespace dataset
} // namespace mindspore
@@ -729,3 +729,12 @@ def check_c_tensor_op(param, param_name):
def replace_none(value, default):
    """ replaces None with a default value."""
    return value if value is not None else default


def check_dataset_num_shards_shard_id(num_shards, shard_id):
    if (num_shards is None) != (shard_id is None):
        # These two parameters appear together.
        raise ValueError("num_shards and shard_id need to be passed in together.")
    if num_shards is not None:
        check_pos_int32(num_shards, "num_shards")
        if shard_id >= num_shards:
            raise ValueError("shard_id should be less than num_shards.")
@@ -51,6 +51,7 @@ from mindspore.common import Tensor
from mindspore import log as logger
from mindspore.parallel._ps_context import _is_role_pserver, _is_role_sched
from mindspore.parallel._utils import _get_device_num
from mindspore.dataset.engine.offload import GetOffloadModel, op_to_model

import mindspore.dataset.transforms.py_transforms as py_transforms


@@ -92,6 +93,29 @@ ShuffleToShuffleMode = {Shuffle.FILES: cde.ShuffleMode.FILES,
                        Shuffle.INFILE: cde.ShuffleMode.INFILE}


def get_offloadable_ops(operations):
    """
    Check if operations are supported by the offload hardware accelerator.

    Args:
        operations: list of operations.

    Returns:
        Dictionary mapping each operation name to a boolean indicating whether it can be offloaded.
    """
    is_offloadable = {}
    if not isinstance(operations, list):
        operations = [operations]
    for op in operations:
        name = op.__class__.__name__
        if name in op_to_model:
            is_offloadable[name] = True
        else:
            is_offloadable[name] = False

    return is_offloadable
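As a quick illustration (not part of the commit), the lookup is purely by class name against op_to_model from offload.py, so a mixed list of C transforms would come back something like this:

import mindspore.dataset.vision.c_transforms as C

ops = [C.Decode(), C.RandomHorizontalFlip(prob=0.5), C.HWC2CHW()]
print(get_offloadable_ops(ops))
# {'Decode': False, 'RandomHorizontalFlip': True, 'HWC2CHW': True}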
def shuffle_to_shuffle_mode(shuffle):
    """
    Shuffle Enum to Shuffle Mode

@@ -650,7 +674,8 @@ class Dataset:

    @check_map
    def map(self, operations, input_columns=None, output_columns=None, column_order=None,
            num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=16):
            num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None,
            max_rowsize=16, offload=False):
        """
        Apply each operation in operations to this dataset.


@@ -690,8 +715,9 @@ class Dataset:
            cache (DatasetCache, optional): Use tensor caching service to speed up dataset processing.
                (default=None, which means no cache is used).
            callbacks (DSCallback, list[DSCallback], optional): List of Dataset callbacks to be called (Default=None).
            max_rowsize(int, optional): Maximum size of row in MB that is used for shared memory allocation to copy
                data between processes. This is only used if python_multiprocessing is set to True (default=16).
            max_rowsize (int, optional): Maximum size of row in MB that is used for shared memory allocation to copy
                data between processes. This is only used if python_multiprocessing is set to True (Default=16).
            offload (bool, optional): Flag to indicate whether offload is used (Default=False).

        Returns:

@@ -785,7 +811,7 @@ class Dataset:
        """

        return MapDataset(self, operations, input_columns, output_columns, column_order, num_parallel_workers,
                          python_multiprocessing, cache, callbacks, max_rowsize)
                          python_multiprocessing, cache, callbacks, max_rowsize, offload)

    @check_filter
    def filter(self, predicate, input_columns=None, num_parallel_workers=None):
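For reference, the new flag is used exactly like any other map() argument; the test added at the end of this commit builds the pipeline below (the dataset directory is a placeholder):

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C

data = ds.ImageFolderDataset("/path/to/images")   # hypothetical directory
data = data.map(operations=[C.Decode()], input_columns="image")
data = data.map(operations=[C.HWC2CHW()], input_columns="image", offload=True)
data = data.batch(8, drop_remainder=True)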
@@ -2767,13 +2793,15 @@ class MapDataset(Dataset):
        callbacks (DSCallback, list[DSCallback], optional): List of Dataset callbacks to be called (Default=None)
        max_rowsize(int, optional): Maximum size of row in MB that is used for shared memory allocation to copy
            data between processes. This is only used if python_multiprocessing is set to True (default=16).
        offload (bool, optional): Flag to indicate whether offload is used (Default=False).

    Raises:
        ValueError: If len(input_columns) != len(output_columns) and column_order is not specified.
    """

    def __init__(self, input_dataset, operations=None, input_columns=None, output_columns=None, column_order=None,
                 num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=16):
                 num_parallel_workers=None, python_multiprocessing=False, cache=None, callbacks=None, max_rowsize=16,
                 offload=False):
        super().__init__(children=input_dataset, num_parallel_workers=num_parallel_workers, cache=cache)
        self.operations = to_list(operations)
        self.operations = py_transforms.Compose.reduce(self.operations)

@@ -2799,6 +2827,20 @@ class MapDataset(Dataset):

        self.callbacks = to_list(callbacks)
        self.max_rowsize = max_rowsize
        self.offload = offload

        if self.offload is True:
            offloadable_ops = get_offloadable_ops(operations)
            cannot_offload = False
            invalid_ops = []
            for op in offloadable_ops:
                if offloadable_ops[op] is not True:
                    cannot_offload = True
                    invalid_ops.append(op)
            if cannot_offload is True:
                logger.warning(("In map(), offload is set to True, but offload is not supported for the following "
                                "operation(s): {} \nSetting offload to False").format(*invalid_ops))
                self.offload = False

    def parse(self, children=None):
        operations = []

@@ -2810,7 +2852,7 @@ class MapDataset(Dataset):

        callbacks = [cb.create_runtime_obj() for cb in self.callbacks]
        return cde.MapNode(children[0], operations, self.input_columns, self.output_columns, self.column_order,
                           callbacks)
                           callbacks, self.max_rowsize, self.offload)

    def __deepcopy__(self, memodict):
        return self.__safe_deepcopy__(memodict, exclude=("operations", "callbacks", "__transfer_dataset__"))
@@ -3210,6 +3252,13 @@ class _ToDevice:
    def __deepcopy__(self, memodict):
        return self

    def get_offload_model(self):
        """
        Get offload model containing removed offload ops from pipeline.
        """
        offload_model = GetOffloadModel(self._to_device)
        return offload_model


class TransferDataset(Dataset):
    """

@@ -3287,6 +3336,12 @@ class TransferDataset(Dataset):
            return self._to_device.get_data_info()
        raise RuntimeError("Calling get_data_info with bad state.")

    def get_offload_model(self):
        if self._to_device is not None:
            return self._to_device.get_offload_model()

        raise RuntimeError("get_offload_model, _to_device is None")

    def release(self):
        """
        Manually terminate Device Queue instead of relying on out of scope destruction.

@@ -6478,6 +6533,7 @@ class _Flowers102Dataset:
    """
    Mainly for loading Flowers102 Dataset, and return one row each time.
    """

    def __init__(self, dataset_dir, task, usage, decode):
        self.dataset_dir = os.path.realpath(dataset_dir)
        self.task = task
@@ -20,8 +20,9 @@ import signal
import weakref
import numpy as np

from mindspore.common.tensor import Tensor
import mindspore._c_dataengine as cde
from mindspore.common.tensor import Tensor
import mindspore.dataset.engine.offload as offload

from mindspore import log as logger


@@ -86,6 +87,10 @@ class Iterator:
        self._transform_tensor = lambda t: Tensor.from_numpy(t.as_array())
        self.__index = 0

        self.offload_model = None
        if offload.check_map_offload(self.__ori_dataset):
            self.offload_model = offload.GetOffloadModel(consumer)

        ITERATORS_LIST.append(weakref.ref(self))
        _unset_iterator_cleanup()


@@ -139,6 +144,10 @@ class Iterator:
            self.__ori_dataset.dataset_size = self.__index
            raise StopIteration
        self.__index += 1

        if self.offload_model is not None:
            data = offload.apply_offload_iterators(data, self.offload_model)

        return data

    def __deepcopy__(self, memo):
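This is the non-sink half of the feature: NodeOffloadPass drops the offloaded map ops from the CPU pipeline, and the iterator re-applies them on the host through apply_offload_iterators, which (per offload.py below) currently requires exactly two dataset columns and runs the offload model on the first one.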
@@ -0,0 +1,368 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Offload Support.
"""
import json
import numpy as np

import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
import mindspore.nn as nn
import mindspore.ops.composite as C
from mindspore.ops import operations as P


def check_map_offload(dataset):
    """
    Check if the offload flag is set in any of the data pipeline's map ops.
    """
    offload_check = False
    dataset_tmp = dataset
    while dataset_tmp:
        if hasattr(dataset_tmp, 'offload'):
            if dataset_tmp.offload is True:
                offload_check = True
        if dataset_tmp.children:
            dataset_tmp = dataset_tmp.children[0]
            continue
        dataset_tmp = dataset_tmp.children

    if offload_check is True:
        if len(dataset.children) > 1:
            raise RuntimeError("Offload currently does not support concatenated datasets.")

    return offload_check


def apply_offload_iterators(data, offload_model):
    """
    Apply offload for a non-sink mode pipeline.
    """
    if len(data) != 2:
        # A temporary solution to ensure there are two columns in dataset.
        raise RuntimeError("Offload can currently only use datasets with two columns.")
    if isinstance(data[0], Tensor) is True:
        data[0] = offload_model(data[0])
    else:
        data[0] = Tensor(data[0], dtype=mstype.float32)
        data[0] = offload_model(data[0]).asnumpy()

    return data
|
||||
"""
|
||||
Concatenates offload model with network.
|
||||
"""
|
||||
def __init__(self, transform, model):
|
||||
super(ApplyPreTransform, self).__init__(auto_prefix=False, flags=model.get_flags())
|
||||
self.transform = transform
|
||||
self.model = model
|
||||
|
||||
def construct(self, x, label):
|
||||
x = self.transform(x)
|
||||
x = self.model(x, label)
|
||||
return x
|
||||
|
||||
|
||||
class IdentityCell(nn.Cell):
|
||||
"""
|
||||
Applies identity transform on given input tensors.
|
||||
"""
|
||||
def __init__(self):
|
||||
super(IdentityCell, self).__init__()
|
||||
self.identity = P.Identity()
|
||||
|
||||
def construct(self, x):
|
||||
return self.identity(x)
|
||||
|
||||
|
||||
class RandomHorizontalFlip(nn.Cell):
|
||||
"""
|
||||
Applies Random Horizontal Flip transform on given input tensors.
|
||||
"""
|
||||
def __init__(self, prob):
|
||||
super(RandomHorizontalFlip, self).__init__()
|
||||
|
||||
self.prob = Tensor(prob, dtype=mstype.float32)
|
||||
|
||||
self.cast = P.Cast()
|
||||
self.shape = P.Shape()
|
||||
self.uniformReal = P.UniformReal()
|
||||
self.reshape = P.Reshape()
|
||||
self.h_flip = P.ReverseV2(axis=[2])
|
||||
self.mul = P.Mul()
|
||||
|
||||
def construct(self, x):
|
||||
|
||||
x = self.cast(x, mstype.float32)
|
||||
bs, h, w, c = self.shape(x)
|
||||
|
||||
flip_rand_factor = self.uniformReal((bs, 1))
|
||||
flip_rand_factor = self.cast((self.prob > flip_rand_factor), mstype.float32)
|
||||
flip_rand_factor = self.reshape(C.repeat_elements(flip_rand_factor, rep=(h*w*c)), (bs, h, w, c))
|
||||
|
||||
x_flip = self.h_flip(x)
|
||||
x = self.mul(x_flip, flip_rand_factor) + self.mul((1 - flip_rand_factor), x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
class RandomVerticalFlip(nn.Cell):
|
||||
"""
|
||||
Applies Random Vertical Flip transform on given input tensors.
|
||||
"""
|
||||
def __init__(self, prob):
|
||||
super(RandomVerticalFlip, self).__init__()
|
||||
|
||||
self.prob = Tensor(prob, dtype=mstype.float32)
|
||||
|
||||
self.cast = P.Cast()
|
||||
self.shape = P.Shape()
|
||||
self.uniformReal = P.UniformReal()
|
||||
self.reshape = P.Reshape()
|
||||
self.h_flip = P.ReverseV2(axis=[1])
|
||||
self.mul = P.Mul()
|
||||
|
||||
def construct(self, x):
|
||||
|
||||
x = self.cast(x, mstype.float32)
|
||||
bs, h, w, c = self.shape(x)
|
||||
|
||||
flip_rand_factor = self.uniformReal((bs, 1))
|
||||
flip_rand_factor = self.cast((self.prob > flip_rand_factor), mstype.float32)
|
||||
flip_rand_factor = self.reshape(C.repeat_elements(flip_rand_factor, rep=(h*w*c)), (bs, h, w, c))
|
||||
|
||||
x_flip = self.h_flip(x)
|
||||
x = self.mul(x_flip, flip_rand_factor) + self.mul((1 - flip_rand_factor), x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
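Both flip cells rely on the same batched masking trick rather than per-sample control flow: a per-sample factor m in {0, 1} is drawn by comparing prob against a uniform sample, broadcast to (bs, h, w, c), and the output is m * flip(x) + (1 - m) * x, so flipped and unflipped samples are blended element-wise inside a single graph. The color, sharpness and rescale cells below reuse the same broadcast-a-random-factor pattern.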
class RandomColorAdjust(nn.Cell):
    """
    Applies Random Color Adjust transform on given input tensors.
    """
    def __init__(self, brightness, saturation):
        super(RandomColorAdjust, self).__init__()

        if isinstance(brightness, (list, tuple)):
            self.br_min = brightness[0]
            self.br_max = brightness[1]
        else:
            self.br_min = max(0, 1 - brightness)
            self.br_max = 1 + brightness

        if isinstance(saturation, (list, tuple)):
            self.sa_min = saturation[0]
            self.sa_max = saturation[1]
        else:
            self.sa_min = max(0, 1 - saturation)
            self.sa_max = 1 + saturation

        self.cast = P.Cast()
        self.shape = P.Shape()
        self.uniformReal = P.UniformReal()
        self.reshape = P.Reshape()
        self.unstack = P.Unstack(axis=-1)
        self.expand_dims = P.ExpandDims()
        self.mul = P.Mul()

    def construct(self, x):

        x = self.cast(x, mstype.float32)
        bs, h, w, c = self.shape(x)

        br_rand_factor = self.br_min + (self.br_max - self.br_min)*self.uniformReal((bs, 1))
        br_rand_factor = self.reshape(C.repeat_elements(br_rand_factor, rep=(h*w*c)), (bs, h, w, c))

        sa_rand_factor = self.sa_min + (self.sa_max - self.sa_min)*self.uniformReal((bs, 1))
        sa_rand_factor = self.reshape(C.repeat_elements(sa_rand_factor, rep=(h*w*c)), (bs, h, w, c))

        r, g, b = self.unstack(x)
        x_gray = C.repeat_elements(self.expand_dims((0.2989 * r + 0.587 * g + 0.114 * b), -1), rep=c, axis=-1)

        x = self.mul(x, br_rand_factor)
        x = C.clip_by_value(x, 0.0, 255.0)

        x = self.mul(x, sa_rand_factor) + self.mul((1 - sa_rand_factor), x_gray)
        x = C.clip_by_value(x, 0.0, 255.0)

        return x


class RandomSharpness(nn.Cell):
    """
    Applies Random Sharpness transform on given input tensors.
    """
    def __init__(self, degrees):
        super(RandomSharpness, self).__init__()

        if isinstance(degrees, (list, tuple)):
            self.degree_min = degrees[0]
            self.degree_max = degrees[1]
        else:
            self.degree_min = max(0, 1 - degrees)
            self.degree_max = 1 + degrees

        self.cast = P.Cast()
        self.shape = P.Shape()
        self.uniformReal = P.UniformReal()
        self.reshape = P.Reshape()
        self.expand_dims = P.ExpandDims()
        self.mul = P.Mul()
        self.transpose = P.Transpose()

        self.weight = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]])/13.0
        self.weight = np.repeat(self.weight[np.newaxis, :, :], 3, axis=0)
        self.weight = np.repeat(self.weight[np.newaxis, :, :], 3, axis=0)
        self.weight = Tensor(self.weight, mstype.float32)

        self.filter = P.Conv2D(out_channel=3, kernel_size=(3, 3), pad_mode='same')

    def construct(self, x):

        x = self.cast(x, mstype.float32)
        bs, h, w, c = self.shape(x)

        degree_rand_factor = self.degree_min + (self.degree_max - self.degree_min)*self.uniformReal((bs, 1))
        degree_rand_factor = self.reshape(C.repeat_elements(degree_rand_factor, rep=(h*w*c)), (bs, h, w, c))

        x_sharp = self.filter(self.transpose(x, (0, 3, 1, 2)), self.weight)
        x_sharp = self.transpose(x_sharp, (0, 2, 3, 1))

        x = self.mul(x, degree_rand_factor) + self.mul((1 - degree_rand_factor), x_sharp)
        x = C.clip_by_value(x, 0.0, 255.0)

        return x
class Rescale(nn.Cell):
    """
    Applies Rescale transform on given input tensors.
    """
    def __init__(self, rescale, shift):
        super(Rescale, self).__init__()

        self.rescale = Tensor(rescale, dtype=mstype.float32)
        self.shift = Tensor(shift, dtype=mstype.float32)

        self.cast = P.Cast()
        self.mul = P.Mul()

    def construct(self, x):

        x = self.cast(x, mstype.float32)
        x = x * self.rescale + self.shift

        return x


class HwcToChw(nn.Cell):
    """
    Applies Channel Swap transform on given input tensors.
    """
    def __init__(self):
        super(HwcToChw, self).__init__()
        self.trans = P.Transpose()

    def construct(self, x):
        return self.trans(x, (0, 3, 1, 2))


class Normalize(nn.Cell):
    """
    Applies Normalize transform on given input tensors.
    """
    def __init__(self, mean, std):
        super(Normalize, self).__init__()
        self.mean = Tensor(mean, mstype.float32)
        self.std = Tensor(std, mstype.float32)
        self.sub = P.Sub()
        self.div = P.Div()
        self.cast = P.Cast()

    def construct(self, x):
        x = self.cast(x, mstype.float32)
        x = self.sub(x, self.mean)
        x = self.div(x, self.std)
        return x


class OffloadModel():
    def __init__(self, func, args_names=None):
        self.func = func
        self.args_names = args_names


# Dictionary connecting operation name to model
op_to_model = {
    "HWC2CHW": OffloadModel(HwcToChw),
    "HwcToChw": OffloadModel(HwcToChw),
    "Normalize": OffloadModel(Normalize, ["std", "mean"]),
    "RandomColorAdjust": OffloadModel(RandomColorAdjust, ["brightness", "saturation"]),
    "RandomHorizontalFlip": OffloadModel(RandomHorizontalFlip, ["prob"]),
    "RandomSharpness": OffloadModel(RandomSharpness, ["degrees"]),
    "RandomVerticalFlip": OffloadModel(RandomVerticalFlip, ["prob"]),
    "Rescale": OffloadModel(Rescale, ["rescale", "shift"])
}
class GetModelFromJson2Col(nn.Cell):
    """
    Generates offload ME model from offload JSON file for a single map op.
    """
    def __init__(self, json_offload):
        super(GetModelFromJson2Col, self).__init__()
        self.me_ops = []
        if json_offload is not None:
            offload_ops = json_offload["operations"]
            for op in offload_ops:
                name = op["tensor_op_name"]
                args = op["tensor_op_params"]
                op_model = op_to_model[name]
                op_model_inputs = []
                if op_model.args_names is not None:
                    for arg_key in op_model.args_names:
                        op_model_inputs.append(args[arg_key])

                self.me_ops.append(op_model.func(*op_model_inputs))
        else:
            raise RuntimeError("Offload hardware accelerator cannot be applied for this pipeline.")

        self.cell = nn.SequentialCell(self.me_ops)

    def construct(self, x):
        return self.cell(x)


class GetOffloadModel(nn.Cell):
    """
    Generates offload ME model.
    """
    def __init__(self, dataset_consumer):
        super(GetOffloadModel, self).__init__()
        self.transform_list = []
        json_offload = json.loads(dataset_consumer.GetOffload())
        if json_offload is not None:
            for node in json_offload:
                if node["op_type"] == 'Map':
                    self.transform_list.append(GetModelFromJson2Col(node))
            self.transform_list.reverse()

    def construct(self, x):
        for transform in self.transform_list:
            x = transform(x)
        return x
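Taken together with the dataset_helper changes further down, the sink-mode wiring amounts to roughly the sketch below, where transfer_dataset and net are placeholder names for the pipeline's TransferDataset and the training network:

# Minimal sketch of the sink-mode path, assuming `transfer_dataset` and `net` exist.
offload_model = transfer_dataset.get_offload_model()  # wraps GetOffloadModel(self._to_device)
net = ApplyPreTransform(offload_model, net)           # offloaded ops run on device ahead of the network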
@@ -26,7 +26,7 @@ from mindspore._c_expression import typing
from ..core.validator_helpers import parse_user_args, type_check, type_check_list, check_value, \
    INT32_MAX, check_valid_detype, check_dir, check_file, check_sampler_shuffle_shard_options, \
    validate_dataset_param_value, check_padding_options, check_gnn_list_or_ndarray, check_gnn_list_of_pair_or_ndarray, \
    check_num_parallel_workers, check_columns, check_pos_int32, check_valid_str
    check_num_parallel_workers, check_columns, check_pos_int32, check_valid_str, check_dataset_num_shards_shard_id

from . import datasets
from . import samplers

@@ -548,13 +548,7 @@ def check_generatordataset(method):

        num_shards = param_dict.get("num_shards")
        shard_id = param_dict.get("shard_id")
        if (num_shards is None) != (shard_id is None):
            # These two parameters appear together.
            raise ValueError("num_shards and shard_id need to be passed in together.")
        if num_shards is not None:
            check_pos_int32(num_shards, "num_shards")
            if shard_id >= num_shards:
                raise ValueError("shard_id should be less than num_shards.")
        check_dataset_num_shards_shard_id(num_shards, shard_id)

        sampler = param_dict.get("sampler")
        if sampler is not None:

@@ -776,7 +770,7 @@ def check_map(method):
    def new_method(self, *args, **kwargs):
        from mindspore.dataset.callback import DSCallback
        [_, input_columns, output_columns, column_order, num_parallel_workers, python_multiprocessing, cache,
         callbacks, max_rowsize], _ = \
         callbacks, max_rowsize, offload], _ = \
            parse_user_args(method, *args, **kwargs)

        nreq_param_columns = ['input_columns', 'output_columns', 'column_order']

@@ -788,6 +782,7 @@ def check_map(method):
        type_check(python_multiprocessing, (bool,), "python_multiprocessing")
        check_cache_option(cache)
        type_check(max_rowsize, (int,), "max_rowsize")
        type_check(offload, (bool,), "offload")

        if callbacks is not None:
            if isinstance(callbacks, (list, tuple)):
@@ -118,6 +118,18 @@ def _generate_network_with_dataset(network, dataset_helper, queue_name):
    return network


def _check_add_offload(dataset, dataset_helper, network):
    from mindspore.dataset.engine import offload
    if offload.check_map_offload(dataset.__transfer_dataset__):
        # A temporary solution to ensure there are two columns in dataset.
        dataset_types, _ = dataset_helper.types_shapes()
        if len(dataset_types) != 2:
            raise RuntimeError("Offload can currently only use datasets with two columns.")
        offload_model = dataset.__transfer_dataset__.get_offload_model()
        network = offload.ApplyPreTransform(offload_model, network)
    return network


def connect_network_with_dataset(network, dataset_helper):
    """
    Connect the `network` with dataset in `dataset_helper`.

@@ -153,7 +165,6 @@ def connect_network_with_dataset(network, dataset_helper):
        >>> net = Net()
        >>> net_with_get_next = connect_network_with_dataset(net, dataset_helper)
    """

    dataset_iter = dataset_helper.iter
    dataset = dataset_iter.dataset

@@ -191,6 +202,7 @@ def connect_network_with_dataset(network, dataset_helper):
            not context.get_context("enable_ge") and \
            context.get_context("device_target") in ("Ascend", "GPU"):
        dataset.__me_inited__ = True
        network = _check_add_offload(dataset, dataset_helper, network)
        network = _generate_network_with_dataset(network, dataset_helper, queue_name)

    if _dynamic_sink_data(dataset, dataset_iter) and _dynamic_sink_exception_scenario(dataset_iter):
@@ -0,0 +1,47 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import numpy as np

import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as C


DATA_DIR = "../data/dataset/testPK/data"


def test_offload():
    """
    Feature: Test map offload flag.
    Description: Input is an image dataset.
    Expectation: Output should be the same whether offload is activated or deactivated.
    """
    # Dataset with offload activated.
    dataset_0 = ds.ImageFolderDataset(DATA_DIR)
    dataset_0 = dataset_0.map(operations=[C.Decode()], input_columns="image")
    dataset_0 = dataset_0.map(operations=[C.HWC2CHW()], input_columns="image", offload=True)
    dataset_0 = dataset_0.batch(8, drop_remainder=True)

    # Dataset with offload not activated.
    dataset_1 = ds.ImageFolderDataset(DATA_DIR)
    dataset_1 = dataset_1.map(operations=[C.Decode()], input_columns="image")
    dataset_1 = dataset_1.map(operations=[C.HWC2CHW()], input_columns="image")
    dataset_1 = dataset_1.batch(8, drop_remainder=True)

    for (img_0, _), (img_1, _) in zip(dataset_0.create_tuple_iterator(num_epochs=1, output_numpy=True),
                                      dataset_1.create_tuple_iterator(num_epochs=1, output_numpy=True)):
        np.testing.assert_array_equal(img_0, img_1)


if __name__ == "__main__":
    test_offload()