!6320 change mix_precision to c++

Merge pull request !6320 from vlne-v1/pynative_amp
2020-09-19 21:29:34 +08:00 · 2020-09-19 21:29:34 +08:00 · 3aa07a4362
parent 4905de06bd 44c01e27c0
commit 3aa07a4362
8 changed files with 149 additions and 69 deletions
--- a/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
+++ b/mindspore/ccsrc/pipeline/pynative/pynative_execute.cc
@ -257,41 +257,84 @@ py::object DoAutoCast(const py::object &arg, const TypeId &type_id) {
  return RunOp(args)[0];
 }

-void ConvertInputs(const PrimitivePyPtr &prim, const py::list &args, const OpExecInfoPtr &op_exec_info) {
-  auto &out_args = op_exec_info->op_inputs;
-  auto signature = prim->signatures();
-  std::vector<SignatureEnumDType> dtypes;
-  (void)std::transform(signature.begin(), signature.end(), std::back_inserter(dtypes),
-                       [](const Signature &sig) { return sig.dtype; });
-  int empty_dtype_count = std::count(dtypes.begin(), dtypes.end(), SignatureEnumDType::kDTypeEmptyDefaultValue);
-  if (dtypes.empty() || static_cast<int>(dtypes.size()) == empty_dtype_count) {
-    return;
+py::object DoParamMixPrecisionCast(bool *is_cast, const py::object obj) {
+  auto tensor = py::cast<tensor::TensorPtr>(obj);
+  auto cast_type = tensor->cast_dtype();
+  py::object cast_output;
+  if (cast_type != nullptr) {
+    auto source_element = tensor->Dtype();
+    if (source_element != nullptr && IsSubType(source_element, kFloat) && *source_element != *cast_type) {
+      MS_LOG(DEBUG) << "cast to " << cast_type->ToString();
+      cast_output = DoAutoCast(obj, cast_type->type_id());
+      *is_cast = true;
    }
-  auto type_indexes = GetTypeIndex(dtypes);
-  auto dst_type = GetDstType(out_args, type_indexes);
+  }
+  return cast_output;
+}

-  for (size_t i = 0; i < dtypes.size(); ++i) {
-    if (dtypes[i] == SignatureEnumDType::kDTypeEmptyDefaultValue) {
+py::object DoParamMixPrecisionCastTuple(bool *is_cast, const py::tuple tuple) {
+  auto tuple_size = static_cast<int>(tuple.size());
+  py::tuple result(tuple_size);
+
+  for (int i = 0; i < tuple_size; i++) {
+    if (py::isinstance<tensor::MetaTensor>(tuple[i])) {
+      MS_LOG(DEBUG) << "call cast for item " << i;
+      result[i] = DoParamMixPrecisionCast(is_cast, tuple[i]);
+    } else if (py::isinstance<py::tuple>(tuple[i])) {
+      result[i] = DoParamMixPrecisionCastTuple(is_cast, tuple[i]);
+    }
+  }
+  return result;
+}
+
+bool GetSignatureType(const PrimitivePyPtr &prim, std::vector<SignatureEnumDType> *dtypes) {
+  auto signature = prim->signatures();
+  bool has_sig_dtype = false;
+  (void)std::transform(signature.begin(), signature.end(), std::back_inserter(*dtypes),
+                       [&has_sig_dtype](const Signature &sig) {
+                         auto dtype = sig.dtype;
+                         if (dtype != SignatureEnumDType::kDTypeEmptyDefaultValue) {
+                           has_sig_dtype = true;
+                         }
+                         return dtype;
+                       });
+  return has_sig_dtype;
+}
+
+void DoSignatrueCast(const PrimitivePyPtr &prim, const std::map<SignatureEnumDType, TypeId> &dst_type,
+                     const std::vector<SignatureEnumDType> &dtypes, const OpExecInfoPtr &op_exec_info) {
+  const auto &signature = prim->signatures();
+  auto &out_args = op_exec_info->op_inputs;
+  bool has_dtype_sig = (dtypes.size() > 0);
+  for (size_t i = 0; i < out_args.size(); ++i) {
+    MS_LOG(DEBUG) << "check inputs " << i;
+    auto obj = out_args[i];
+    auto sig = SignatureEnumRW::kRWDefault;
+    if (signature.size() > 0) {
+      sig = signature[i].rw;
+    }
+    bool is_parameter = false;
+    TypeId arg_type_id = kTypeUnknown;
+    if (py::isinstance<tensor::MetaTensor>(obj)) {
+      auto arg = py::cast<tensor::MetaTensorPtr>(obj);
+      if (arg->is_parameter()) {
+        is_parameter = true;
+        MS_LOG(DEBUG) << "parameter is read " << i;
+      }
+      arg_type_id = arg->data_type();
+    }
+
+    // No need to implicit cast if no dtype.
+    if (!has_dtype_sig || dtypes[i] == SignatureEnumDType::kDTypeEmptyDefaultValue) {
      continue;
    }
    auto it = dst_type.find(dtypes[i]);
    if (it == dst_type.end() || it->second == kTypeUnknown) {
      continue;
    }
-
-    auto obj = out_args[i];
-    auto sig = signature[i].rw;
-    bool is_parameter = false;
+    // implicit cast
    bool is_same_type = false;
-    TypeId arg_type_id = kTypeUnknown;
    bool is_sig_write = (sig == SignatureEnumRW::kRWWrite);
-    if (py::isinstance<tensor::MetaTensor>(obj)) {
-      auto arg = py::cast<tensor::MetaTensorPtr>(obj);
-      if (arg->is_parameter()) {
-        is_parameter = true;
-      }
-      arg_type_id = arg->data_type();
-    }
    if (arg_type_id != 0) {
      is_same_type = (prim::type_map.find(arg_type_id) == prim::type_map.end() || arg_type_id == it->second);
    }
@ -318,7 +361,6 @@ void ConvertInputs(const PrimitivePyPtr &prim, const py::list &args, const OpExe
    }
    py::object cast_output = DoAutoCast(out_args[i], it->second);
    out_args[i] = cast_output;
-    ValuePtr input_value = PyAttrValue(cast_output);
  }
 }

@ -347,7 +389,6 @@ OpExecInfoPtr GenerateOpExecInfo(const py::args &args) {
  op_exec_info->py_primitive = prim;
  op_exec_info->op_attrs = py::getattr(args[PY_PRIM], "attrs");
  op_exec_info->op_inputs = args[PY_INPUTS];
-  ConvertInputs(prim, args[PY_INPUTS], op_exec_info);
  return op_exec_info;
 }

@ -698,11 +739,53 @@ AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, std::v
  inputs.push_back(NewValueNode(prim));

  size_t size = op_exec_info->op_inputs.size();
-  auto const_input_index = prim->get_const_input_indexes();
-  bool have_const_input = !const_input_index.empty();
-  bool is_const_prim = prim->is_const_prim();
+  // ignore signature for cast op
+  bool is_cast_op = (op_exec_info->op_name == "Cast");
+  if (!is_cast_op) {
+    const auto &signature = prim->signatures();
    for (size_t i = 0; i < size; i++) {
      auto obj = op_exec_info->op_inputs[i];
+      auto sig = SignatureEnumRW::kRWDefault;
+      if (signature.size() > 0) {
+        sig = signature[i].rw;
+      }
+      MS_LOG(DEBUG) << "check mix precision " << op_exec_info->op_name << " input " << i << " "
+                    << std::string(py::repr(obj));
+      // mix precision for non param
+      bool is_cast = false;
+      py::object cast_output;
+      if (py::isinstance<tensor::MetaTensor>(obj)) {
+        auto meta_tensor = obj.cast<tensor::MetaTensorPtr>();
+        if (meta_tensor && meta_tensor->is_parameter()) {
+          if (sig != SignatureEnumRW::kRWRead) {
+            continue;
+          }
+        }
+        // redundant cast call if the tensor is a const Tensor.
+        cast_output = DoParamMixPrecisionCast(&is_cast, obj);
+      } else if (py::isinstance<py::tuple>(obj)) {
+        // mix precision for tuple inputs
+        cast_output = DoParamMixPrecisionCastTuple(&is_cast, obj);
+      }
+      if (is_cast) {
+        op_exec_info->op_inputs[i] = cast_output;
+      }
+    }
+    std::vector<SignatureEnumDType> dtypes;
+
+    bool has_dtype_sig = GetSignatureType(prim, &dtypes);
+    std::map<SignatureEnumDType, TypeId> dst_types;
+    if (has_dtype_sig) {
+      // fetch info for implicit cast
+      auto type_indexes = GetTypeIndex(dtypes);
+      dst_types = GetDstType(op_exec_info->op_inputs, type_indexes);
+    }
+    MS_LOG(DEBUG) << "do signature for " << op_exec_info->op_name;
+    DoSignatrueCast(prim, dst_types, dtypes, op_exec_info);
+  }
+  MS_LOG(DEBUG) << "make cnode for " << op_exec_info->op_name;
+  for (size_t i = 0; i < size; i++) {
+    const auto &obj = op_exec_info->op_inputs[i];
    bool op_mask = false;
    if (py::isinstance<tensor::MetaTensor>(obj)) {
      auto meta_tensor = obj.cast<tensor::MetaTensorPtr>();
@ -710,9 +793,8 @@ AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, std::v
        op_mask = meta_tensor->is_parameter();
      }
    }
-
    (*op_masks).push_back(op_mask);
-    MS_LOG(DEBUG) << "gen " << op_exec_info->op_name << " arg " << i << ": op mask " << op_mask << " grad_flag_ "
+    MS_LOG(DEBUG) << "gen args i " << i << " " << op_exec_info->op_name << " op mask " << op_mask << " grad_flag_ "
                  << grad_flag_;

    AnfNodePtr node = nullptr;
@ -727,6 +809,10 @@ AnfNodePtr PynativeExecutor::MakeCNode(const OpExecInfoPtr &op_exec_info, std::v
    if (node != nullptr && node->abstract() != nullptr) {
      abs = node->abstract();
    }
+
+    auto const_input_index = prim->get_const_input_indexes();
+    bool have_const_input = !const_input_index.empty();
+    bool is_const_prim = prim->is_const_prim();
    MS_LOG(DEBUG) << prim->ToString() << " abs is nullptr " << (abs == nullptr) << " is_const_value "
                  << prim->is_const_prim();
    bool is_const_input = have_const_input && std::count(const_input_index.begin(), const_input_index.end(), i);
@ -998,7 +1084,7 @@ py::tuple PynativeExecutor::RunOpInner(const OpExecInfoPtr &op_exec_info) {
 }

 py::tuple PynativeExecutor::RunOpInner(const py::args &args) {
-  MS_LOG(DEBUG) << "RunOp start" << args.size();
+  MS_LOG(DEBUG) << "RunOp start " << args.size();
  OpExecInfoPtr op_exec_info = nullptr;
  auto prim = py::cast<PrimitivePyPtr>(args[PY_PRIM]);
  auto name = py::cast<std::string>(args[PY_NAME]);
--- a/mindspore/ccsrc/pybind_api/ir/tensor_py.cc
+++ b/mindspore/ccsrc/pybind_api/ir/tensor_py.cc
@ -455,6 +455,7 @@ REGISTER_PYBIND_DEFINE(Tensor, ([](const py::module *m) {
                                  >>> data.set_dtype(mindspore.int32)
                                  mindspore.int32
                              )mydelimiter")
+                           .def("set_cast_dtype", &Tensor::set_cast_dtype)
                           .def("__str__", &Tensor::ToString)
                           .def("__repr__", &Tensor::ToStringRepr)
                           .def(py::pickle(
--- a/mindspore/common/api.py
+++ b/mindspore/common/api.py
@ -292,6 +292,7 @@ class _PynativeExecutor:
    def __init__(self):
        self._executor = PynativeExecutor_.get_instance()

+    #TODO(kpy):add a type arg
    def new_graph(self, obj, *args, **kwargs):
        self._executor.new_graph(obj, *args, *(kwargs.values()))

--- a/mindspore/common/parameter.py
+++ b/mindspore/common/parameter.py
@ -219,16 +219,6 @@ class Parameter(MetaTensor):
            raise ValueError("The type of the name should be `str` or `None`.")
        self._param_info.name = name_

-    @property
-    def cast_type(self):
-        return self._cast_type
-
-    @cast_type.setter
-    def cast_type(self, dst_type):
-        if dst_type not in (mstype.float16, mstype.float32, None):
-            raise ValueError("The type of the name should be type of [float32, float16] or `None`.")
-        self._cast_type = dst_type
-
    @property
    def sliced(self):
        """Get slice status of the parameter."""
--- a/mindspore/core/ir/tensor.h
+++ b/mindspore/core/ir/tensor.h
@ -268,6 +268,8 @@ class Tensor : public MetaTensor {
  std::vector<Axis> padding_type() const { return padding_type_; }

  std::string id() const { return id_; }
+  TypePtr cast_dtype() { return cast_dtype_; }
+  void set_cast_dtype(TypePtr dtype) { cast_dtype_ = dtype; }

  void SetNeedWait(bool need_wait) {
    if (event_ != nullptr) {
@ -310,6 +312,7 @@ class Tensor : public MetaTensor {
  mutable TensorSyncStatus sync_status_{kNeedSyncHostToDevice};
  DeviceSyncPtr device_sync_{nullptr};
  std::vector<Axis> padding_type_;
+  TypePtr cast_dtype_{nullptr};
 };
 using TensorPtr = std::shared_ptr<Tensor>;
 using TensorPtrList = std::vector<std::shared_ptr<Tensor>>;
--- a/mindspore/nn/cell.py
+++ b/mindspore/nn/cell.py
@ -61,15 +61,17 @@ class Cell(Cell_):
    """
    IGNORE_LIST = ['_scope', '_cell_init_args', '_auto_prefix', '_cells', '_params', '_construct_inputs_names',
                   '_construct_inputs_num', '_create_time', '_mindspore_flags', '_parallel_inputs_run',
-                   '_parameter_layout_dict', '_already_run', '_params_list', '_phase', '_auto_parallel_mode',
-                   '_backward_hook', '_bprop_debug', '_is_run', '_param_prefix', '_attr_synced',
-                   'enable_hook', 'pynative', 'requires_grad', '_auto_parallel_compile_and_run', 'cell_type']
+                   '_parameter_layout_dict', '_already_run', '_params_list', '_tensor_list', '_phase',
+                   '_auto_parallel_mode', '_backward_hook', '_bprop_debug', '_is_run', '_param_prefix',
+                   '_attr_synced', 'enable_hook', 'pynative', 'requires_grad',
+                   '_auto_parallel_compile_and_run', 'cell_type']

    def __init__(self, auto_prefix=True, flags=None):
        Cell_.__init__(self, self._cell_tag)
        self._params = OrderedDict()
        self._cells = OrderedDict()
        self._params_list = OrderedDict()
+        self._tensor_list = OrderedDict()
        self.training = False
        self.requires_grad = False
        self.pynative = False
@ -228,6 +230,9 @@ class Cell(Cell_):
                return cells[name]
        if context.get_context("mode") == context.PYNATIVE_MODE and '_params_list' in self.__dict__:
            params_list = self.__dict__['_params_list']
+            tensor_list = self.__dict__['_tensor_list']
+            if name in tensor_list:
+                return self.cast_param(tensor_list[name])
            if name in params_list:
                para_list = params_list[name]
                cast_list = list()
@ -339,6 +344,7 @@ class Cell(Cell_):
        cells = self.__dict__.get('_cells')
        params = self.__dict__.get('_params')
        params_list = self.__dict__.get('_params_list')
+        tensor_list = self.__dict__.get('_tensor_list')
        if isinstance(value, Parameter):
            if params is None:
                raise AttributeError("Can not assign params before Cell.__init__() call.")
@ -383,6 +389,13 @@ class Cell(Cell_):
            if value is not None:
                raise TypeError("Expected type is cell, but got {}.".format(type(value)))
            self._cells[name] = None
+        elif isinstance(value, Tensor):
+            if context.get_context("mode") == context.PYNATIVE_MODE:
+                if name in self.__dict__:
+                    del self.__dict__[name]
+                tensor_list[name] = value
+            else:
+                object.__setattr__(self, name, value)
        else:
            if isinstance(value, Primitive):
                value.set_prim_instance_name(name)
@ -570,11 +583,9 @@ class Cell(Cell_):
        """
        if hasattr(self, "_mindspore_flags"):
            if self._mindspore_flags.get('fp16'):
-                param.cast_type = mstype.float16
-            elif self._mindspore_flags.get('fp32'):
-                param.cast_type = mstype.float32
-            else:
-                param.cast_type = None
+                param.set_cast_dtype(mstype.float16)
+            if self._mindspore_flags.get('fp32'):
+                param.set_cast_dtype(mstype.float32)
        return param

    def insert_child_to_cell(self, child_name, child):
--- a/mindspore/ops/primitive.py
+++ b/mindspore/ops/primitive.py
@ -17,7 +17,6 @@
 import inspect
 import copy
 from mindspore.common.api import _wrap_func
-from mindspore.common._register_for_tensor import tensor_operator_registry
 from mindspore import context
 from .._c_expression import Primitive_, real_run_op, prim_type
 from . import signature as sig
@ -496,16 +495,7 @@ def constexpr(fn=None, get_instance=True, name=None):
@_wrap_func
 def _run_op(obj, op_name, args):
    """Single op execution function supported by ge in PyNative mode."""
-    cast = tensor_operator_registry.get("cast")
-    if op_name == "Cast" or obj.update_parameter:
-        cast_args = args
-    else:
-        cast_args = list(args)
-        for idx, arg in enumerate(args):
-            cast_type = getattr(arg, "cast_type", None)
-            if cast_type:
-                cast_args[idx] = cast(arg, cast_type)
-    output = real_run_op(obj, op_name, cast_args)
+    output = real_run_op(obj, op_name, args)
    if not output:
        raise RuntimeError("Pynative run op %s failed!" % op_name)
    if len(output) == 1:
--- a/tests/ut/python/model/test_mix_precision.py
+++ b/tests/ut/python/model/test_mix_precision.py
@ -74,7 +74,7 @@ def test_add_cast_flag():
    net.fc3.to_float(mstype.float32)
    net = train_step_with_loss_warp(net)
    net.set_train()
-    _executor.compile(net, predict, label)
+    net(predict, label)


 def test_add_cast_flag_tensor():
@ -82,7 +82,7 @@ def test_add_cast_flag_tensor():
    net = NetForConcat()
    net.add_flags_recursive(fp16=True)
    net.set_train()
-    _executor.compile(net, x1)
+    net(x1)


 def test_on_momentum():
@ -91,7 +91,7 @@ def test_on_momentum():
    net = LeNet5()
    net = train_step_with_loss_warp(net).to_float(mstype.float16)
    net.set_train()
-    _executor.compile(net, predict, label)
+    net(predict, label)


 def test_data_parallel_with_cast():
@ -134,7 +134,6 @@ def test_nn_prelu():
 class NetForCast(nn.Cell):
    def __init__(self):
        super(NetForCast, self).__init__()
-        self.concat = P.Concat()
        self.x1 = Tensor(1.0, mstype.float32)
        self.x2 = Parameter(Tensor(np.zeros([1, 10]).astype(np.float32)), name='x2')

@ -144,11 +143,10 @@ class NetForCast(nn.Cell):


 def test_cast():
-    context.set_context(save_graphs=True)
    x = Tensor(np.ones([1, 16, 10, 10]).astype(np.float32) * 0.01)
    net = NetForCast()
    net.add_flags_recursive(fp16=True)
-    _executor.compile(net, x)
+    net(x)


 class IRBlockZ(nn.Cell):