move enterprise code to opensource
This commit is contained in:
parent
3008a3cee8
commit
8c6ee47470
|
@ -5,7 +5,7 @@ mindspore.train.OnRequestExit
|
|||
|
||||
响应用户关闭请求,退出训练或推理进程,保存checkpoint和mindir。
|
||||
|
||||
在训练开始前,注册OnRequestExit回调,当用户想要退出训练进程并保存训练数据时,可通过发送注册的退出信号"sig"到训练进程。
|
||||
在训练开始前,注册OnRequestExit回调,当用户想要退出训练进程并保存训练数据时,可通过发送注册的退出信号 `sig` 到训练进程。
|
||||
训练进程执行完当前step后,保存当前训练状态,包括checkpoint和mindir,然后退出训练过程。
|
||||
|
||||
参数:
|
||||
|
@ -17,7 +17,7 @@ mindspore.train.OnRequestExit
|
|||
|
||||
异常:
|
||||
- **ValueError** - `save_ckpt` 不是bool值。
|
||||
- **ValueError** - `save_mindir` 不是字符串。
|
||||
- **ValueError** - `save_mindir` 不是bool值。
|
||||
- **ValueError** - `file_name` 不是字符串。
|
||||
- **ValueError** - `directory` 不是字符串。
|
||||
- **ValueError** - `sig` 不是int值,或者是signal.SIGKILL。
|
||||
|
|
|
@ -26,7 +26,6 @@
|
|||
#include "backend/common/pass/convert_tuple_input_to_dynamic_input.h"
|
||||
#include "backend/common/pass/convert_const_scalar_to_tensor.h"
|
||||
#include "backend/common/pass/convert_attr_to_unify_mindir.h"
|
||||
#include "backend/common/pass/add_training_attr.h"
|
||||
#include "backend/common/pass/optimize_updatestate.h"
|
||||
#include "backend/common/pass/conv_transpose_to_conv_bp.h"
|
||||
#include "backend/common/pass/reduce_sum_optimizer.h"
|
||||
|
@ -71,7 +70,6 @@ void BackendCommonOptimization(const std::shared_ptr<session::KernelGraph> &kern
|
|||
common_pm->AddPass(std::make_shared<ConvertUnusedTupleParaToMakeTuple>());
|
||||
common_pm->AddPass(std::make_shared<ConvertConstScalarToTensor>());
|
||||
common_pm->AddPass(std::make_shared<ConvertTupleInputToDynamicInput>());
|
||||
common_pm->AddPass(std::make_shared<AddTrainingAttr>());
|
||||
common_pm->AddPass(std::make_shared<FlattenConcatFission>());
|
||||
common_pm->AddPass(std::make_shared<AddDropoutAttrs>());
|
||||
optimizer->AddPassManager(common_pm);
|
||||
|
|
|
@ -25,32 +25,11 @@
|
|||
#include "ir/graph_utils.h"
|
||||
#include "backend/common/optimizer/helper.h"
|
||||
#include "include/common/utils/anfalgo.h"
|
||||
#include "utils/ms_context.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace opt {
|
||||
namespace {
|
||||
mindspore::HashMap<std::string, mindspore::HashSet<std::string>> MarkOp{
|
||||
{"LSTM", {"LSTMGradWeight", "LSTMGrad", "LSTMGradData"}}};
|
||||
|
||||
bool CheckOP(const FuncGraphManagerPtr &manager, const AnfNodePtr &cnode, const mindspore::HashSet<std::string> &set) {
|
||||
MS_EXCEPTION_IF_NULL(manager);
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
for (const auto &node_index : manager->node_users()[cnode]) {
|
||||
auto output = node_index.first;
|
||||
MS_EXCEPTION_IF_NULL(output);
|
||||
if (common::AnfAlgo::CheckPrimitiveType(output, prim::kPrimTupleGetItem)) {
|
||||
if (CheckOP(manager, output, set)) {
|
||||
return true;
|
||||
}
|
||||
} else if (output->isa<CNode>()) {
|
||||
auto name = common::AnfAlgo::GetCNodeName(output);
|
||||
if (set.find(name) != set.end()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
void AddAttrTraining(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
|
||||
MS_EXCEPTION_IF_NULL(func_graph);
|
||||
MS_EXCEPTION_IF_NULL(cnode);
|
||||
|
@ -59,9 +38,9 @@ void AddAttrTraining(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
|
|||
if (manager->node_users().find(cnode) == manager->node_users().end()) {
|
||||
return;
|
||||
}
|
||||
auto set = MarkOp[common::AnfAlgo::GetCNodeName(cnode)];
|
||||
if (CheckOP(manager, cnode, set)) {
|
||||
cnode->AddAttr(kAttrIsTraining, MakeValue(true));
|
||||
auto prim = GetCNodePrimitive(cnode);
|
||||
if (prim->HasAttr(kAttrIsTraining)) {
|
||||
cnode->AddAttr(kAttrIsTraining, prim->GetAttr(kAttrIsTraining));
|
||||
} else {
|
||||
cnode->AddAttr(kAttrIsTraining, MakeValue(false));
|
||||
}
|
||||
|
@ -78,8 +57,7 @@ const AnfNodePtr AddTrainingAttr::Process(const FuncGraphPtr &func_graph, const
|
|||
return nullptr;
|
||||
}
|
||||
auto name = common::AnfAlgo::GetCNodeName(node);
|
||||
auto iter = MarkOp.find(name);
|
||||
if (iter == MarkOp.end()) {
|
||||
if (name != prim::kPrimLstm->name()) {
|
||||
return nullptr;
|
||||
}
|
||||
auto cnode = node->cast<CNodePtr>();
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
#include "backend/common/pass/communication_op_fusion.h"
|
||||
#include "backend/common/pass/replace_node_by_proxy.h"
|
||||
#include "backend/common/pass/erase_visit_attr.h"
|
||||
#include "backend/common/pass/add_training_attr.h"
|
||||
#include "backend/common/pass/insert_tensor_move_for_communication.h"
|
||||
#include "common/graph_kernel/adapter/graph_kernel_optimization.h"
|
||||
#include "common/graph_kernel/adapter/expander.h"
|
||||
|
@ -180,6 +181,7 @@ void CPUKernelExecutor::OptimizeGraphImpl(const KernelGraphPtr &graph) const {
|
|||
pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
|
||||
pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
|
||||
pm->AddPass(std::make_shared<opt::InsertTensorMoveForCommunication>());
|
||||
pm->AddPass(std::make_shared<opt::AddTrainingAttr>());
|
||||
optimizer->AddPassManager(pm);
|
||||
(void)optimizer->Optimize(graph);
|
||||
graph->SetExecOrderByDefault();
|
||||
|
|
|
@ -222,7 +222,7 @@ template <typename T>
|
|||
template <typename Op>
|
||||
void ArithLogicCpuTypeFunc<T>::BinaryOp(const T *input1, const T *input2, bool *out, Op op) {
|
||||
int64_t input1_size = 1;
|
||||
int64_t input2_size = 2;
|
||||
int64_t input2_size = 1;
|
||||
|
||||
for (size_t i = 0; i < output_shape_.size(); i++) {
|
||||
input1_size *= input_shape1_[i];
|
||||
|
@ -270,7 +270,7 @@ template <typename T>
|
|||
template <typename Op>
|
||||
void ArithComplexLogicCpuTypeFunc<T>::BinaryOp(const T *input1, const T *input2, bool *out, Op op) {
|
||||
int64_t input1_size = 1;
|
||||
int64_t input2_size = 2;
|
||||
int64_t input2_size = 1;
|
||||
|
||||
for (size_t i = 0; i < output_shape_.size(); i++) {
|
||||
input1_size *= input_shape1_[i];
|
||||
|
|
|
@ -76,14 +76,17 @@ bool DropoutCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &in
|
|||
const auto *input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto mask_addr = reinterpret_cast<T *>(outputs[1]->addr);
|
||||
std::random_device rd;
|
||||
std::mt19937 gen(rd());
|
||||
std::bernoulli_distribution dis(keep_prob_);
|
||||
T scale = static_cast<T>(1.f / keep_prob_);
|
||||
for (uint64_t i = 0; i < tensor_size_; ++i) {
|
||||
mask_addr[i] = static_cast<T>(dis(gen));
|
||||
std::random_device rd;
|
||||
std::default_random_engine generator(rd());
|
||||
std::uniform_real_distribution<float> uniform(0.f, 1.f);
|
||||
auto task = [input_addr, output_addr, mask_addr, scale, &uniform, &generator, this](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
mask_addr[i] = static_cast<T>(uniform(generator) < keep_prob_);
|
||||
output_addr[i] = mask_addr[i] * input_addr[i] * scale;
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, tensor_size_, this, &parallel_search_info_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ class DropoutCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper<
|
|||
|
||||
ShapeVector input_shape_;
|
||||
float keep_prob_{0.0};
|
||||
uint64_t tensor_size_{1};
|
||||
size_t tensor_size_{1};
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -27,6 +27,14 @@ constexpr auto kFlattenGrad = "FlattenGrad";
|
|||
constexpr auto kExpandDims = "ExpandDims";
|
||||
constexpr auto kSqueeze = "Squeeze";
|
||||
} // namespace
|
||||
|
||||
bool MemcpyCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &,
|
||||
const std::vector<KernelTensorPtr> &) {
|
||||
MS_EXCEPTION_IF_NULL(base_operator);
|
||||
kernel_name_ = base_operator->name();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool MemcpyCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
if (inputs.empty()) {
|
||||
|
@ -40,11 +48,19 @@ bool MemcpyCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs, c
|
|||
if (inputs[0]->addr == outputs[0]->addr) {
|
||||
return true;
|
||||
}
|
||||
size_t copy_size = outputs[0]->size;
|
||||
auto ret = memcpy_s(outputs[0]->addr, copy_size, inputs[0]->addr, copy_size);
|
||||
if (ret != 0) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', memcpy_s error. Error no: " << ret;
|
||||
const auto *input_addr = reinterpret_cast<unsigned char *>(inputs[0]->addr);
|
||||
auto *output_addr = reinterpret_cast<unsigned char *>(outputs[0]->addr);
|
||||
int cp_ret = EOK;
|
||||
auto task = [input_addr, output_addr, &cp_ret](size_t start, size_t end) {
|
||||
auto ret = memcpy_s(output_addr + start, end - start, input_addr + start, end - start);
|
||||
if (ret != EOK && cp_ret == EOK) {
|
||||
cp_ret = ret;
|
||||
}
|
||||
};
|
||||
if (cp_ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", memcpy error, errorno: " << cp_ret;
|
||||
}
|
||||
ParallelLaunchAutoSearch(task, outputs[0]->size, this, &parallel_search_info_);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -33,11 +33,7 @@ class MemcpyCpuKernelMod : public NativeCpuKernelMod {
|
|||
~MemcpyCpuKernelMod() override = default;
|
||||
|
||||
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &,
|
||||
const std::vector<KernelTensorPtr> &) override {
|
||||
MS_EXCEPTION_IF_NULL(base_operator);
|
||||
kernel_name_ = base_operator->name();
|
||||
return true;
|
||||
}
|
||||
const std::vector<KernelTensorPtr> &) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
|
|
@ -36,7 +36,14 @@ constexpr size_t kAddNOutputsNum = 1;
|
|||
void AddInt(const int *in_0, const int *in_1, int *out, int start, int end) {
|
||||
int ret = ElementAddInt(in_0 + start, in_1 + start, out + start, end - start);
|
||||
if (ret != NNACL_OK) {
|
||||
MS_LOG(EXCEPTION) << "Add failed.";
|
||||
MS_LOG(EXCEPTION) << "For 'AddN', AddInt failed.";
|
||||
}
|
||||
}
|
||||
|
||||
void AddFloat(const float *in_0, const float *in_1, float *out, int start, int end) {
|
||||
int ret = ElementAdd(in_0 + start, in_1 + start, out + start, end - start);
|
||||
if (ret != NNACL_OK) {
|
||||
MS_LOG(EXCEPTION) << "For 'AddN', AddFloat failed.";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -66,73 +73,35 @@ bool AddNCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
|
|||
return false;
|
||||
}
|
||||
kernel_func_ = func_list_[index].second;
|
||||
dtype_ = inputs[kIndex0]->GetDtype();
|
||||
return true;
|
||||
}
|
||||
|
||||
int AddNCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs,
|
||||
const std::map<uint32_t, tensor::TensorPtr> &inputsOnHost) {
|
||||
if (auto ret = KernelMod::Resize(base_operator, inputs, outputs, inputsOnHost); ret != KRET_OK) {
|
||||
return ret;
|
||||
}
|
||||
auto src0_shape = inputs[kIndex0]->GetDeviceShapeAdaptively();
|
||||
auto src1_shape = inputs[kIndex1]->GetDeviceShapeAdaptively();
|
||||
auto dst_shape = outputs[kIndex0]->GetDeviceShapeAdaptively();
|
||||
dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
|
||||
dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
|
||||
dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
|
||||
auto desc = CreateDesc<dnnl::binary::desc>(dnnl::algorithm::binary_add, src0_mem_desc, src1_mem_desc, dst_mem_desc);
|
||||
auto prim_desc = CreateDesc<dnnl::binary::primitive_desc>(desc, engine_);
|
||||
primitive_ = CreatePrimitive<dnnl::binary>(prim_desc);
|
||||
AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
|
||||
AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
|
||||
AddArgument(DNNL_ARG_DST, dst_mem_desc);
|
||||
return KRET_OK;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool AddNCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), input_num_, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAddNOutputsNum, kernel_name_);
|
||||
if (dtype_ == kNumberTypeFloat32) {
|
||||
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
|
||||
ExecutePrimitive();
|
||||
for (size_t index = 2; index < input_num_; ++index) {
|
||||
SetArgumentHandle(DNNL_ARG_SRC_0, outputs[0]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[index]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
|
||||
ExecutePrimitive();
|
||||
}
|
||||
} else if (dtype_ == kNumberTypeInt32) {
|
||||
size_t elements_num = outputs[0]->size / sizeof(int);
|
||||
const auto input_0 = reinterpret_cast<int *>(inputs[0]->addr);
|
||||
const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
|
||||
auto output = reinterpret_cast<int *>(outputs[0]->addr);
|
||||
auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunchAutoSearch(task_0, elements_num, this, &parallel_search_info_);
|
||||
for (size_t index = 2; index < input_num_; ++index) {
|
||||
const auto input = reinterpret_cast<int *>(inputs[index]->addr);
|
||||
auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunchAutoSearch(task, elements_num, this, &parallel_search_info_);
|
||||
}
|
||||
std::function<void(const T *, const T *, T *, int, int)> comput_func;
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
comput_func = AddFloat;
|
||||
} else if constexpr (std::is_same<T, int>::value) {
|
||||
comput_func = AddInt;
|
||||
} else {
|
||||
comput_func = AddT<T>;
|
||||
}
|
||||
|
||||
size_t elements_num = outputs[0]->size / sizeof(T);
|
||||
const auto input_0 = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
const auto input_1 = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
auto task_0 = std::bind(AddT<T>, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
|
||||
auto task_0 = std::bind(comput_func, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunchAutoSearch(task_0, elements_num, this, &parallel_search_info_);
|
||||
for (size_t index = 2; index < input_num_; ++index) {
|
||||
const auto input = reinterpret_cast<T *>(inputs[index]->addr);
|
||||
auto task = std::bind(AddT<T>, input, output, output, std::placeholders::_1, std::placeholders::_2);
|
||||
auto task = std::bind(comput_func, input, output, output, std::placeholders::_1, std::placeholders::_2);
|
||||
ParallelLaunchAutoSearch(task, elements_num, this, &parallel_search_info_);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,9 +32,6 @@ class AddNCpuKernelMod : public MKLCpuKernelMod {
|
|||
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs) override;
|
||||
|
||||
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
|
||||
|
||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) override {
|
||||
return kernel_func_(this, inputs, workspace, outputs);
|
||||
|
@ -54,7 +51,6 @@ class AddNCpuKernelMod : public MKLCpuKernelMod {
|
|||
|
||||
size_t input_num_{0};
|
||||
std::vector<size_t> output_shape_;
|
||||
TypeId dtype_{kNumberTypeFloat32};
|
||||
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -50,18 +50,17 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
|
|||
<< kLstmOutputsNum << ", but get " << inputs.size() << " and " << outputs.size();
|
||||
return false;
|
||||
}
|
||||
|
||||
auto kernel_ptr = std::dynamic_pointer_cast<ops::LSTM>(base_operator);
|
||||
if (!kernel_ptr) {
|
||||
MS_LOG(ERROR) << "Cast LSTM ops failed!";
|
||||
return false;
|
||||
}
|
||||
|
||||
bidirectional_ = kernel_ptr->get_bidirectional();
|
||||
input_size_ = kernel_ptr->get_input_size();
|
||||
hidden_size_ = kernel_ptr->get_hidden_size();
|
||||
num_layers_ = kernel_ptr->get_num_layers();
|
||||
has_bias_ = kernel_ptr->get_has_bias();
|
||||
|
||||
constexpr int kBidirectional = 2;
|
||||
num_directions_ = 1;
|
||||
if (bidirectional_) {
|
||||
|
@ -74,22 +73,18 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
|
|||
if (num_layers_ > kMaxLSTMLayer) {
|
||||
MS_LOG(EXCEPTION) << "Layers must be lower than 100!";
|
||||
}
|
||||
|
||||
for (int i = 0; i < num_layers_; ++i) {
|
||||
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
|
||||
weight_h_size_ += gate_size * hidden_size_;
|
||||
}
|
||||
weight_size_ = weight_size_ * num_directions_;
|
||||
weight_h_size_ = weight_h_size_ * num_directions_;
|
||||
|
||||
weights_dims_ = {num_layers_, num_directions_, input_size_, kGateNum, hidden_size_};
|
||||
weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, kGateNum, hidden_size_};
|
||||
bias_dims_ = {num_layers_, num_directions_, kGateNum, hidden_size_};
|
||||
|
||||
if (base_operator->HasAttr(kAttrIsTraining)) {
|
||||
is_training_ = GetValue<bool>(base_operator->GetAttr(kAttrIsTraining));
|
||||
} else {
|
||||
is_training_ = true;
|
||||
}
|
||||
is_training_ =
|
||||
base_operator->HasAttr(kAttrIsTraining) ? GetValue<bool>(base_operator->GetAttr(kAttrIsTraining)) : true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -111,8 +106,6 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
|
|||
if (num_directions_ * num_layers_ != src_h_shape[0]) {
|
||||
MS_LOG(EXCEPTION) << "Error iteration shape!";
|
||||
}
|
||||
|
||||
auto eng = engine_;
|
||||
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
|
||||
if (bidirectional_) {
|
||||
direction = dnnl::rnn_direction::bidirectional_concat;
|
||||
|
@ -131,27 +124,22 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
|
|||
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
|
||||
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
|
||||
|
||||
auto prop_kind = dnnl::prop_kind::forward_training;
|
||||
if (!is_training_) {
|
||||
prop_kind = dnnl::prop_kind::forward_inference;
|
||||
}
|
||||
auto prop_kind = is_training_ ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_inference;
|
||||
auto weights_desc = formatted_md(weights_dims_, tag::any);
|
||||
auto weights_h_desc = formatted_md(weights_h_dims_, tag::any);
|
||||
auto desc =
|
||||
CreatePrimitive<dnnl::lstm_forward::desc>(prop_kind, direction, src_desc, src_h_desc, src_c_desc, weights_desc,
|
||||
weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
prim_desc_ = CreateDesc<dnnl::lstm_forward::primitive_desc>(*desc, eng);
|
||||
prim_desc_ = CreateDesc<dnnl::lstm_forward::primitive_desc>(*desc, engine_);
|
||||
primitive_ = CreatePrimitive<dnnl::lstm_forward>(prim_desc_);
|
||||
auto weights_layer = GetWeightsLayerDesc(prim_desc_);
|
||||
auto weights_iter = GetWeightsIterDesc(prim_desc_);
|
||||
bias_desc_ = GetBiasDesc(prim_desc_);
|
||||
if (is_training_) {
|
||||
auto wksp_desc = GetWorkspaceDesc(prim_desc_);
|
||||
reserve_size_ = GetSize(wksp_desc);
|
||||
AddArgument(DNNL_ARG_WORKSPACE, wksp_desc);
|
||||
} else {
|
||||
reserve_size_ = 1;
|
||||
}
|
||||
auto weights_layer = GetWeightsLayerDesc(prim_desc_);
|
||||
auto weights_iter = GetWeightsIterDesc(prim_desc_);
|
||||
bias_desc_ = GetBiasDesc(prim_desc_);
|
||||
AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
|
||||
|
@ -164,11 +152,11 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve
|
|||
|
||||
auto weights_dims_desc = CreateDesc<dnnl::memory::desc>(weights_dims_, dt::f32, tag::ldgoi);
|
||||
auto weights_h_dims_desc = CreateDesc<dnnl::memory::desc>(weights_h_dims_, dt::f32, tag::ldgoi);
|
||||
user_weights_memory_ = CreateDesc<dnnl::memory>(weights_dims_desc, eng);
|
||||
user_weights_h_memory_ = CreateDesc<dnnl::memory>(weights_h_dims_desc, eng);
|
||||
weights_memory_ = CreateDesc<dnnl::memory>(weights_layer, eng);
|
||||
weights_h_memory_ = CreateDesc<dnnl::memory>(weights_iter, eng);
|
||||
bias_memory_ = CreateDesc<dnnl::memory>(bias_desc_, eng);
|
||||
user_weights_memory_ = CreateDesc<dnnl::memory>(weights_dims_desc, engine_);
|
||||
user_weights_h_memory_ = CreateDesc<dnnl::memory>(weights_h_dims_desc, engine_);
|
||||
weights_memory_ = CreateDesc<dnnl::memory>(weights_layer, engine_);
|
||||
weights_h_memory_ = CreateDesc<dnnl::memory>(weights_iter, engine_);
|
||||
bias_memory_ = CreateDesc<dnnl::memory>(bias_desc_, engine_);
|
||||
|
||||
InitOutputSize(outputs);
|
||||
return KRET_OK;
|
||||
|
@ -200,7 +188,7 @@ bool LstmCpuKernelMod::Launch(const std::vector<kernel::AddressPtr> &inputs, con
|
|||
SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DST_ITER_C, outputs[2]->addr);
|
||||
if (is_training_) {
|
||||
SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[3]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_WORKSPACE, outputs[kOutputWorkSpaceIndex]->addr);
|
||||
}
|
||||
ExecutePrimitive();
|
||||
return true;
|
||||
|
|
|
@ -69,7 +69,7 @@ class LstmCpuKernelMod : public MKLCpuKernelMod {
|
|||
bool bidirectional_{false};
|
||||
bool has_bias_{false};
|
||||
bool is_training_{false};
|
||||
size_t reserve_size_{0};
|
||||
size_t reserve_size_{1};
|
||||
|
||||
dnnl::memory::dims weights_dims_;
|
||||
dnnl::memory::dims weights_h_dims_;
|
||||
|
|
|
@ -40,7 +40,6 @@ constexpr int kDstIterCIdx = 6;
|
|||
constexpr int kDiffDstLayerIdx = 7;
|
||||
constexpr int kDiffDstIterIdx = 8;
|
||||
constexpr int kDiffDstIterCIdx = 9;
|
||||
constexpr int kWorkspaceIdx = 10;
|
||||
constexpr int kNumberOne = 1;
|
||||
constexpr int kNumberTwo = 2;
|
||||
constexpr int kNumberFour = 4;
|
||||
|
@ -154,8 +153,7 @@ void LSTMGradCpuKernelMod::InitDnnl() {
|
|||
primitive_ = CreatePrimitive<dnnl::lstm_backward>(prim_backward_desc_);
|
||||
auto wksp_desc = GetWorkspaceDesc(prim_forward_desc);
|
||||
reserve_size_ = GetSize(wksp_desc);
|
||||
AddArgument(DNNL_ARG_WORKSPACE, wksp_desc);
|
||||
AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
|
||||
AddArgumentOp(src_desc, src_h_desc, src_c_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc, wksp_desc);
|
||||
|
||||
// construct fw memory
|
||||
weights_layer_desc_ = GetWeightsLayerDesc(prim_backward_desc_);
|
||||
|
@ -183,7 +181,7 @@ void LSTMGradCpuKernelMod::InitDnnl() {
|
|||
void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
|
||||
const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
|
||||
const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
|
||||
const dnnl::memory::desc &dst_c_desc) {
|
||||
const dnnl::memory::desc &dst_c_desc, const dnnl::memory::desc &wksp_desc) {
|
||||
AddArgument(DNNL_ARG_SRC_LAYER, src_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER, src_h_desc);
|
||||
AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc);
|
||||
|
@ -202,6 +200,7 @@ void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, con
|
|||
AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc);
|
||||
AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc);
|
||||
AddArgument(DNNL_ARG_DIFF_DST_ITER_C, dst_c_desc);
|
||||
AddArgument(DNNL_ARG_WORKSPACE, wksp_desc);
|
||||
}
|
||||
|
||||
void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
|
||||
|
@ -215,7 +214,7 @@ void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vector<kernel::Address
|
|||
SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[kDstLayerIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[kDstIterIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DST_ITER_C, inputs[kDstIterCIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[kWorkspaceIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_WORKSPACE, inputs[kInputWorkSpaceIndex]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DIFF_SRC_LAYER, outputs[kSrcLayerIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER, outputs[kSrcIterIdx]->addr);
|
||||
SetArgumentHandle(DNNL_ARG_DIFF_SRC_ITER_C, outputs[kSrcIterCIdx]->addr);
|
||||
|
|
|
@ -64,7 +64,7 @@ class LSTMGradCpuKernelMod : public MKLCpuKernelMod {
|
|||
void AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc,
|
||||
const dnnl::memory::desc &src_c_desc, const dnnl::memory::desc &bias_desc,
|
||||
const dnnl::memory::desc &dst_desc, const dnnl::memory::desc &dst_h_desc,
|
||||
const dnnl::memory::desc &dst_c_desc);
|
||||
const dnnl::memory::desc &dst_c_desc, const dnnl::memory::desc &wksp_desc);
|
||||
void SetArgumentHandleOp(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &outputs);
|
||||
void ResetMemory(const dnnl::memory &mem, const string name) const;
|
||||
|
|
|
@ -123,10 +123,10 @@ template <typename T>
|
|||
bool UnpackCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
const void *input = reinterpret_cast<void *>(inputs[0]->addr);
|
||||
void **outputs_host = reinterpret_cast<void **>(workspace[0]->addr);
|
||||
const auto *input = reinterpret_cast<unsigned char *>(inputs[0]->addr);
|
||||
auto **outputs_host = reinterpret_cast<unsigned char **>(workspace[0]->addr);
|
||||
for (size_t i = 0; i < outputs.size(); i++) {
|
||||
outputs_host[i] = reinterpret_cast<T *>(outputs[i]->addr);
|
||||
outputs_host[i] = reinterpret_cast<unsigned char *>(outputs[i]->addr);
|
||||
}
|
||||
|
||||
size_t total_size = input_size_ * sizeof(T);
|
||||
|
@ -135,7 +135,26 @@ bool UnpackCpuKernelMod::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
|||
<< total_size << " bytes";
|
||||
}
|
||||
int data_size = SizeToInt(sizeof(T));
|
||||
Unstack(input, outputs_host, &unstack_param_, data_size);
|
||||
int copy_size = unstack_param_.after_dims_ * data_size;
|
||||
int cp_ret = EOK;
|
||||
auto task = [this, input, outputs_host, data_size, copy_size, &cp_ret](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
int n = i / unstack_param_.axis_dim_;
|
||||
int c = i % unstack_param_.axis_dim_;
|
||||
int in_offset = n * unstack_param_.axis_dim_ * unstack_param_.after_dims_ + c * unstack_param_.after_dims_;
|
||||
int out_offset = n * unstack_param_.after_dims_;
|
||||
auto ret =
|
||||
memcpy_s(outputs_host[c] + out_offset * data_size, copy_size, input + in_offset * data_size, copy_size);
|
||||
if (ret != EOK && cp_ret == EOK) {
|
||||
cp_ret = ret;
|
||||
}
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, IntToSize(unstack_param_.num_ * unstack_param_.pre_dims_), this,
|
||||
&parallel_search_info_);
|
||||
if (cp_ret != EOK) {
|
||||
MS_LOG(EXCEPTION) << "For " << kernel_name_ << ", memcpy error, errorno: " << cp_ret;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -36,7 +36,6 @@ bool BatchNormGradGradGpuKernelMod::Init(const BaseOperatorPtr &base_operator,
|
|||
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
|
||||
if (!is_match) {
|
||||
MS_LOG(EXCEPTION) << kernel_name_ << " does not support this kernel data type: " << kernel_attr;
|
||||
return false;
|
||||
}
|
||||
execute_func_ = func_list_[index].second;
|
||||
is_training_ = op->get_is_training();
|
||||
|
|
|
@ -856,7 +856,8 @@ class Parser:
|
|||
attr = 'source'
|
||||
try:
|
||||
source = inspect.getsourcelines(self.fn)
|
||||
if context.get_context('support_binary') and '/mindspore/' not in self.filename and \
|
||||
if context.get_context('support_binary') and \
|
||||
'/mindspore/' not in self.filename and '\\mindspore\\' not in self.filename and \
|
||||
(not hasattr(self.fn, attr) or getattr(self.fn, attr) != source):
|
||||
if not os.access(self.filename, os.W_OK):
|
||||
raise PermissionError(f"Don't have the write permission on the file {self.filename}.")
|
||||
|
|
|
@ -1213,6 +1213,7 @@ def get_bprop_lstm(self):
|
|||
return dx, dhx, dcx, dw
|
||||
|
||||
if context.get_context('device_target') == "CPU":
|
||||
self.add_prim_attr("is_training", True)
|
||||
return bprop_cpu
|
||||
|
||||
return bprop
|
||||
|
|
|
@ -48,7 +48,7 @@ def test_cpu_profiling():
|
|||
if os.path.isdir(data_path):
|
||||
shutil.rmtree(data_path)
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
|
||||
device_id = context.get_context("device_id")
|
||||
rank_id = int(os.getenv('RANK_ID')) if os.getenv('RANK_ID') else 0
|
||||
profiler = Profiler(output_path="data_cpu_profiler")
|
||||
x = np.random.randn(1, 3, 3, 4).astype(np.float32)
|
||||
y = np.random.randn(1, 3, 3, 4).astype(np.float32)
|
||||
|
@ -60,9 +60,9 @@ def test_cpu_profiling():
|
|||
assert len(os.listdir(data_path)) == 1
|
||||
|
||||
profiler_dir = os.path.join(data_path, f"{os.listdir(data_path)[0]}/")
|
||||
op_detail_file = f"{profiler_dir}cpu_op_detail_info_{device_id}.csv"
|
||||
op_type_file = f"{profiler_dir}cpu_op_type_info_{device_id}.csv"
|
||||
timeline_file = f"{profiler_dir}cpu_op_execute_timestamp_{device_id}.txt"
|
||||
op_detail_file = f"{profiler_dir}cpu_op_detail_info_{rank_id}.csv"
|
||||
op_type_file = f"{profiler_dir}cpu_op_type_info_{rank_id}.csv"
|
||||
timeline_file = f"{profiler_dir}cpu_op_execute_timestamp_{rank_id}.txt"
|
||||
cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
|
||||
for file in cpu_profiler_files:
|
||||
assert os.path.isfile(file)
|
||||
|
|
|
@ -24,7 +24,7 @@ import numpy as np
|
|||
import pytest
|
||||
|
||||
from mindspore import Model
|
||||
from mindspore import nn
|
||||
from mindspore import nn, context
|
||||
from mindspore import dataset as ds
|
||||
from mindspore.common.initializer import TruncatedNormal
|
||||
from mindspore.train.callback import Callback, OnRequestExit, LossMonitor
|
||||
|
@ -136,6 +136,7 @@ def test_on_request_exit_callback():
|
|||
Expectation: When a signal received,
|
||||
the train process should be stopped and save the ckpt and mindir should be saved.
|
||||
"""
|
||||
context.set_context(mode=context.GRAPH_MODE)
|
||||
directory = "./data"
|
||||
if os.path.exists(directory):
|
||||
shutil.rmtree(directory)
|
||||
|
|
Loading…
Reference in New Issue