From 2261cf7bca2064facadf762668c23d2537a77e15 Mon Sep 17 00:00:00 2001 From: suxin Date: Tue, 28 Feb 2023 10:50:56 +0800 Subject: [PATCH] V2 overflow check on Ascend --- .../amp/mindspore.amp.all_finite.rst | 1 - .../amp/mindspore.amp.init_status.rst | 12 -- docs/api/api_python/mindspore.amp.rst | 1 - docs/api/api_python_en/mindspore.amp.rst | 1 - mindspore/ccsrc/include/common/utils/utils.h | 2 + mindspore/ccsrc/kernel/kash/kernel_pack.cc | 94 +++++++--- mindspore/ccsrc/kernel/kernel.h | 8 + .../hal/device/ascend_memory_adapter.cc | 16 +- .../ascend/hal/device/ascend_memory_adapter.h | 2 +- .../device/ascend/hal/device/kernel_adjust.cc | 156 +++++++--------- .../device/ascend/hal/device/kernel_adjust.h | 14 +- .../device/ascend/kernel/hccl/hccl_kernel.cc | 5 +- .../kernel/tbe/dynamic_tbe_kernel_mod.cc | 7 + .../tbe/tbe_json/single_tbe_json_creator.cc | 38 ++++ .../tbe/tbe_json/single_tbe_json_creator.h | 1 + .../ascend/kernel/tbe/tbe_kernel_compile.cc | 20 +- .../ascend/kernel/tbe/tbe_kernel_mod.cc | 78 +++++++- .../device/ascend/kernel/tbe/tbe_kernel_mod.h | 4 + .../device/ascend/kernel/tbe/tbe_utils.cc | 2 + mindspore/core/ops/core_ops.h | 2 + .../core/ops/npu_clear_float_status_v2.cc | 99 ++++++++++ .../core/ops/npu_clear_float_status_v2.h | 39 ++++ mindspore/core/ops/npu_get_float_status_v2.cc | 99 ++++++++++ mindspore/core/ops/npu_get_float_status_v2.h | 39 ++++ .../tbe_compiler/tbe_helper.py | 2 +- mindspore/python/mindspore/amp.py | 48 +---- .../mindspore/boost/boost_cell_wrapper.py | 53 ++++-- .../python/mindspore/nn/wrap/loss_scale.py | 44 +++-- .../mindspore/ops/_op_impl/tbe/__init__.py | 2 + .../_op_impl/tbe/npu_clear_float_status_v2.py | 35 ++++ .../_op_impl/tbe/npu_get_float_status_v2.py | 35 ++++ .../mindspore/ops/operations/math_ops.py | 170 +++++++++++++++++ .../mix_precision/test_mix_precision_func.py | 7 +- tests/st/ops/ascend/test_npu_overflow_v2.py | 175 ++++++++++++++++++ tests/st/train/test_amp_overflow.py | 83 +++++++++ tests/st/train/test_loss_scale_overflow.py | 114 ++++++++++++ tests/ut/cpp/tbe/tbe_json_creator_test.cc | 12 +- 37 files changed, 1280 insertions(+), 240 deletions(-) delete mode 100644 docs/api/api_python/amp/mindspore.amp.init_status.rst create mode 100644 mindspore/core/ops/npu_clear_float_status_v2.cc create mode 100644 mindspore/core/ops/npu_clear_float_status_v2.h create mode 100644 mindspore/core/ops/npu_get_float_status_v2.cc create mode 100644 mindspore/core/ops/npu_get_float_status_v2.h create mode 100644 mindspore/python/mindspore/ops/_op_impl/tbe/npu_clear_float_status_v2.py create mode 100644 mindspore/python/mindspore/ops/_op_impl/tbe/npu_get_float_status_v2.py create mode 100644 tests/st/ops/ascend/test_npu_overflow_v2.py create mode 100644 tests/st/train/test_amp_overflow.py create mode 100644 tests/st/train/test_loss_scale_overflow.py diff --git a/docs/api/api_python/amp/mindspore.amp.all_finite.rst b/docs/api/api_python/amp/mindspore.amp.all_finite.rst index 38c000b9af9..ce7e25820bc 100644 --- a/docs/api/api_python/amp/mindspore.amp.all_finite.rst +++ b/docs/api/api_python/amp/mindspore.amp.all_finite.rst @@ -12,7 +12,6 @@ mindspore.amp.all_finite 参数: - **inputs** (Union(tuple(Tensor), list(Tensor))) - 可迭代的Tensor。 - - **status** (Tensor) - 溢出检测时所需要的初始状态,仅在Ascend需要。默认值:None。 返回: Tensor,布尔类型的标量Tensor。 diff --git a/docs/api/api_python/amp/mindspore.amp.init_status.rst b/docs/api/api_python/amp/mindspore.amp.init_status.rst deleted file mode 100644 index ead21c9020a..00000000000 --- 
a/docs/api/api_python/amp/mindspore.amp.init_status.rst +++ /dev/null @@ -1,12 +0,0 @@ -mindspore.amp.init_status -=========================== - -.. py:function:: mindspore.amp.init_status() - - 初始化溢出状态检测变量。 - - .. note:: - 该接口仅在Ascend后端有效,在GPU、CPU上调用的返回值没有作用。 - - 返回: - Tensor,shape为 (8,) 。 diff --git a/docs/api/api_python/mindspore.amp.rst b/docs/api/api_python/mindspore.amp.rst index ee14fff63c1..669c60bc228 100644 --- a/docs/api/api_python/mindspore.amp.rst +++ b/docs/api/api_python/mindspore.amp.rst @@ -35,5 +35,4 @@ mindspore.amp :nosignatures: :template: classtemplate.rst - mindspore.amp.init_status mindspore.amp.all_finite diff --git a/docs/api/api_python_en/mindspore.amp.rst b/docs/api/api_python_en/mindspore.amp.rst index b738798014f..81f7154953b 100644 --- a/docs/api/api_python_en/mindspore.amp.rst +++ b/docs/api/api_python_en/mindspore.amp.rst @@ -35,5 +35,4 @@ Overflow Detection :nosignatures: :template: classtemplate.rst - mindspore.amp.init_status mindspore.amp.all_finite diff --git a/mindspore/ccsrc/include/common/utils/utils.h b/mindspore/ccsrc/include/common/utils/utils.h index 816d014f148..6ec25185bd6 100644 --- a/mindspore/ccsrc/include/common/utils/utils.h +++ b/mindspore/ccsrc/include/common/utils/utils.h @@ -571,6 +571,8 @@ constexpr auto kNonZeroOpName = "NonZero"; constexpr auto kNPUAllocFloatStatusOpName = "NPUAllocFloatStatus"; constexpr auto kNPUClearFloatStatusOpName = "NPUClearFloatStatus"; constexpr auto kNPUGetFloatStatusOpName = "NPUGetFloatStatus"; +constexpr auto kNPUClearFloatStatusV2OpName = "NPUClearFloatStatusV2"; +constexpr auto kNPUGetFloatStatusV2OpName = "NPUGetFloatStatusV2"; constexpr auto kNthElementOpName = "NthElement"; constexpr auto kOneHotOpName = "OneHot"; constexpr auto kOneHotDOpName = "OneHotD"; diff --git a/mindspore/ccsrc/kernel/kash/kernel_pack.cc b/mindspore/ccsrc/kernel/kash/kernel_pack.cc index bd757b85033..d9c8629532e 100644 --- a/mindspore/ccsrc/kernel/kash/kernel_pack.cc +++ b/mindspore/ccsrc/kernel/kash/kernel_pack.cc @@ -24,14 +24,31 @@ namespace mindspore { namespace kernel { constexpr size_t kJsonSuffixLength = 5; +constexpr char kMagic[] = "magic"; +constexpr char kBlockDim[] = "blockDim"; +constexpr char kKernelName[] = "kernelName"; +constexpr char kBinFileName[] = "binFileName"; +constexpr char kBinFileSuffix[] = "binFileSuffix"; +constexpr char kCoreType[] = "core_type"; +constexpr char kTaskRation[] = "taskRation"; +constexpr char kWorkspace[] = "workspace"; +constexpr char kParameters[] = "parameters"; +constexpr char kOpParaSize[] = "opParaSize"; +constexpr char kSHA256[] = "sha256"; +constexpr char kKBHit[] = "KBHit"; +constexpr char kKernelList[] = "kernelList"; +constexpr char kModeInArgsFirstField[] = "modeInArgsFirstField"; +constexpr char kBatchBindOnly[] = "batchBindOnly"; +constexpr char kArgsRemap[] = "args_remap"; +constexpr char kSize[] = "size"; +constexpr char kGlobalWorkspaceSpecWorkspace[] = "globalworkspace_spec_workspace"; namespace { bool CheckHash(const std::string &json_file, const std::string &bin_file, const nlohmann::json &js) { - if (js.find("sha256") == js.end()) { - MS_LOG(ERROR) << "No sha256 found in " << json_file; + if (js.find(kSHA256) == js.end()) { return false; } std::string sha256_cal = system::sha256::GetHashFromFile(bin_file); - std::string sha256_str = js["sha256"]; + std::string sha256_str = js[kSHA256]; if (sha256_cal.empty() || sha256_cal != sha256_str) { MS_LOG(WARNING) << "Check sha256 for [" << bin_file << "] failed, it will try to rebuild the op."; return false; @@ -154,9 
+171,9 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string & } // cuda json file may have workspace information - if (js.find("workspace") != js.end()) { - auto workspace = js.at("workspace"); - std::vector sizes = workspace.at("size"); + if (js.find(kWorkspace) != js.end()) { + auto workspace = js.at(kWorkspace); + std::vector sizes = workspace.at(kSize); for (auto size : sizes) { kernel_json_info_.workspaces.push_back(size); } @@ -165,7 +182,7 @@ bool KernelPack::ReadFromJsonFile(const std::string &json_f, const std::string & return true; } - std::string binfile_suffix = js["binFileSuffix"]; + std::string binfile_suffix = js[kBinFileSuffix]; std::string bin_f = json_f.substr(0, json_f.length() - kJsonSuffixLength) + binfile_suffix; if (binfile_suffix == ".so") { // change "xx/xx.so" -> "xx/libxx.so" @@ -282,18 +299,18 @@ void KernelPack::ParseWorkSpace(const std::string &key, const nlohmann::json &js } try { auto workspace = js.at(key); - if (workspace.find("num") == workspace.end() || workspace.find("size") == workspace.end()) { + if (workspace.find("num") == workspace.end() || workspace.find(kSize) == workspace.end()) { MS_LOG(WARNING) << "'num' and 'size' are necessary in workspace, but not found. " << js.dump(indent); return; } size_t num = workspace.at("num"); - std::vector sizes = workspace.at("size"); + std::vector sizes = workspace.at(kSize); if (num != sizes.size()) { MS_LOG(WARNING) << "'num' and length of 'size' must be the same. " << js.dump(indent); return; } - if (workspace.find("type") != workspace.end()) { - std::vector type = workspace.at("type"); + if (workspace.find(kType) != workspace.end()) { + std::vector type = workspace.at(kType); if (num != type.size()) { MS_LOG(WARNING) << "'num' and length of 'type' must be the same. 
" << js.dump(indent); return; @@ -383,24 +400,47 @@ void KernelPack::ParseArgsRemap(const std::string &key, const nlohmann::json &js } } +void KernelPack::ParseGlogbleWorkSpace(const std::string &key, const nlohmann::json &js, + KernelJsonInfo *kernel_json_info) { + MS_EXCEPTION_IF_NULL(kernel_json_info); + if (js.find(key) == js.end()) { + return; + } + try { + auto globalWorkspace = js.at(key); + if (globalWorkspace.find(kSize) != globalWorkspace.end()) { + kernel_json_info->global_workspace.size = globalWorkspace.at(kSize); + kernel_json_info->global_workspace.is_overflow = true; + } + if (globalWorkspace.find(kType) != globalWorkspace.end()) { + kernel_json_info->global_workspace.type = globalWorkspace.at(kType); + kernel_json_info->global_workspace.is_overflow = true; + } + } catch (std::exception &e) { + MS_LOG(ERROR) << "Parse json value failed, jsong is:" + js.dump() + ", error info: " << e.what(); + } +} + void KernelPack::ParseKernelJson(const nlohmann::json &js) { using KernelJsonParser = std::function; - const std::map kernel_json_map = {{"magic", ParseMagic}, - {"blockDim", ParseBlockDim}, - {"kernelName", ParseKernelName}, - {"binFileName", ParseBinFileName}, - {"binFileSuffix", ParseBinFileSuffix}, - {"core_type", ParseCoreType}, - {"taskRation", ParseTaskRatio}, - {"workspace", ParseWorkSpace}, - {"parameters", ParseParameters}, - {"opParaSize", ParseOpParaSize}, - {"sha256", ParseSHA256}, - {"KBHit", ParseKBHit}, - {"kernelList", ParseKernelList}, - {"modeInArgsFirstField", ParseModeInArgsFirstField}, - {"batchBindOnly", ParseBatchBindOnly}, - {"args_remap", ParseArgsRemap}}; + const std::map kernel_json_map = { + {kMagic, ParseMagic}, + {kBlockDim, ParseBlockDim}, + {kKernelName, ParseKernelName}, + {kBinFileName, ParseBinFileName}, + {kBinFileSuffix, ParseBinFileSuffix}, + {kCoreType, ParseCoreType}, + {kTaskRation, ParseTaskRatio}, + {kWorkspace, ParseWorkSpace}, + {kParameters, ParseParameters}, + {kOpParaSize, ParseOpParaSize}, + {kSHA256, ParseSHA256}, + {kKBHit, ParseKBHit}, + {kKernelList, ParseKernelList}, + {kModeInArgsFirstField, ParseModeInArgsFirstField}, + {kBatchBindOnly, ParseBatchBindOnly}, + {kArgsRemap, ParseArgsRemap}, + {kGlobalWorkspaceSpecWorkspace, ParseGlogbleWorkSpace}}; auto iter = kernel_json_map.begin(); while (iter != kernel_json_map.end()) { iter->second(iter->first, js, &kernel_json_info_); diff --git a/mindspore/ccsrc/kernel/kernel.h b/mindspore/ccsrc/kernel/kernel.h index d19ea6ffa0f..b74e487779e 100644 --- a/mindspore/ccsrc/kernel/kernel.h +++ b/mindspore/ccsrc/kernel/kernel.h @@ -123,6 +123,12 @@ struct FlexArray { char contents[]; }; +struct GlobalWorkspace { + size_t size; + size_t type; + bool is_overflow = false; +}; + struct KernelJsonInfo { std::string bin_file_name; std::string bin_file_suffix; @@ -133,6 +139,7 @@ struct KernelJsonInfo { std::string sha256; std::vector workspaces_type; std::vector workspaces; + GlobalWorkspace global_workspace; bool has_kernel_list = false; uint32_t op_para_size; int32_t KBHit; @@ -185,6 +192,7 @@ class BACKEND_EXPORT KernelPack { static void ParseModeInArgsFirstField(const std::string &key, const nlohmann::json &js, KernelJsonInfo *kernel_json_info); static void ParseArgsRemap(const std::string &key, const nlohmann::json &js, KernelJsonInfo *kernel_json_info); + static void ParseGlogbleWorkSpace(const std::string &key, const nlohmann::json &js, KernelJsonInfo *kernel_json_info); KernelJsonInfo kernel_json_info_; FlexArray *json_; diff --git 
a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.cc index 91fecb5189c..ccdec764daf 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.cc @@ -19,6 +19,7 @@ #include #include "ir/func_graph.h" #include "runtime/mem.h" +#include "acl/acl_rt.h" #include "utils/ms_context.h" #include "utils/convert_utils_base.h" #include "graphengine/inc/external/runtime/rt_error_codes.h" @@ -35,6 +36,7 @@ constexpr double kHalfRatio = 0.5; // The Ascend max available device memory is 32GB. constexpr float kAscendMaxDeviceMemory = 32; constexpr uint64_t kOverflowAddrSize = 512; +constexpr char kGlobalOverflowWorkspace[] = "GLOBAL_OVERFLOW_WORKSPACE"; size_t AscendMemAdapter::GetRoundDownAlignSize(size_t input_size) { return (input_size / kAscendMemAlignSize) * kAscendMemAlignSize; @@ -180,16 +182,16 @@ uint8_t *AscendMemAdapter::MallocDynamicDevMem(size_t size, const std::string &t return memory_block_ptr; } -uint8_t *AscendMemAdapter::MallocOverflowMem(const CNodePtr &kernel) { +uint8_t *AscendMemAdapter::MallocOverflowMem() { std::lock_guard locker(overflow_mutex_); - auto funcGraph = kernel->func_graph(); - MS_EXCEPTION_IF_NULL(funcGraph); - if (overflow_memory_info_map_.find(funcGraph->ToString()) != overflow_memory_info_map_.cend()) { - return overflow_memory_info_map_.find(funcGraph->ToString())->second; + if (overflow_memory_info_map_.find(kGlobalOverflowWorkspace) != overflow_memory_info_map_.cend()) { + auto addr = overflow_memory_info_map_.find(kGlobalOverflowWorkspace); + return addr->second; } else { - auto overflow_memory_ptr = MallocStaticDevMem(kOverflowAddrSize, "overflow memory ptr"); + auto overflow_memory_ptr = MallocStaticDevMem(kOverflowAddrSize, "global overflow memory ptr"); MS_EXCEPTION_IF_NULL(overflow_memory_ptr); - (void)overflow_memory_info_map_.emplace(funcGraph->ToString(), overflow_memory_ptr); + (void)aclrtMemset(overflow_memory_ptr, kOverflowAddrSize, 0, kOverflowAddrSize); + (void)overflow_memory_info_map_.emplace(kGlobalOverflowWorkspace, overflow_memory_ptr); return overflow_memory_ptr; } } diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.h index 2c96a3b399e..16aa28f95f5 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_memory_adapter.h @@ -39,7 +39,7 @@ class AscendMemAdapter { uint8_t *MallocStaticDevMem(size_t size, const std::string &tag = ""); uint8_t *MallocDynamicDevMem(size_t size, const std::string &tag = ""); - uint8_t *MallocOverflowMem(const CNodePtr &kernel); + uint8_t *MallocOverflowMem(); bool FreeStaticDevMem(void *) const { return true; } void ResetDynamicMemory(); diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.cc index a59c8f3f85b..90b374cc6af 100644 --- a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.cc +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.cc @@ -735,74 +735,56 @@ void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_tra } #endif -CNodePtr KernelAdjust::CreateNPUGetFloatStatus(const std::shared_ptr &kernel_graph_ptr, - const CNodePtr &npu_alloc_cnode) const { +CNodePtr 
KernelAdjust::CreateNPUGetFloatStatusV2(const std::shared_ptr &kernel_graph_ptr, + const AnfNodePtr &status_value_node) const { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - MS_EXCEPTION_IF_NULL(npu_alloc_cnode); - auto npu_get_primitive = std::make_shared(kNPUGetFloatStatusOpName); - std::vector npu_get_inputs = {NewValueNode(npu_get_primitive), npu_alloc_cnode}; + MS_EXCEPTION_IF_NULL(status_value_node); + auto npu_get_primitive = std::make_shared(kNPUGetFloatStatusV2OpName); + std::vector npu_get_inputs = {NewValueNode(npu_get_primitive), status_value_node}; auto npu_get_cnode = kernel_graph_ptr->NewCNode(npu_get_inputs); MS_EXCEPTION_IF_NULL(npu_get_cnode); - npu_alloc_cnode->set_scope(kDefaultScope); - npu_get_cnode->set_abstract(npu_alloc_cnode->abstract()); + status_value_node->set_scope(kDefaultScope); + ShapeVector npu_output_shape = {kNPUShape}; + common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeInt32}, {npu_output_shape}, npu_get_cnode.get()); kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32}); + selected_kernel_builder.SetInputsDeviceType({kNumberTypeInt32}); selected_kernel_builder.SetFusionType(kernel::kPatternOpaque); selected_kernel_builder.SetProcessor(kernel::Processor::AICORE); selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32}); + selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32}); AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_get_cnode.get()); return npu_get_cnode; } -CNodePtr KernelAdjust::CreateNPUClearStatus(const std::shared_ptr &kernel_graph_ptr, - const CNodePtr &npu_alloc_cnode) const { +CNodePtr KernelAdjust::CreateNPUClearStatusV2(const std::shared_ptr &kernel_graph_ptr, + const AnfNodePtr &status_value_node) const { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - MS_EXCEPTION_IF_NULL(npu_alloc_cnode); - auto npu_clear_primitive = std::make_shared(kNPUClearFloatStatusOpName); - std::vector npu_clear_inputs = {NewValueNode(npu_clear_primitive), npu_alloc_cnode}; + MS_EXCEPTION_IF_NULL(status_value_node); + auto npu_clear_primitive = std::make_shared(kNPUClearFloatStatusV2OpName); + std::vector npu_clear_inputs = {NewValueNode(npu_clear_primitive), status_value_node}; auto npu_clear_cnode = kernel_graph_ptr->NewCNode(npu_clear_inputs); MS_EXCEPTION_IF_NULL(npu_clear_cnode); - npu_alloc_cnode->set_scope(kDefaultScope); - npu_clear_cnode->set_abstract(npu_alloc_cnode->abstract()); + status_value_node->set_scope(kDefaultScope); + npu_clear_cnode->set_abstract(status_value_node->abstract()); + ShapeVector npu_output_shape = {kNPUShape}; + common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeInt32}, {npu_output_shape}, npu_clear_cnode.get()); kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32}); + selected_kernel_builder.SetInputsDeviceType({kNumberTypeInt32}); selected_kernel_builder.SetFusionType(kernel::kPatternOpaque); selected_kernel_builder.SetProcessor(kernel::Processor::AICORE); selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32}); + 
selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32}); AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_clear_cnode.get()); return npu_clear_cnode; } -CNodePtr KernelAdjust::CreateNPUAllocStatus(const std::shared_ptr &kernel_graph_ptr) const { - MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - // create npu_alloc_cnode - auto npu_alloc_primitive = std::make_shared(kNPUAllocFloatStatusOpName); - std::vector npu_alloc_inputs = {NewValueNode(npu_alloc_primitive)}; - auto npu_alloc_cnode = kernel_graph_ptr->NewCNode(npu_alloc_inputs); - MS_EXCEPTION_IF_NULL(npu_alloc_cnode); - npu_alloc_cnode->set_scope(kDefaultScope); - ShapeVector npu_output_shape = {kNPUShape}; - common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat32}, {npu_output_shape}, npu_alloc_cnode.get()); - - kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder; - selected_kernel_builder.SetFusionType(kernel::kPatternOpaque); - selected_kernel_builder.SetProcessor(kernel::Processor::AICORE); - selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); - selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32}); - AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_alloc_cnode.get()); - return npu_alloc_cnode; -} - CNodePtr KernelAdjust::CreateAssignAdd(const std::shared_ptr &kernel_graph_ptr, const CNodePtr &npu_alloc_cnode, const AnfNodePtr &specify_para) const { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); @@ -836,39 +818,41 @@ CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr &kernel_graph_ptr, - const AnfNodePtr &specify_para) const { - MS_EXCEPTION_IF_NULL(kernel_graph_ptr); - MS_EXCEPTION_IF_NULL(specify_para); - - std::vector reset(kNPUShape, 0.0); - ShapeVector reset_shape({kNPUShape}); - auto shp_buf_size = sizeof(float) * reset.size(); - auto reset_tensor = std::make_shared(kNumberTypeFloat32, reset_shape, reset.data(), shp_buf_size); - auto reset_value_node = std::make_shared(reset_tensor); - MS_EXCEPTION_IF_NULL(reset_value_node); - reset_value_node->set_abstract(specify_para->abstract()); - kernel_graph_ptr->AddValueNodeToGraph(reset_value_node); +AnfNodePtr KernelAdjust::CreateZerosValueNode(const std::shared_ptr &kernel_graph_ptr) const { + std::vector zeros(kNPUShape, 0); + ShapeVector zeros_shape({kNPUShape}); + auto shp_buf_size = sizeof(int32_t) * zeros.size(); + auto zeros_tensor = std::make_shared(kNumberTypeInt32, zeros_shape, zeros.data(), shp_buf_size); + auto zeros_value_node = std::make_shared(zeros_tensor); + MS_EXCEPTION_IF_NULL(zeros_value_node); + kernel_graph_ptr->AddValueNodeToGraph(zeros_value_node); auto kernel_info = std::make_shared(); MS_EXCEPTION_IF_NULL(kernel_info); - reset_value_node->set_kernel_info(kernel_info); + zeros_value_node->set_kernel_info(kernel_info); kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1; builder1.SetOutputsFormat({kOpFormat_DEFAULT}); - builder1.SetOutputsDeviceType({kNumberTypeFloat32}); - AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), reset_value_node.get()); + builder1.SetOutputsDeviceType({kNumberTypeInt32}); + AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), zeros_value_node.get()); + return zeros_value_node; +} + +CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr &kernel_graph_ptr, + const AnfNodePtr &specify_para, const AnfNodePtr &data) const { + MS_EXCEPTION_IF_NULL(kernel_graph_ptr); + MS_EXCEPTION_IF_NULL(specify_para); auto assign_primitive = std::make_shared(kAssignOpName); - 
std::vector assign_inputs = {NewValueNode(assign_primitive), specify_para, reset_value_node}; + std::vector assign_inputs = {NewValueNode(assign_primitive), specify_para, data}; auto assign_cnode = kernel_graph_ptr->NewCNode(assign_inputs); MS_EXCEPTION_IF_NULL(assign_cnode); assign_cnode->set_scope(kDefaultScope); assign_cnode->set_abstract(specify_para->abstract()); kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder( - {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, TypeId::kNumberTypeFloat32}); + {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32}); selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT}); - selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32}); + selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32}); AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_cnode.get()); std::vector input_names = {"ref", "value"}; @@ -944,7 +928,8 @@ void KernelAdjust::InsertGradientOverflowCheckOperations( MS_EXCEPTION_IF_NULL(kernel_graph_ptr); bool first_grad_op = true; - CNodePtr npu_alloc_cnode; + auto status_value_node = CreateZerosValueNode(kernel_graph_ptr); + auto reset_value_node = CreateZerosValueNode(kernel_graph_ptr); std::vector new_execution_order; auto execution_order = kernel_graph_ptr->execution_order(); for (size_t i = 0; i < execution_order.size() - 1; i++) { @@ -956,39 +941,37 @@ void KernelAdjust::InsertGradientOverflowCheckOperations( if (cur_full_name.find(kGradients) == std::string::npos && next_full_name.find(kGradients) != std::string::npos) { if (first_grad_op) { - npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr); - auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode); - auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para); - AnfAlgo::SetStreamId(next_stream_id, npu_alloc_cnode.get()); + auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node); + auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para, reset_value_node); + AnfAlgo::SetStreamId(next_stream_id, status_value_node.get()); AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get()); AnfAlgo::SetStreamId(next_stream_id, assign_cnode.get()); - new_execution_order.push_back(npu_alloc_cnode); new_execution_order.push_back(npu_clear_cnode); new_execution_order.push_back(assign_cnode); first_grad_op = false; } else { - auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode); + auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node); AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get()); new_execution_order.push_back(npu_clear_cnode); } } if (cur_full_name.find(kGradients) != std::string::npos && next_full_name.find(kGradients) == std::string::npos) { - auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode); - auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para); + auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node); + auto assign_status_node = CreateAssign(kernel_graph_ptr, specify_para, npu_get_cnode); AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get()); - AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get()); + AnfAlgo::SetStreamId(cur_stream_id, assign_status_node.get()); new_execution_order.push_back(npu_get_cnode); - new_execution_order.push_back(assign_add_cnode); + new_execution_order.push_back(assign_status_node); 
} if (i == execution_order.size() - kLastHandleDiff) { new_execution_order.push_back(execution_order[i + 1]); if (next_full_name.find(kGradients) != std::string::npos) { - auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode); - auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para); + auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node); + auto assign_status_node = CreateAssign(kernel_graph_ptr, specify_para, npu_get_cnode); AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get()); - AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get()); + AnfAlgo::SetStreamId(cur_stream_id, assign_status_node.get()); new_execution_order.push_back(npu_get_cnode); - new_execution_order.push_back(assign_add_cnode); + new_execution_order.push_back(assign_status_node); } } } @@ -1030,18 +1013,16 @@ void KernelAdjust::InsertDynamicLossScaleCheckOperations(const std::shared_ptr new_execution_order; int64_t cur_param = static_cast(dynamic_loss_scale_param_list->size()) - 1; - CNodePtr npu_alloc_cnode; + auto status_value_node = CreateZerosValueNode(kernel_graph_ptr); + auto reset_value_node = CreateZerosValueNode(kernel_graph_ptr); std::set viewed_id; for (size_t i = 0; i < execution_order.size(); ++i) { auto cur_node = execution_order[i]; auto cur_stream_id = AnfAlgo::GetStreamId(cur_node); if (common::AnfAlgo::HasNodeAttr(kSplitOverFlow, cur_node) || (i == end_gradient_index)) { if (first_layer_op) { - npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr); - AnfAlgo::SetStreamId(cur_stream_id, npu_alloc_cnode.get()); - (void)new_execution_order.emplace_back(npu_alloc_cnode); for (const auto ¶m : *dynamic_loss_scale_param_list) { - auto assign_cnode = CreateAssign(kernel_graph_ptr, param); + auto assign_cnode = CreateAssign(kernel_graph_ptr, param, reset_value_node); AnfAlgo::SetStreamId(cur_stream_id, assign_cnode.get()); (void)new_execution_order.emplace_back(assign_cnode); } @@ -1055,22 +1036,19 @@ void KernelAdjust::InsertDynamicLossScaleCheckOperations(const std::shared_ptrat(cur_param)); - AnfAlgo::SetStreamId(cur_stream_id, assign_cnode.get()); - (void)new_execution_order.emplace_back(assign_cnode); - } - auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode); + + auto npu_get_cnode = CreateNPUGetFloatStatusV2(kernel_graph_ptr, status_value_node); AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get()); (void)new_execution_order.emplace_back(npu_get_cnode); - auto assign_add_cnode = - CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, dynamic_loss_scale_param_list->at(cur_param)); - AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get()); - (void)new_execution_order.emplace_back(assign_add_cnode); + + auto assign_status_node = + CreateAssign(kernel_graph_ptr, dynamic_loss_scale_param_list->at(cur_param), npu_get_cnode); + AnfAlgo::SetStreamId(cur_stream_id, assign_status_node.get()); + (void)new_execution_order.emplace_back(assign_status_node); (void)viewed_id.insert(cur_param); cur_param--; } - auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode); + auto npu_clear_cnode = CreateNPUClearStatusV2(kernel_graph_ptr, status_value_node); AnfAlgo::SetStreamId(cur_stream_id, npu_clear_cnode.get()); (void)new_execution_order.emplace_back(npu_clear_cnode); } diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.h b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.h index 661f4a7e306..19dcf982f23 100644 --- 
a/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.h +++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/kernel_adjust.h @@ -80,15 +80,15 @@ class KernelAdjust { KernelAdjust() = default; ~KernelAdjust() = default; - CNodePtr CreateNPUGetFloatStatus(const std::shared_ptr &kernel_graph_ptr, - const CNodePtr &npu_alloc_cnode) const; - CNodePtr CreateNPUClearStatus(const std::shared_ptr &kernel_graph_ptr, - const CNodePtr &npu_alloc_cnode) const; - CNodePtr CreateNPUAllocStatus(const std::shared_ptr &kernel_graph_ptr) const; + AnfNodePtr CreateZerosValueNode(const std::shared_ptr &kernel_graph_ptr) const; + CNodePtr CreateNPUGetFloatStatusV2(const std::shared_ptr &kernel_graph_ptr, + const AnfNodePtr &status_value_node) const; + CNodePtr CreateNPUClearStatusV2(const std::shared_ptr &kernel_graph_ptr, + const AnfNodePtr &status_value_node) const; CNodePtr CreateAssignAdd(const std::shared_ptr &kernel_graph_ptr, const CNodePtr &npu_alloc_cnode, const AnfNodePtr &specify_para) const; - CNodePtr CreateAssign(const std::shared_ptr &kernel_graph_ptr, - const AnfNodePtr &specify_para) const; + CNodePtr CreateAssign(const std::shared_ptr &kernel_graph_ptr, const AnfNodePtr &specify_para, + const AnfNodePtr &data) const; void ReorderGetNext(const std::shared_ptr &kernel_graph_ptr) const; CNodePtr CreateStreamSwitchOp(const std::shared_ptr &kernel_graph_ptr, const std::map &switch_loop_input, diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc index f0b4c68f602..4d98568f36e 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc @@ -305,10 +305,11 @@ std::vector HcclKernel::GenTask(const std::vector &inpu } std::vector global_workspace_addr; - auto overflow_memory_ptr = - device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem(anf_node_.lock()->cast()); + auto overflow_memory_ptr = device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem(); MS_EXCEPTION_IF_NULL(overflow_memory_ptr); global_workspace_addr.push_back(reinterpret_cast(overflow_memory_ptr)); + MS_LOG(DEBUG) << "Assign overflow memory for node " << anf_node->fullname_with_scope() << ", addr is " + << reinterpret_cast(overflow_memory_ptr); HcclTaskInfoPtr hcclTaskInfo = std::make_shared(unique_name_, stream_id, hccl::HcclAdapter::GetHcclType(anf_node), input_data_addr, diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/dynamic_tbe_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/dynamic_tbe_kernel_mod.cc index e64235c6f9b..ca67009e322 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/dynamic_tbe_kernel_mod.cc +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/dynamic_tbe_kernel_mod.cc @@ -289,6 +289,13 @@ bool DynamicTbeKernelMod::Launch(const std::vector &inputs, const st runtimeargs.push_back(tiling_data_ptr_); } + AddressPtr overflow_address_ptr = GetOverflowAddress(); + if (overflow_address_ptr != nullptr) { + runtimeargs.emplace_back(overflow_address_ptr->addr); + MS_LOG(DEBUG) << "Assign overflow memory for node " << node->fullname_with_scope() << ", addr is " + << overflow_address_ptr->addr; + } + rtL2Ctrl_t *l2ctrl = nullptr; auto args_size = static_cast(UlongToUint(sizeof(void *)) * runtimeargs.size()); auto node_info = cnode->fullname_with_scope(); diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc 
b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc index 39f65710779..d2239b53db4 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.cc @@ -58,6 +58,43 @@ bool SingleTbeJsonCreator::GenJson(const AnfNodePtr &anf_node, nlohmann::json *k return true; } +void NpuClearV2PostProcessing(const AnfNodePtr &anf_node, std::vector *op_list_json) { + if (op_list_json->size() != 2) { + MS_LOG(ERROR) << "Op list json's size is not equal to 2, abort post processing."; + return; + } + + auto compute_json = (*op_list_json)[1]; + std::vector empty_vector_json; + compute_json[kJInputDesc] = empty_vector_json; + compute_json[kJOutputDataDesc] = empty_vector_json; + compute_json[kJOutputDesc] = empty_vector_json; + op_list_json->clear(); + (*op_list_json).emplace_back(compute_json); + MS_LOG(DEBUG) << "Op list json after post processing:" << compute_json.dump(); +} + +void NpuGetV2PostProcessing(const AnfNodePtr &anf_node, std::vector *op_list_json) { + if (op_list_json->size() != 2) { + MS_LOG(ERROR) << "Op list json's size is not equal to 2, abort post processing."; + return; + } + + auto compute_json = (*op_list_json)[1]; + std::vector empty_vector_json; + compute_json[kJInputDesc] = empty_vector_json; + op_list_json->clear(); + (*op_list_json).emplace_back(compute_json); + MS_LOG(DEBUG) << "Op list json after post processing:" << compute_json.dump(); +} + +void SingleTbeJsonCreator::OpListPostProcessing(const AnfNodePtr &anf_node, std::vector *op_list_json) { + auto kernel_name = common::AnfAlgo::GetCNodeName(anf_node); + if (kernel_name == kNPUClearFloatStatusV2OpName) { + NpuClearV2PostProcessing(anf_node, op_list_json); + } else if (kernel_name == kNPUGetFloatStatusV2OpName) { + NpuGetV2PostProcessing(anf_node, op_list_json); + } +} + bool SingleTbeJsonCreator::GenOpListJson(const AnfNodePtr &anf_node, std::vector *op_list_json) { MS_EXCEPTION_IF_NULL(anf_node); MS_EXCEPTION_IF_NULL(op_list_json); @@ -69,6 +106,7 @@ bool SingleTbeJsonCreator::GenOpListJson(const AnfNodePtr &anf_node, std::vector } GenDataJson(anf_node, compute_json, op_list_json); (*op_list_json).push_back(compute_json); + OpListPostProcessing(anf_node, op_list_json); MS_LOG(DEBUG) << "End."; return true; } diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.h b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.h index 9984977cde6..77c76b65ecd 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_json/single_tbe_json_creator.h @@ -29,6 +29,7 @@ class SingleTbeJsonCreator : public TbeJsonCreator { protected: bool GenOpListJson(const AnfNodePtr &anf_node, std::vector *op_list_json); + void OpListPostProcessing(const AnfNodePtr &anf_node, std::vector *op_list_json); void GenDataJson(const AnfNodePtr &anf_node, const nlohmann::json &compute_json, std::vector *op_list_json) const; virtual void GenInputDescJson(const AnfNodePtr &anf_node, size_t real_input_index, nlohmann::json *input_desc); diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_compile.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_compile.cc index b157473aa33..49cdac9bd72 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_compile.cc +++ 
b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_compile.cc @@ -571,15 +571,17 @@ void TbeKernelCompileManager::Query(const std::string &type) { std::pair, std::vector> TbeKernelCompileManager::GenKernelMod( const std::vector &node_list) { MS_LOG(INFO) << "Gen kernel mod start!"; - std::vector success_node; - std::vector failed_node; + std::vector success_nodes; + std::vector failed_nodes; for (auto &node : node_list) { MS_EXCEPTION_IF_NULL(node); if (AnfAlgo::GetKernelMod(node) != nullptr) { - (void)success_node.emplace_back(node); + (void)success_nodes.emplace_back(node); continue; // kernel mod already exist, continue; } + auto op_name = common::AnfAlgo::GetCNodeName(node); + auto full_name = node->fullname_with_scope(); if (common::AnfAlgo::HasNodeAttr(kAttrOriFusionName, node)) { full_name = common::AnfAlgo::GetNodeAttr(node, kAttrOriFusionName); @@ -592,7 +594,7 @@ std::pair, std::vector> TbeKernelCompileManager: kernel_pack = bin_map->SearchInFile(json_name); if (kernel_pack == nullptr) { MS_LOG(INFO) << "Can not find .json file or the .o file for op:" << json_name << trace::DumpSourceLines(node); - (void)failed_node.emplace_back(node); + (void)failed_nodes.emplace_back(node); continue; } } @@ -612,11 +614,17 @@ std::pair, std::vector> TbeKernelCompileManager: kernel_mod_ptr->SetInputSizeList(iter->second.input_size_list); kernel_mod_ptr->SetOutputSizeList(iter->second.output_size_list); kernel_mod_ptr->SetWorkspaceSizeList(kernel_info_json.workspaces); + if (op_name == kNPUClearFloatStatusV2OpName || op_name == kNPUGetFloatStatusV2OpName) { + constexpr size_t io_byte_size = 32; + const std::vector size_list = {io_byte_size}; + kernel_mod_ptr->SetInputSizeList(size_list); + kernel_mod_ptr->SetOutputSizeList(size_list); + } AnfAlgo::SetKernelMod(kernel_mod_ptr, node.get()); - (void)success_node.emplace_back(node); + (void)success_nodes.emplace_back(node); } MS_LOG(INFO) << "Gen kernel mod end!"; - return std::make_pair(success_node, failed_node); + return std::make_pair(success_nodes, failed_nodes); } void TbeKernelCompileManager::UpdateFusionTypeAndOutputDataDesc(const std::vector &nodes) { diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.cc index 8999bcc2742..4fc2ab5f0d8 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.cc +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.cc @@ -21,12 +21,12 @@ #include "utils/ms_context.h" #include "plugin/device/ascend/hal/device/ge_runtime/task_info.h" #include "runtime/device/kernel_runtime.h" +#include "plugin/device/ascend/hal/device/ascend_memory_adapter.h" namespace mindspore { namespace kernel { using TbeTaskInfoPtr = std::shared_ptr; using tbe::KernelManager; -using AddressPtrList = std::vector; bool TbeKernelMod::Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs, void *stream_ptr) { @@ -60,14 +60,23 @@ bool TbeKernelMod::Launch(const std::vector &inpu return false; } + auto node = anf_node_.lock(); + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + + std::vector real_inputs; + std::vector real_outputs; + GetRealIOAddress(cnode, inputs, outputs, &real_inputs, &real_outputs); + // pack all addresses into a vector. 
std::vector runtimeargs; - (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(runtimeargs), + (void)std::transform(std::begin(real_inputs), std::end(real_inputs), std::back_inserter(runtimeargs), [](const AddressPtr &input) -> void * { MS_EXCEPTION_IF_NULL(input); return input->addr; }); - (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs), + (void)std::transform(std::begin(real_outputs), std::end(real_outputs), std::back_inserter(runtimeargs), [](const AddressPtr &output) -> void * { MS_EXCEPTION_IF_NULL(output); return output->addr; @@ -79,6 +88,14 @@ bool TbeKernelMod::Launch(const std::vector &inpu return addr->addr; }); } + + AddressPtr overflow_address_ptr = GetOverflowAddress(); + if (overflow_address_ptr != nullptr) { + runtimeargs.emplace_back(overflow_address_ptr->addr); + MS_LOG(DEBUG) << "Assign overflow memory for node " << cnode->fullname_with_scope() << ", addr is " + << overflow_address_ptr->addr; + } + rtL2Ctrl_t *l2ctrl = nullptr; const void *stubFunc = reinterpret_cast(func_stub); auto argsSize = static_cast(UlongToUint(sizeof(void *)) * runtimeargs.size()); @@ -106,13 +123,22 @@ std::vector TbeKernelMod::GenTask(const std::vector &in std::vector output_data_addrs; std::vector workspace_addrs; + auto node = anf_node_.lock(); + MS_EXCEPTION_IF_NULL(node); + auto cnode = node->cast(); + MS_EXCEPTION_IF_NULL(cnode); + + std::vector real_inputs; + std::vector real_outputs; + GetRealIOAddress(cnode, inputs, outputs, &real_inputs, &real_outputs); + // pack all addresses into a vector. - (void)std::transform(std::begin(inputs), std::end(inputs), std::back_inserter(input_data_addrs), + (void)std::transform(std::begin(real_inputs), std::end(real_inputs), std::back_inserter(input_data_addrs), [](const AddressPtr &input) -> void * { MS_EXCEPTION_IF_NULL(input); return input->addr; }); - (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(output_data_addrs), + (void)std::transform(std::begin(real_outputs), std::end(real_outputs), std::back_inserter(output_data_addrs), [](const AddressPtr &output) -> void * { MS_EXCEPTION_IF_NULL(output); return output->addr; @@ -125,6 +151,13 @@ std::vector TbeKernelMod::GenTask(const std::vector &in }); } + AddressPtr overflow_address_ptr = GetOverflowAddress(); + if (overflow_address_ptr != nullptr) { + workspace_addrs.emplace_back(overflow_address_ptr->addr); + MS_LOG(DEBUG) << "Assign overflow memory for node " << cnode->fullname_with_scope() << ", addr is " + << overflow_address_ptr->addr; + } + stream_id_ = stream_id; auto funcstub = KernelManager::GenFuncStub(*kernel_pack_, false, &block_dim_, nullptr); if (funcstub == 0) { @@ -146,5 +179,40 @@ vector TbeKernelMod::GenParameters() { auto kernel_json_info = kernel_pack_->kernel_json_info(); return kernel_json_info.parameters; } + +AddressPtr TbeKernelMod::GetOverflowAddress() { + AddressPtr overflow_address_ptr = nullptr; + auto is_overflow = kernel_pack_.get()->kernel_json_info().global_workspace.is_overflow; + if (is_overflow) { + constexpr size_t size = 32; + auto overflow_memory_ptr = device::ascend::AscendMemAdapter::GetInstance().MallocOverflowMem(); + MS_EXCEPTION_IF_NULL(overflow_memory_ptr); + overflow_address_ptr = std::make_shared(); + overflow_address_ptr->addr = reinterpret_cast(overflow_memory_ptr); + overflow_address_ptr->size = size; + } + return overflow_address_ptr; +} + +void TbeKernelMod::GetRealIOAddress(const AnfNodePtr &cnode, const vector &inputs, + const vector &outputs, + 
vector *real_inputs, + vector *real_outputs) const { + auto op_name = common::AnfAlgo::GetCNodeName(cnode); + MS_EXCEPTION_IF_NULL(real_inputs); + MS_EXCEPTION_IF_NULL(real_outputs); + *real_inputs = inputs; + *real_outputs = outputs; + if (op_name == kNPUClearFloatStatusV2OpName) { + // NPUClearFloatStatusV2 has no inputs or outputs. + real_inputs->clear(); + real_outputs->clear(); + MS_LOG(INFO) << "Clear Node " << cnode->fullname_with_scope() << "'s inputs and outputs"; + } else if (op_name == kNPUGetFloatStatusV2OpName) { + // NPUGetFloatStatusV2 has no inputs. + real_inputs->clear(); + MS_LOG(INFO) << "Clear Node " << cnode->fullname_with_scope() << "'s inputs"; + } +} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.h b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.h index 4c64627e139..bdc9ccbd4cb 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.h +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_kernel_mod.h @@ -42,6 +42,10 @@ class TbeKernelMod : public AscendKernelMod { std::vector GenTask(const std::vector &inputs, const std::vector &workspaces, const std::vector &outputs, uint32_t stream_id) override; std::vector GenParameters() override; + AddressPtr GetOverflowAddress(); + void GetRealIOAddress(const AnfNodePtr &cnode, const std::vector &inputs, + const std::vector &outputs, std::vector *real_inputs, + std::vector *real_outputs) const; protected: KernelPackPtr kernel_pack_; diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc index f05e561907d..0da24f06238 100644 --- a/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc +++ b/mindspore/ccsrc/plugin/device/ascend/kernel/tbe/tbe_utils.cc @@ -208,6 +208,8 @@ nlohmann::json TbeUtils::GenSocInfo() { soc_info_json["op_debug_config"] = GetOpDebugConfig(); soc_info_json["autoTilingMode"] = context_ptr->get_param(MS_CTX_TUNE_MODE); soc_info_json["deviceId"] = std::to_string(context_ptr->get_param(MS_CTX_DEVICE_ID)); + soc_info_json["status_check"] = "true"; + std::string config_path; if (!Common::CommonFuncForConfigPath("", common::GetEnv("OP_BANK_PATH"), &config_path)) { MS_LOG(EXCEPTION) << "Invalid environment variable 'OP_BANK_PATH', the path is " << common::GetEnv("OP_BANK_PATH") diff --git a/mindspore/core/ops/core_ops.h b/mindspore/core/ops/core_ops.h index 3f7392ac765..dd02e5680f5 100644 --- a/mindspore/core/ops/core_ops.h +++ b/mindspore/core/ops/core_ops.h @@ -1607,6 +1607,8 @@ GVAR_DEF(PrimitivePtr, kPrimPush, std::make_shared("Push")); GVAR_DEF(PrimitivePtr, kPrimNPUGetFloatStatus, std::make_shared("NPUGetFloatStatus")); GVAR_DEF(PrimitivePtr, kPrimNPUAllocFloatStatus, std::make_shared("NPUAllocFloatStatus")); GVAR_DEF(PrimitivePtr, kPrimNPUClearFloatStatus, std::make_shared("NPUClearFloatStatus")); +GVAR_DEF(PrimitivePtr, kPrimNPUGetFloatStatusV2, std::make_shared("NPUGetFloatStatusV2")); +GVAR_DEF(PrimitivePtr, kPrimNPUClearFloatStatusV2, std::make_shared("NPUClearFloatStatusV2")); GVAR_DEF(PrimitivePtr, kPrimPyFunc, std::make_shared("PyFunc")); GVAR_DEF(PrimitivePtr, kPrimDynamicLossScale, std::make_shared("_DynamicLossScale")); GVAR_DEF(PrimitivePtr, kPrimScaleGrad, std::make_shared("ScaleGrad")); diff --git a/mindspore/core/ops/npu_clear_float_status_v2.cc b/mindspore/core/ops/npu_clear_float_status_v2.cc new file mode 100644 index 00000000000..9045d77a9f5 --- /dev/null +++ 
b/mindspore/core/ops/npu_clear_float_status_v2.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include "ops/npu_clear_float_status_v2.h" +#include "ops/op_utils.h" +#include "abstract/param_validator.h" +#include "utils/check_convert_utils.h" +#include "abstract/ops/primitive_infer_map.h" +#include "mindapi/src/helper.h" + +namespace mindspore { +namespace ops { +namespace { +abstract::ShapePtr NPUClearFloatStatusV2InferShape(const PrimitivePtr &, + const std::vector &input_args) { + auto input_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape]; + // dynamic rank + if (IsDynamicRank(input_shape)) { + return std::make_shared(ShapeVector{abstract::Shape::kShapeRankAny}); + } + // dynamic shape + if (IsDynamic(input_shape)) { + ShapeVector out_shape_dyn; + for (size_t i = 0; i < input_shape.size(); ++i) { + out_shape_dyn.push_back(abstract::Shape::kShapeDimAny); + } + return std::make_shared(out_shape_dyn); + } + const int64_t normal_shape_size = 1; + const int64_t normal_shape_len = 8; + if (input_shape.size() != normal_shape_size) { + MS_EXCEPTION(ValueError) << "Input_x must be a 1-dimensional tensor, but got " << std::to_string(input_shape.size()) + << "-dimensional tensor."; + } + if (input_shape[0] != normal_shape_len) { + MS_EXCEPTION(ValueError) << "The first dimension of input_x must be 8, but got " << std::to_string(input_shape[0]); + } + std::vector output_shape = {normal_shape_len}; + return std::make_shared(output_shape); +} + +TypePtr NPUClearFloatStatusV2InferType(const PrimitivePtr &primitive, const std::vector &input_args) { + std::map types; + std::set valid_types = {kInt32}; + TypePtr input_x_type = input_args[0]->BuildType(); + (void)types.emplace("input_x", input_x_type); + (void)CheckAndConvertUtils::CheckTensorTypeSame(types, valid_types, primitive->name()); + return kInt32; +} +} // namespace +MIND_API_OPERATOR_IMPL(NPUClearFloatStatusV2, BaseOperator); +AbstractBasePtr NPUClearFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, + const std::vector &input_args) { + MS_EXCEPTION_IF_NULL(primitive); + const int64_t input_num = 1; + CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, input_num, primitive->name()); + auto infer_type = NPUClearFloatStatusV2InferType(primitive, input_args); + auto infer_shape = NPUClearFloatStatusV2InferShape(primitive, input_args); + return abstract::MakeAbstract(infer_shape, infer_type); +} + +// AG means auto generated +class MIND_API AGNPUClearFloatStatusV2Infer : public abstract::OpInferBase { + public: + BaseShapePtr InferShape(const PrimitivePtr &primitive, + const std::vector &input_args) const override { + return NPUClearFloatStatusV2InferShape(primitive, input_args); + } + + TypePtr InferType(const PrimitivePtr &primitive, const std::vector &input_args) const override { + return NPUClearFloatStatusV2InferType(primitive, 
input_args); + } + AbstractBasePtr InferShapeAndType(const abstract::AnalysisEnginePtr &engine, const PrimitivePtr &primitive, + const std::vector &input_args) const override { + return NPUClearFloatStatusV2Infer(engine, primitive, input_args); + } +}; + +REGISTER_PRIMITIVE_OP_INFER_IMPL(NPUClearFloatStatusV2, prim::kPrimNPUClearFloatStatusV2, AGNPUClearFloatStatusV2Infer, + false); +} // namespace ops +} // namespace mindspore diff --git a/mindspore/core/ops/npu_clear_float_status_v2.h b/mindspore/core/ops/npu_clear_float_status_v2.h new file mode 100644 index 00000000000..504fcaeadae --- /dev/null +++ b/mindspore/core/ops/npu_clear_float_status_v2.h @@ -0,0 +1,39 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_ +#define MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_ +#include +#include + +#include "ops/base_operator.h" +#include "mindapi/base/types.h" + +namespace mindspore { +namespace ops { +constexpr auto kNameNPUClearFloatStatusV2 = "NPUClearFloatStatusV2"; +class MIND_API NPUClearFloatStatusV2 : public BaseOperator { + public: + MIND_API_BASE_MEMBER(NPUClearFloatStatusV2); + NPUClearFloatStatusV2() : BaseOperator(kNameNPUClearFloatStatusV2) { InitIOName({"addr"}, {"data"}); } + void Init() const {} +}; +MIND_API abstract::AbstractBasePtr NPUClearFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, + const PrimitivePtr &primitive, + const std::vector &input_args); +} // namespace ops +} // namespace mindspore +#endif // MINDSPORE_CORE_OPS_NPU_CLEAR_FLOAT_STATUS_V2_H_ diff --git a/mindspore/core/ops/npu_get_float_status_v2.cc b/mindspore/core/ops/npu_get_float_status_v2.cc new file mode 100644 index 00000000000..18ef1df35af --- /dev/null +++ b/mindspore/core/ops/npu_get_float_status_v2.cc @@ -0,0 +1,99 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include "ops/npu_get_float_status_v2.h" +#include "ops/op_utils.h" +#include "abstract/param_validator.h" +#include "utils/check_convert_utils.h" +#include "abstract/ops/primitive_infer_map.h" +#include "mindapi/src/helper.h" + +namespace mindspore { +namespace ops { +namespace { +abstract::ShapePtr NPUGetFloatStatusV2InferShape(const PrimitivePtr &, const std::vector &input_args) { + auto input_shape = CheckAndConvertUtils::ConvertShapePtrToShapeMap(input_args[0]->BuildShape())[kShape]; + // dynamic rank + if (IsDynamicRank(input_shape)) { + return std::make_shared(ShapeVector{abstract::Shape::kShapeRankAny}); + } + // dynamic shape + if (IsDynamic(input_shape)) { + ShapeVector out_shape_dyn; + for (size_t i = 0; i < input_shape.size(); ++i) { + out_shape_dyn.push_back(abstract::Shape::kShapeDimAny); + } + return std::make_shared(out_shape_dyn); + } + const int64_t normal_shape_size = 1; + const int64_t normal_shape_len = 8; + if (input_shape.size() != normal_shape_size) { + MS_EXCEPTION(ValueError) << "Input_x must be a 1-dimensional tensor, but got " << std::to_string(input_shape.size()) + << "-dimensional tensor."; + } + if (input_shape[0] != normal_shape_len) { + MS_EXCEPTION(ValueError) << "The first dimension of input_x must be 8, but got " << std::to_string(input_shape[0]); + } + + std::vector output_shape = {normal_shape_len}; + return std::make_shared(output_shape); +} + +TypePtr NPUGetFloatStatusV2InferType(const PrimitivePtr &primitive, const std::vector &input_args) { + std::map types; + std::set valid_types = {kInt32}; + TypePtr input_x_type = input_args[0]->BuildType(); + (void)types.emplace("input_x", input_x_type); + (void)CheckAndConvertUtils::CheckTensorTypeSame(types, valid_types, primitive->name()); + return kInt32; +} +} // namespace +MIND_API_OPERATOR_IMPL(NPUGetFloatStatusV2, BaseOperator); +AbstractBasePtr NPUGetFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, + const std::vector &input_args) { + MS_EXCEPTION_IF_NULL(primitive); + const int64_t input_num = 1; + CheckAndConvertUtils::CheckInputArgs(input_args, kEqual, input_num, primitive->name()); + auto infer_type = NPUGetFloatStatusV2InferType(primitive, input_args); + auto infer_shape = NPUGetFloatStatusV2InferShape(primitive, input_args); + return abstract::MakeAbstract(infer_shape, infer_type); +} + +// AG means auto generated +class MIND_API AGNPUGetFloatStatusV2Infer : public abstract::OpInferBase { + public: + BaseShapePtr InferShape(const PrimitivePtr &primitive, + const std::vector &input_args) const override { + return NPUGetFloatStatusV2InferShape(primitive, input_args); + } + + TypePtr InferType(const PrimitivePtr &primitive, const std::vector &input_args) const override { + return NPUGetFloatStatusV2InferType(primitive, input_args); + } + AbstractBasePtr InferShapeAndType(const abstract::AnalysisEnginePtr &engine, const PrimitivePtr &primitive, + const std::vector &input_args) const override { + return NPUGetFloatStatusV2Infer(engine, primitive, input_args); + } +}; + +REGISTER_PRIMITIVE_OP_INFER_IMPL(NPUGetFloatStatusV2, prim::kPrimNPUGetFloatStatusV2, AGNPUGetFloatStatusV2Infer, + false); +} // namespace ops +} // namespace mindspore diff --git a/mindspore/core/ops/npu_get_float_status_v2.h b/mindspore/core/ops/npu_get_float_status_v2.h new file mode 100644 index 00000000000..2857127faa3 --- /dev/null +++ b/mindspore/core/ops/npu_get_float_status_v2.h @@ -0,0 +1,39 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + 
* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_ +#define MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_ +#include +#include + +#include "ops/base_operator.h" +#include "mindapi/base/types.h" + +namespace mindspore { +namespace ops { +constexpr auto kNameNPUGetFloatStatusV2 = "NPUGetFloatStatusV2"; +class MIND_API NPUGetFloatStatusV2 : public BaseOperator { + public: + MIND_API_BASE_MEMBER(NPUGetFloatStatusV2); + NPUGetFloatStatusV2() : BaseOperator(kNameNPUGetFloatStatusV2) { InitIOName({"addr"}, {"data"}); } + void Init() const {} +}; +MIND_API abstract::AbstractBasePtr NPUGetFloatStatusV2Infer(const abstract::AnalysisEnginePtr &, + const PrimitivePtr &primitive, + const std::vector &input_args); +} // namespace ops +} // namespace mindspore +#endif // MINDSPORE_CORE_OPS_NPU_GET_FLOAT_STATUS_V2_H_ diff --git a/mindspore/python/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py b/mindspore/python/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py index b3e2de5c218..94b9f2edc99 100644 --- a/mindspore/python/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py +++ b/mindspore/python/mindspore/_extends/parallel_compile/tbe_compiler/tbe_helper.py @@ -298,7 +298,7 @@ def get_options_info(job_content): options["op_impl_mode_list"] = job_content["SocInfo"]["op_impl_mode_list"] options["kernel_meta_temp_dir"] = job_content["SocInfo"]["kernel_meta_temp_dir"] options["deterministic"] = job_content["SocInfo"]["deterministic"] - options["status_check"] = "false" + options["status_check"] = job_content["SocInfo"]["status_check"] return options diff --git a/mindspore/python/mindspore/amp.py b/mindspore/python/mindspore/amp.py index 0a5b9b59620..a00408a1afc 100644 --- a/mindspore/python/mindspore/amp.py +++ b/mindspore/python/mindspore/amp.py @@ -1,4 +1,4 @@ -# Copyright 2020 Huawei Technologies Co., Ltd +# Copyright 2023 Huawei Technologies Co., Ltd # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,7 +16,8 @@ from __future__ import absolute_import from abc import ABC, abstractmethod - +from mindspore.ops._primitive_cache import _get_cache_prim +from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2 from ._checkparam import Validator as validator from .common import dtype as mstype from . import context @@ -58,34 +59,7 @@ def _overflow(inputs): return 1 - status.all() -def init_status(): - r""" - Returns a Tensor indicating initialized status for overflow detection. - - Note: - Only Ascend need status to capture overflow status, you can also call - this function on GPU or CPU, but the return value is useless. - - Returns: - Tensor, has the shape of `(8,)`. 
- - Supported Platforms: - ``Ascend`` ``GPU`` ``CPU`` - - Examples: - >>> status = amp.init_status() - """ - if _ascend_target(): - status = ops.NPUAllocFloatStatus()() - clear_status = ops.NPUClearFloatStatus()(status) - status = ops.depend(status, clear_status) - else: - status = Tensor([0, 0, 0, 0, 0, 0, 0, 0], mstype.float32) - - return status - - -def all_finite(inputs, status=None): +def all_finite(inputs): r""" Returns a scalar Tensor indicating whether the inputs are finite. @@ -98,8 +72,6 @@ def all_finite(inputs, status=None): Args: inputs (Union(tuple(Tensor), list(Tensor))): a iterable Tensor. - status (Tensor): the status Tensor for overflow detection, only required on - Ascend. Default: None. Returns: Tensor, a scalar Tensor and the dtype is bool. @@ -112,13 +84,13 @@ def all_finite(inputs, status=None): >>> output = amp.all_finite(x) """ if _ascend_target(): - if status is None: - raise ValueError("The status must be initialized on Ascend, but get 'None'.") + status = Tensor([0] * 8, mstype.int32) status = ops.depend(status, inputs) - get_status = ops.NPUGetFloatStatus()(status) + get_status = _get_cache_prim(NPUGetFloatStatusV2)()(status) status = ops.depend(status, get_status) - status_finite = status.sum() == 0 - _ = ops.NPUClearFloatStatus()(status) + clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status) + get_status = ops.depend(get_status, clear_status) + status_finite = get_status.equal(Tensor(0, mstype.int32)).all() return status_finite outputs = _hypermap(_partial(_overflow), inputs) flag_sum = ops.addn(outputs).reshape(()) @@ -329,5 +301,5 @@ class DynamicLossScaler(LossScaler): __all__ = [ "DynamicLossScaleManager", "LossScaleManager", "FixedLossScaleManager", "build_train_network", "DynamicLossScaler", "StaticLossScaler", "LossScaler", - "auto_mixed_precision", "init_status", "all_finite" + "auto_mixed_precision", "all_finite" ] diff --git a/mindspore/python/mindspore/boost/boost_cell_wrapper.py b/mindspore/python/mindspore/boost/boost_cell_wrapper.py index ad740710646..f62d085eaef 100644 --- a/mindspore/python/mindspore/boost/boost_cell_wrapper.py +++ b/mindspore/python/mindspore/boost/boost_cell_wrapper.py @@ -27,6 +27,7 @@ from mindspore.common import Tensor from mindspore.common.sparse_tensor import RowTensorInner from mindspore.common.parameter import Parameter, ParameterTuple from mindspore.nn.wrap.grad_reducer import DistributedGradReducer +from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2 from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.ops import operations as P @@ -460,6 +461,9 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell): self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) self.gpu_target = (context.get_context("device_target") == "GPU") self.loss_scaling_manager = None + self.base0 = Tensor(0, mstype.int32) + self.reduce_all = P.ReduceAll(keep_dims=False) + self.equal = P.Equal() if self.auto_boost.boost_config.get("loss_scale_group", False): self.enable_enhanced_amp = True @@ -535,12 +539,13 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell): bool, overflow value. float, update ratio. 
""" - flag_sum = self.reduce_sum(param, (0,)) + flag_sum = self.equal(self.base0, param) if self.reducer_flag: flag_reduce = self.allreduce(flag_sum) - overflow = self.less_equal(self.base, flag_reduce) + overflow = not self.reduce_all(flag_reduce) else: - overflow = self.less_equal(self.base, flag_sum) + overflow = not self.reduce_all(flag_sum) + if overflow: update_ratio = self.reduce_ratio else: @@ -609,13 +614,11 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell): The second value is the same as the input of `compute_input`, but contains some information about the execution order. """ - status = False + status = Tensor([0] * 8, mstype.int32) if not self.gpu_target: - # init overflow buffer - status = P.NPUAllocFloatStatus()() status = F.depend(status, pre_cond) # clear overflow buffer - clear_status = P.NPUClearFloatStatus()(status) + clear_status = NPUClearFloatStatusV2()(status) compute_input = F.depend(compute_input, clear_status) return status, compute_input @@ -636,22 +639,36 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell): """ if not self.gpu_target: status = F.depend(status, compute_output) - get_status = P.NPUGetFloatStatus()(status) - status = F.depend(status, get_status) - # sum overflow buffer elements, 0:not overflow , >0:overflow - flag_sum = self.reduce_sum(status, (0,)) + get_status = NPUGetFloatStatusV2()(status) + + if self.is_distributed: + # sum overflow flag over devices + flag_reduce = self.allreduce(get_status) + # get_status not equal to [0]*8 means overflow + flag = self.equal(self.base0, flag_reduce) + status = F.depend(status, flag) + clear_status = NPUClearFloatStatusV2()(status) + flag = F.depend(flag, clear_status) + overall_finite = self.reduce_all(flag) + else: + status = F.depend(status, get_status) + clear_status = NPUClearFloatStatusV2()(status) + get_status = F.depend(get_status, clear_status) + flag = self.equal(self.base0, get_status) + overall_finite = self.reduce_all(flag) + overflow = not overall_finite else: flag_sum = self.hyper_map(F.partial(_grad_overflow), compute_output) flag_sum = P.AddN()(flag_sum) # convert flag_sum to scalar flag_sum = P.Reshape()(flag_sum, (())) - if self.is_distributed: - # sum overflow flag over devices - flag_reduce = self.allreduce(flag_sum) - overflow = self.less_equal(self.base, flag_reduce) - else: - overflow = self.less_equal(self.base, flag_sum) + if self.is_distributed: + # sum overflow flag over devices + flag_reduce = self.allreduce(flag_sum) + overflow = self.less_equal(self.base, flag_reduce) + else: + overflow = self.less_equal(self.base, flag_sum) return overflow def _process_loss_scale(self, overflow): @@ -688,7 +705,7 @@ class BoostTrainOneStepWithLossScaleCell(BoostTrainOneStepCell): self.optimizer_loss_scale = [self.parent.count(x) for x in parent_set] self.reduce_ratio = Tensor(1.0 / (2 ** 0.5), mstype.float32) self.growth_ratio = Tensor(2 ** (1.0 / 1000.0), mstype.float32) - self.overflow_status_list = ParameterTuple(Parameter(Tensor(np.zeros(shape=[8]), mstype.float32), + self.overflow_status_list = ParameterTuple(Parameter(Tensor(np.zeros(shape=[8]), mstype.int32), name='mix_layer_status_{}'.format(x), requires_grad=False) for x in range(loss_scale_number)) self.loss_scaling_manager.set_loss_scale_status(loss_scale_number, self.loss_scaling_manager.get_loss_scale()) diff --git a/mindspore/python/mindspore/nn/wrap/loss_scale.py b/mindspore/python/mindspore/nn/wrap/loss_scale.py index ceba6e9beb4..5a32d974872 100644 --- 
a/mindspore/python/mindspore/nn/wrap/loss_scale.py +++ b/mindspore/python/mindspore/nn/wrap/loss_scale.py @@ -23,6 +23,7 @@ from mindspore.nn.cell import Cell from mindspore.common import Tensor from mindspore.common.sparse_tensor import RowTensorInner from mindspore.common.parameter import Parameter +from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2 from mindspore.ops import functional as F from mindspore.ops import composite as C from mindspore.ops import operations as P @@ -309,8 +310,11 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): super(TrainOneStepWithLossScaleCell, self).__init__(network, optimizer, sens=None) self.hyper_map = C.HyperMap() self.base = Tensor(1, mstype.float32) + self.base0 = Tensor(0, mstype.int32) self.reduce_sum = P.ReduceSum(keep_dims=False) + self.reduce_all = P.ReduceAll(keep_dims=False) self.less_equal = P.LessEqual() + self.equal = P.Equal() self.allreduce = P.AllReduce() self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) self.gpu_target = (context.get_context("device_target") == "GPU") @@ -390,13 +394,11 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): The second value is the same as the input of `compute_input`, but contains some information about the execution order. """ - status = False + status = Tensor([0] * 8, mstype.int32) if not self.gpu_target: - # init overflow buffer - status = P.NPUAllocFloatStatus()() status = F.depend(status, pre_cond) # clear overflow buffer - clear_status = P.NPUClearFloatStatus()(status) + clear_status = NPUClearFloatStatusV2()(status) compute_input = F.depend(compute_input, clear_status) return status, compute_input @@ -419,22 +421,36 @@ class TrainOneStepWithLossScaleCell(TrainOneStepCell): """ if not self.gpu_target: status = F.depend(status, compute_output) - get_status = P.NPUGetFloatStatus()(status) - status = F.depend(status, get_status) - # sum overflow buffer elements, 0:not overflow , >0:overflow - flag_sum = self.reduce_sum(status, (0,)) + get_status = NPUGetFloatStatusV2()(status) + + if self.is_distributed: + # sum overflow flag over devices + flag_reduce = self.allreduce(get_status) + # get_status not equal to [0]*8 means overflow + flag = self.equal(self.base0, flag_reduce) + status = F.depend(status, flag) + clear_status = NPUClearFloatStatusV2()(status) + flag = F.depend(flag, clear_status) + overall_finite = self.reduce_all(flag) + else: + status = F.depend(status, get_status) + clear_status = NPUClearFloatStatusV2()(status) + get_status = F.depend(get_status, clear_status) + flag = self.equal(self.base0, get_status) + overall_finite = self.reduce_all(flag) + overflow = not overall_finite else: flag_sum = self.hyper_map(F.partial(_grad_overflow), compute_output) flag_sum = P.AddN()(flag_sum) # convert flag_sum to scalar flag_sum = P.Reshape()(flag_sum, (())) - if self.is_distributed: - # sum overflow flag over devices - flag_reduce = self.allreduce(flag_sum) - overflow = self.less_equal(self.base, flag_reduce) - else: - overflow = self.less_equal(self.base, flag_sum) + if self.is_distributed: + # sum overflow flag over devices + flag_reduce = self.allreduce(flag_sum) + overflow = self.less_equal(self.base, flag_reduce) + else: + overflow = self.less_equal(self.base, flag_sum) return overflow def process_loss_scale(self, overflow): diff --git a/mindspore/python/mindspore/ops/_op_impl/tbe/__init__.py b/mindspore/python/mindspore/ops/_op_impl/tbe/__init__.py index 27e8a2d1ed8..7095376f678 100644 --- 
a/mindspore/python/mindspore/ops/_op_impl/tbe/__init__.py +++ b/mindspore/python/mindspore/ops/_op_impl/tbe/__init__.py @@ -37,3 +37,5 @@ from .scatter_nd_d import _scatter_nd_d_tbe # in python no check supported from .assign_add_ds import _assign_add_ds_tbe # "Frac_nz in pangu not support" from .atomic_addr_clean import _atomic_addr_clean_tbe # need to clean addr larger than 2G, int32 is not enough from .assign import _assign_tbe # Different formats of assign inputs cause memory to increase +from .npu_clear_float_status_v2 import _npu_clear_float_status_v2_tbe # io mismatch +from .npu_get_float_status_v2 import _npu_get_float_status_v2_tbe # io mismatch diff --git a/mindspore/python/mindspore/ops/_op_impl/tbe/npu_clear_float_status_v2.py b/mindspore/python/mindspore/ops/_op_impl/tbe/npu_clear_float_status_v2.py new file mode 100644 index 00000000000..cc6d28b027b --- /dev/null +++ b/mindspore/python/mindspore/ops/_op_impl/tbe/npu_clear_float_status_v2.py @@ -0,0 +1,35 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""NPUClearFloatStatusV2 op""" +from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType + +npu_clear_float_status_v2_op_info = TBERegOp("NPUClearFloatStatusV2") \ + .fusion_type("OPAQUE") \ + .async_flag(False) \ + .binfile_name("n_p_u_clear_float_status_v2.so") \ + .compute_cost(10) \ + .kernel_name("n_p_u_clear_float_status_v2") \ + .partial_flag(True) \ + .input(0, "addr", False, "required", "all") \ + .output(0, "data", False, "required", "all") \ + .dtype_format(DataType.I32_Default, DataType.I32_Default) \ + .get_op_info() + + +@op_info_register(npu_clear_float_status_v2_op_info) +def _npu_clear_float_status_v2_tbe(): + """NPUClearFloatStatusV2 TBE register""" + return diff --git a/mindspore/python/mindspore/ops/_op_impl/tbe/npu_get_float_status_v2.py b/mindspore/python/mindspore/ops/_op_impl/tbe/npu_get_float_status_v2.py new file mode 100644 index 00000000000..1cdf43e9303 --- /dev/null +++ b/mindspore/python/mindspore/ops/_op_impl/tbe/npu_get_float_status_v2.py @@ -0,0 +1,35 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
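Both status ops register identical int32-in/int32-out TBE kernels (the NPUGetFloatStatusV2 registration continues below). Once the kernels are available, the clear-then-get protocol can be driven directly from Python. A minimal sketch follows, assuming an Ascend device in PyNative mode; `check_overflow` is a hypothetical helper written for illustration, not part of this patch:

```python
import mindspore as ms
from mindspore import Tensor, ops
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2

ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")

def check_overflow(compute_fn, *inputs):
    """Run compute_fn under the V2 protocol; return True if it overflowed."""
    init = Tensor([0] * 8, ms.int32)
    # Reset the hardware overflow register before the computation.
    cleared = NPUClearFloatStatusV2()(init)
    inputs = tuple(ops.depend(x, cleared) for x in inputs)
    out = compute_fn(*inputs)
    # Order the register read after the computation via Depend.
    init = ops.depend(init, out)
    flags = NPUGetFloatStatusV2()(init)
    # All-zero flags mean every intermediate result was finite.
    return not flags.equal(Tensor(0, ms.int32)).all()

print(check_overflow(lambda x: x * x, Tensor(65504, ms.float16)))  # True: float16 max squared overflows
```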
+# ============================================================================
+
+"""NPUGetFloatStatusV2 op"""
+from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType
+
+npu_get_float_status_v2_op_info = TBERegOp("NPUGetFloatStatusV2") \
+    .fusion_type("ELEMWISE") \
+    .async_flag(False) \
+    .binfile_name("n_p_u_get_float_status_v2.so") \
+    .compute_cost(10) \
+    .kernel_name("n_p_u_get_float_status_v2") \
+    .partial_flag(True) \
+    .input(0, "addr", False, "required", "all") \
+    .output(0, "data", False, "required", "all") \
+    .dtype_format(DataType.I32_Default, DataType.I32_Default) \
+    .get_op_info()
+
+
+@op_info_register(npu_get_float_status_v2_op_info)
+def _npu_get_float_status_v2_tbe():
+    """NPUGetFloatStatusV2 TBE register"""
+    return
diff --git a/mindspore/python/mindspore/ops/operations/math_ops.py b/mindspore/python/mindspore/ops/operations/math_ops.py
index a21430fd5e6..bd5ffa5011b 100644
--- a/mindspore/python/mindspore/ops/operations/math_ops.py
+++ b/mindspore/python/mindspore/ops/operations/math_ops.py
@@ -20,6 +20,7 @@ from __future__ import division
 import numpy as np
 
 from mindspore import context
+from mindspore import log as logger
 from mindspore.ops import signature as sig
 from mindspore._checkparam import Validator as validator
 from mindspore._checkparam import Rel
@@ -4339,6 +4340,7 @@ class NPUAllocFloatStatus(Primitive):
     @prim_attr_register
     def __init__(self):
         """Initialize NPUAllocFloatStatus"""
+        logger.warning("The 'NPUAllocFloatStatus' operator will be deprecated in the future. Please don't use it.")
 
 
 class NPUGetFloatStatus(Primitive):
@@ -4408,6 +4410,7 @@ class NPUGetFloatStatus(Primitive):
     @prim_attr_register
     def __init__(self):
         """Initialize NPUGetFloatStatus"""
+        logger.warning("The 'NPUGetFloatStatus' operator will be deprecated in the future. Please don't use it.")
 
 
 class NPUClearFloatStatus(Primitive):
@@ -4471,6 +4474,173 @@ class NPUClearFloatStatus(Primitive):
     @prim_attr_register
     def __init__(self):
         """Initialize NPUClearFloatStatus"""
+        logger.warning("The 'NPUClearFloatStatus' operator will be deprecated in the future. Please don't use it.")
+
+
+class NPUGetFloatStatusV2(Primitive):
+    """
+    Get the flag that stores the overflow status. This flag is located in a register at a
+    fixed address on the `Ascend` device, and overflow information is automatically
+    written to this register.
+    The flag is a one-dimensional Tensor with shape :math:`(8,)` and data type `mindspore.dtype.int32`.
+    If the flag is all zeros, no overflow has occurred; otherwise, an overflow was detected.
+    When performing overflow detection on a network, first call `NPUClearFloatStatusV2` to
+    reset the register before the detection, and then call `NPUGetFloatStatusV2` to read the register
+    status after the network execution is completed.
+
+    Note:
+        - To avoid mis-optimization by the compiler, an additional input is added to
+          this operator. The input is defined as a Tensor with shape :math:`(8,)` and data type
+          `mindspore.dtype.int32`; its value is meaningless.
+        - Since this op lacks contextual dependencies with parameters in the network,
+          :class:`mindspore.ops.Depend` needs to be used to ensure order of execution.
+
+    Inputs:
+        Tensor, an additional input created to avoid compiler optimization, specified as shape :math:`(8,)`
+        with data type `mindspore.dtype.int32`; it has no actual meaning.
+        Usually the output of `NPUClearFloatStatusV2` is used.
+
+    Outputs:
+        Tensor, shape and data type are the same as the input. If all elements are zero, no overflow
+        occurred; otherwise, an overflow was detected.
+
+    Raises:
+        TypeError: If the input is not a Tensor.
+        TypeError: If dtype of the input is not int32.
+        ValueError: If shape of the input is not equal to :math:`(8,)`.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> import mindspore as ms
+        >>> import numpy as np
+        >>> from mindspore import ops, nn, Tensor
+        >>> from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
+        >>> class Net(nn.Cell):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         self.clear_status = NPUClearFloatStatusV2()
+        ...         self.get_status = NPUGetFloatStatusV2()
+        ...         self.sub = ops.Sub()
+        ...         self.neg = ops.Neg()
+        ...         self.equal = ops.Equal()
+        ...         self.reduce_all = ops.ReduceAll(keep_dims=False)
+        ...         self.base = Tensor([0], dtype=ms.int32)
+        ...
+        ...     def construct(self, x):
+        ...         init = Tensor([0]*8, dtype=ms.int32)
+        ...         clear_status = self.clear_status(init)
+        ...         x = ops.depend(x, clear_status)
+        ...         res = self.sub(x, self.neg(x))
+        ...         init = ops.depend(init, res)
+        ...         get_status = self.get_status(init)
+        ...         flag = self.equal(self.base, get_status)
+        ...         overall_finite = self.reduce_all(flag)
+        ...         overflow = not overall_finite
+        ...         return overflow
+        ...
+        >>> value = 65504
+        >>> data = np.full((2, 3), value, dtype=np.float16)
+        >>> x = Tensor(data, dtype=ms.float16)
+        >>> net = Net()
+        >>> res = net(x)
+        >>> print(res)
+        True
+        >>> value = 10
+        >>> data = np.full((2, 3), value, dtype=np.float16)
+        >>> x = Tensor(data, dtype=ms.float16)
+        >>> net = Net()
+        >>> res = net(x)
+        >>> print(res)
+        False
+    """
+
+    @prim_attr_register
+    def __init__(self):
+        """Initialize NPUGetFloatStatusV2"""
+
+
+
+class NPUClearFloatStatusV2(Primitive):
+    """
+    Clear the flag that stores the overflow status. This flag is located in a register at a
+    fixed address on the `Ascend` device, and overflow information is automatically
+    written to this register.
+    The flag is a one-dimensional Tensor with shape :math:`(8,)` and data type `mindspore.dtype.int32`.
+    If the flag is all zeros, no overflow has occurred; otherwise, an overflow was detected.
+    When performing overflow detection on a network, first call `NPUClearFloatStatusV2` to
+    reset the register before the detection, and then call `NPUGetFloatStatusV2` to read the register
+    status after the network execution is completed.
+
+    Note:
+        - To avoid mis-optimization by the compiler, an additional input and output are added to
+          this operator. Both are defined as a Tensor with shape :math:`(8,)` and data type
+          `mindspore.dtype.int32`; their values are meaningless.
+        - Since this op lacks contextual dependencies with parameters in the network,
+          :class:`mindspore.ops.Depend` needs to be used to ensure order of execution.
+
+    Inputs:
+        Tensor, an additional input created to avoid compiler optimization, specified as shape :math:`(8,)`
+        with data type `mindspore.dtype.int32`; it has no actual meaning.
+
+    Outputs:
+        Tensor, shape and data type are the same as the input, meaningless.
+
+    Raises:
+        TypeError: If the input is not a Tensor.
+        TypeError: If dtype of the input is not int32.
+        ValueError: If shape of the input is not equal to :math:`(8,)`.
+
+    Supported Platforms:
+        ``Ascend``
+
+    Examples:
+        >>> import mindspore as ms
+        >>> import numpy as np
+        >>> from mindspore import ops, nn, Tensor
+        >>> from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
+        >>> class Net(nn.Cell):
+        ...     def __init__(self):
+        ...         super().__init__()
+        ...         
self.clear_status = NPUClearFloatStatusV2() + ... self.get_status = NPUGetFloatStatusV2() + ... self.sub = ops.Sub() + ... self.neg = ops.Neg() + ... self.equal = ops.Equal() + ... self.reduce_all = ops.ReduceAll(keep_dims=False) + ... self.base = Tensor([0], dtype=ms.int32) + ... + ... def construct(self, x): + ... init = Tensor([0]*8, dtype=ms.int32) + ... clear_status = self.clear_status(init) + ... x = ops.depend(x, clear_status) + ... res = self.sub(x, self.neg(x)) + ... init = ops.depend(init, res) + ... get_status = self.get_status(init) + ... flag = self.equal(self.base, get_status) + ... overall_finite = self.reduce_all(flag) + ... overflow = not overall_finite + ... return overflow + ... + >>> value = 65504 + >>> data = np.full((2, 3), value, dtype=np.float16) + >>> x = Tensor(data, dtype=ms.float16) + >>> net = Net() + >>> res = net(x) + >>> print(res) + True + >>> value = 10 + >>> data = np.full((2, 3), value, dtype=np.float16) + >>> x = Tensor(data, dtype=ms.float16) + >>> net = Net() + >>> res = net(x) + >>> print(res) + False + """ + + @prim_attr_register + def __init__(self): + """Initialize NPUClearFloatStatusV2""" class Cos(Primitive): diff --git a/tests/st/mix_precision/test_mix_precision_func.py b/tests/st/mix_precision/test_mix_precision_func.py index d1107657c07..05a243acfa8 100644 --- a/tests/st/mix_precision/test_mix_precision_func.py +++ b/tests/st/mix_precision/test_mix_precision_func.py @@ -15,6 +15,7 @@ import numpy as np import pytest + import mindspore from mindspore import Tensor, Parameter from mindspore.common import dtype as mstype @@ -60,22 +61,20 @@ def test_dynamic_loss_scaler(mode): Expectation: the `scale_value` can be adjusted correctly. """ context.set_context(mode=mode) - status = amp.init_status() loss_scaler = amp.DynamicLossScaler(scale_value=2**10, scale_factor=2, scale_window=50) grads = (Tensor(np.array([0.5, 1.0]), mindspore.float16), Tensor(np.array([0.2]), mindspore.float16)) unscaled_grads = loss_scaler.unscale(grads) - grads_finite = amp.all_finite(unscaled_grads, status) + grads_finite = amp.all_finite(unscaled_grads) loss_scaler.counter = Parameter(Tensor(49, dtype=mstype.int32)) loss_scaler.adjust(grads_finite) assert loss_scaler.scale_value.asnumpy() == np.array(2048.) - status = amp.init_status() grads = (Tensor(np.array([2., 1.0]), mindspore.float16), Tensor(np.array([0.2]), mindspore.float16)) unscaled_grads = loss_scaler.unscale(grads) - grads_finite = amp.all_finite(unscaled_grads, status) + grads_finite = amp.all_finite(unscaled_grads) loss_scaler.scale_value = Parameter(Tensor(2**10, dtype=mstype.float32)) loss_scaler.adjust(grads_finite) assert loss_scaler.scale_value.asnumpy() == np.array(1024.) diff --git a/tests/st/ops/ascend/test_npu_overflow_v2.py b/tests/st/ops/ascend/test_npu_overflow_v2.py new file mode 100644 index 00000000000..d191d562fc0 --- /dev/null +++ b/tests/st/ops/ascend/test_npu_overflow_v2.py @@ -0,0 +1,175 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
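Before the new ST cases below, note what the `test_dynamic_loss_scaler` change above implies for callers: `all_finite` no longer takes a `status` argument, because on Ascend it now allocates the int32 status tensor, reads the register with `NPUGetFloatStatusV2`, and clears it, all internally. A short sketch of the resulting caller-side pattern, with illustrative gradient values:

```python
import numpy as np
import mindspore as ms
from mindspore import Tensor, amp

loss_scaler = amp.DynamicLossScaler(scale_value=2**10, scale_factor=2, scale_window=50)
grads = (Tensor(np.array([0.5, 1.0]), ms.float16),
         Tensor(np.array([np.inf]), ms.float16))  # one non-finite gradient

unscaled = loss_scaler.unscale(grads)
# One call now covers Ascend, GPU, and CPU; no status tensor is threaded through.
finite = amp.all_finite(unscaled)
loss_scaler.adjust(finite)  # shrinks scale_value because of the inf gradient
print(finite, loss_scaler.scale_value.asnumpy())
```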
+# ============================================================================
+
+import pytest
+import numpy as np
+
+import mindspore as ms
+from mindspore import Tensor, nn, ops
+from mindspore import dtype as mstype
+from mindspore.ops._primitive_cache import _get_cache_prim
+from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2
+
+
+class OverflowCheckNet(nn.Cell):
+    def __init__(self):
+        super(OverflowCheckNet, self).__init__()
+        self.base1 = Tensor(1, mstype.float32)
+        self.base2 = Tensor(0, mstype.int32)
+        self.reduce_sum = ops.ReduceSum(keep_dims=False)
+        self.less_equal = ops.LessEqual()
+        self.reduce_all = ops.ReduceAll(keep_dims=False)
+        self.equal = ops.Equal()
+
+    def start_overflow_check_v1(self, pre_cond, compute_input):
+        # init overflow buffer
+        status = ops.NPUAllocFloatStatus()()
+        status = ops.depend(status, pre_cond)
+        # clear overflow buffer
+        clear_status = ops.NPUClearFloatStatus()(status)
+        compute_input = ops.depend(compute_input, clear_status)
+        return status, compute_input
+
+    def get_overflow_status_v1(self, status, compute_output):
+        status = ops.depend(status, compute_output)
+        get_status = ops.NPUGetFloatStatus()(status)
+        status = ops.depend(status, get_status)
+        # sum overflow buffer elements, 0: not overflow, >0: overflow
+        flag_sum = self.reduce_sum(status, (0,))
+        overflow = self.less_equal(self.base1, flag_sum)
+        return overflow
+
+    def start_overflow_check_v2(self, pre_cond, compute_input):
+        status = Tensor([0] * 8, mstype.int32)
+        status = ops.depend(status, pre_cond)
+        # clear overflow buffer
+        clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status)
+        compute_input = ops.depend(compute_input, clear_status)
+        return status, compute_input
+
+    def get_overflow_status_v2(self, status, compute_output):
+        status = ops.depend(status, compute_output)
+        get_status = _get_cache_prim(NPUGetFloatStatusV2)()(status)
+        status = ops.depend(status, get_status)
+        clear_status = _get_cache_prim(NPUClearFloatStatusV2)()(status)
+        get_status = ops.depend(get_status, clear_status)
+        flag = self.equal(self.base2, get_status)
+        overall_finite = self.reduce_all(flag)
+        return not overall_finite
+
+
+class OverFlowNetV2GetStatusAfterClear(OverflowCheckNet):
+    def __init__(self):
+        super(OverFlowNetV2GetStatusAfterClear, self).__init__()
+        self.mul = ops.Mul()
+        self.sub = ops.Sub()
+
+    def construct(self, x1, x2):
+        y1 = self.mul(x1, x1)
+        status, compute_input = self.start_overflow_check_v2(y1, x2)
+        y2 = self.sub(y1, compute_input)
+        cond = self.get_overflow_status_v2(status, y2)
+        return cond
+
+
+class OverFlowNetV2GetStatus(OverflowCheckNet):
+    def __init__(self):
+        super(OverFlowNetV2GetStatus, self).__init__()
+        self.add = ops.Add()
+        self.mul = ops.Mul()
+
+    def construct(self, x1, x2):
+        y1 = self.add(x1, x1)
+        status, compute_input = self.start_overflow_check_v2(y1, x2)
+        y2 = self.mul(y1, compute_input)
+        cond = self.get_overflow_status_v2(status, y2)
+        return cond
+
+
+class OverflowCheckV1vsV2(OverflowCheckNet):
+    def __init__(self):
+        super(OverflowCheckV1vsV2, self).__init__()
+        self.add = ops.Add()
+        self.atan2 = ops.Atan2()
+
+    def construct(self, x1, x2, version):
+        y1 = self.add(x1, x1)
+        if version == 1:
+            status, compute_input = self.start_overflow_check_v1(y1, x2)
+            y2 = self.atan2(y1, compute_input)
+            cond = self.get_overflow_status_v1(status, y2)
+        else:
+            status, compute_input = self.start_overflow_check_v2(y1, x2)
+            y2 = self.atan2(y1, compute_input)
+            cond = 
self.get_overflow_status_v2(status, y2) + return cond + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE]) +def test_v2_overflow_get_after_clear(mode): + """ + Feature: overflow check v2 + Description: Verify the result of get_status after clear + Expectation: success + """ + ms.set_context(mode=mode) + net = OverFlowNetV2GetStatusAfterClear() + output = net(Tensor(65504, mstype.float16), Tensor(1, mstype.float16)) + assert not output + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE]) +def test_v2_clear_overflow_get(mode): + """ + Feature: overflow check v2 + Description: Verify the result of get_status when overflow + Expectation: success + """ + ms.set_context(mode=mode) + net = OverFlowNetV2GetStatus() + output = net(Tensor(1, mstype.float16), Tensor(65504, mstype.float16)) + assert output + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE]) +def test_v1_vs_v2_overflow_check(mode): + """ + Feature: overflow check v1 vs v2 + Description: Verify the result of atan2 when inputs include 0 + Expectation: success + """ + ms.set_context(mode=mode) + input1 = np.random.random((2, 4)).astype(np.float32) + input2 = np.random.random((2, 4)).astype(np.float32) + input1[0] = 0 + input2[1] = 0 + net = OverflowCheckV1vsV2() + overflow_v1 = net(Tensor(input1), Tensor(input2), 1) + overflow_v2 = net(Tensor(input1), Tensor(input2), 2) + assert overflow_v1 + assert not overflow_v2 diff --git a/tests/st/train/test_amp_overflow.py b/tests/st/train/test_amp_overflow.py new file mode 100644 index 00000000000..4da3baf8675 --- /dev/null +++ b/tests/st/train/test_amp_overflow.py @@ -0,0 +1,83 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
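The single-card tests above only exercise the local protocol. In data-parallel runs, the wrapper cells earlier in this patch additionally AllReduce the raw int32 flags before comparing them to zero, so an overflow on any one device is seen by every rank. A hedged sketch of that branch, with `allreduce` standing in for a configured `ops.AllReduce()` on an initialized communication group:

```python
import mindspore as ms
from mindspore import Tensor, ops
from mindspore.ops.operations.math_ops import NPUGetFloatStatusV2, NPUClearFloatStatusV2

def distributed_overflow(status, compute_output, allreduce):
    """Mirrors the is_distributed branch of get_overflow_status above."""
    status = ops.depend(status, compute_output)
    flags = NPUGetFloatStatusV2()(status)
    # Sum the flags across devices: any non-zero slot survives the reduction.
    reduced = allreduce(flags)
    finite_per_slot = ops.equal(Tensor(0, ms.int32), reduced)
    status = ops.depend(status, finite_per_slot)
    # Clear the register only after every rank has read it.
    finite_per_slot = ops.depend(finite_per_slot, NPUClearFloatStatusV2()(status))
    return not ops.ReduceAll(keep_dims=False)(finite_per_slot)
```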
+# ============================================================================ + +'''test overflow''' +import pytest +import numpy as np + +from mindspore import Tensor, Parameter, nn, ops +import mindspore.amp as amp +import mindspore as ms + + +class Net(nn.Cell): + def __init__(self, in_features, out_features): + super(Net, self).__init__() + self.weight = Parameter(Tensor(np.full([in_features, out_features], 2, np.float16)), + name='weight') + self.matmul = ops.MatMul() + + def construct(self, x): + output = self.matmul(x, self.weight) + return output + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.platform_x86_gpu_training +@pytest.mark.platform_x86_cpu_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE]) +def test_functional_amp_overflow(mode): + """ + Feature: mindspore.amp.overflow + Description: test amp overflow + Expectation: Success. + """ + ms.set_context(mode=mode) + size, in_features, out_features = 1, 2, 2 + net = Net(in_features, out_features) + loss_fn = nn.MSELoss() + + def forward_fn(data, label): + logits = net(data) + loss = loss_fn(logits, label) + return loss, logits + + grad_fn = ops.value_and_grad(forward_fn, grad_position=None, weights=net.trainable_params()) + + @ms.jit + def train_step(data, label): + (loss, _), grads = grad_fn(data, label) + is_finite = amp.all_finite(grads) + return loss, is_finite + + shape = (size, in_features) + inputs = [ + Tensor(np.full(shape, -np.inf, np.float16)), + Tensor(np.full(shape, 0, np.float16)), + Tensor(np.full(shape, 40000, np.float16)), + Tensor(np.full(shape, 10, np.float16)), + Tensor(np.full(shape, np.inf, np.float16)), + ] + label = Tensor(np.full([out_features,], 0, np.float16)) + datasets = list(zip(inputs, [label for _ in range(len(inputs))])) + expect_results = [False, True, False, True, False] + outputs = [] + for data, label in datasets: + _, is_finite = train_step(data, label) + outputs.append(is_finite.asnumpy().tolist()) + assert outputs == expect_results diff --git a/tests/st/train/test_loss_scale_overflow.py b/tests/st/train/test_loss_scale_overflow.py new file mode 100644 index 00000000000..d73673499f2 --- /dev/null +++ b/tests/st/train/test_loss_scale_overflow.py @@ -0,0 +1,114 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
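test_amp_overflow.py checks the finiteness flag in isolation; in practice the flag usually gates the optimizer update, which the loss-scale tests that follow exercise through the wrapper cells. A minimal hand-rolled version of that gating, assuming `grad_fn`, `optimizer`, and `loss_scaler` are built as in the test above:

```python
from mindspore import amp

def train_step(grad_fn, optimizer, loss_scaler, data, label):
    """Scale-aware step: apply gradients only when they are all finite."""
    (loss, _), grads = grad_fn(data, label)
    grads = loss_scaler.unscale(grads)
    is_finite = amp.all_finite(grads)
    if is_finite:
        optimizer(grads)           # skip the update on overflow
    loss_scaler.adjust(is_finite)  # grow or shrink the scale accordingly
    return loss, is_finite
```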
+# ============================================================================ + +'''test overflow''' +import pytest +import numpy as np + +import mindspore as ms +from mindspore import Tensor, Parameter, nn, ops, boost +from mindspore import dtype as mstype + + +class Net(nn.Cell): + def __init__(self, in_features, out_features): + super(Net, self).__init__() + self.weight = Parameter(Tensor(np.full([in_features, out_features], 2, np.float16)), + name='weight') + self.matmul = ops.MatMul() + + def construct(self, x): + output = self.matmul(x, self.weight) + return output + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.GRAPH_MODE, ms.PYNATIVE_MODE]) +def test_train_one_step_with_loss_scale_cell_overflow(mode): + """ + Feature: mindspore.TrainOneStepWithLossScaleCell.overflow + Description: test TrainOneStepWithLossScaleCell overflow + Expectation: Success. + """ + ms.set_context(mode=mode) + size, in_features, out_features = 1, 2, 2 + net = Net(in_features, out_features) + loss = nn.MSELoss() + optimizer = nn.Momentum(net.trainable_params(), + learning_rate=0.1, momentum=0.9) + net_with_loss = nn.WithLossCell(net, loss) + shape = (size, in_features) + inputs = [ + Tensor(np.full(shape, -np.inf, np.float16)), + Tensor(np.full(shape, 0, np.float16)), + Tensor(np.full(shape, 40000, np.float16)), + Tensor(np.full(shape, 10, np.float16)), + Tensor(np.full(shape, np.inf, np.float16)), + ] + label = Tensor(np.full([out_features,], 0, np.float16)) + datasets = list(zip(inputs, [label for _ in range(len(inputs))])) + scaling_sens = Tensor([8], dtype=mstype.float16) + train_network = nn.TrainOneStepWithLossScaleCell( + net_with_loss, optimizer, scale_sense=scaling_sens) + expect_results = [True, False, True, False, True] + outputs = [] + for x, label in datasets: + _, overflow, _ = train_network(x, label) + outputs.append(overflow.asnumpy().tolist()) + assert outputs == expect_results + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend_training +@pytest.mark.platform_x86_ascend_training +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +@pytest.mark.parametrize('mode', [ms.PYNATIVE_MODE]) +def test_boost_train_one_step_with_loss_scale_cell_overflow(mode): + """ + Feature: mindspore.BoostTrainOneStepWithLossScaleCell.overflow + Description: test BoostTrainOneStepWithLossScaleCell overflow + Expectation: Success. 
+    """
+    ms.set_context(mode=mode)
+    size, in_features, out_features = 1, 2, 2
+    net = Net(in_features, out_features)
+    loss = nn.MSELoss()
+    optimizer = nn.Momentum(net.trainable_params(),
+                            learning_rate=0.1, momentum=0.9)
+    net_with_loss = nn.WithLossCell(net, loss)
+    shape = (size, in_features)
+    inputs = [
+        Tensor(np.full(shape, -np.inf, np.float16)),
+        Tensor(np.full(shape, 0, np.float16)),
+        Tensor(np.full(shape, 40000, np.float16)),
+        Tensor(np.full(shape, 10, np.float16)),
+        Tensor(np.full(shape, np.inf, np.float16)),
+    ]
+    label = Tensor(np.full([out_features,], 0, np.float16))
+    datasets = list(zip(inputs, [label for _ in range(len(inputs))]))
+    scaling_sens = Tensor([8], dtype=mstype.float16)
+    train_network = boost.BoostTrainOneStepWithLossScaleCell(
+        net_with_loss, optimizer, scale_sense=scaling_sens)
+    expect_results = [True, False, True, False, True]
+    outputs = []
+    for x, label in datasets:
+        _, overflow, _ = train_network(x, label)
+        outputs.append(overflow)
+    assert outputs == expect_results
diff --git a/tests/ut/cpp/tbe/tbe_json_creator_test.cc b/tests/ut/cpp/tbe/tbe_json_creator_test.cc
index 5eb03bfdda3..230ef58f06c 100644
--- a/tests/ut/cpp/tbe/tbe_json_creator_test.cc
+++ b/tests/ut/cpp/tbe/tbe_json_creator_test.cc
@@ -76,7 +76,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_common) {
   auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
   nlohmann::json kernel_json;
   EXPECT_TRUE(tbe_json_creator_select->GenJson(relu1, &kernel_json));
-  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 10654173078034037040U)
+  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 12207851473833394607U)
     << "Error json is:" << kernel_json << ", for expected json, see file: tbe_single_common_select.json";
   EXPECT_TRUE(tbe_json_creator_build->GenJson(relu1, &kernel_json));
   EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 2389029245513168162U)
@@ -118,7 +118,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_conv2d_backprop_filter) {
   auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
   nlohmann::json kernel_json;
   EXPECT_TRUE(tbe_json_creator_select->GenJson(conv2d_backprop_filter, &kernel_json));
-  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 16416634683849134630U)
+  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 14683931476519216146U)
     << "Error json is:" << kernel_json
     << ", for expected json, see file: tbe_single_conv2d_backprop_filter_select.json";
   EXPECT_TRUE(tbe_json_creator_build->GenJson(conv2d_backprop_filter, &kernel_json));
@@ -177,7 +177,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_dynamic_rnn) {
   auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
   nlohmann::json kernel_json;
   EXPECT_TRUE(tbe_json_creator_select->GenJson(dynamic_rnn, &kernel_json));
-  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 3107761065269367419U)
+  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 16143536111232395651U)
     << "Error json is:" << kernel_json << ", for expected json, see file: tbe_single_dynamic_rnn_select.json";
   EXPECT_TRUE(tbe_json_creator_build->GenJson(dynamic_rnn, &kernel_json));
   EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 14916511955212123861U)
@@ -230,7 +230,7 @@ TEST_F(TestHWTBEJsonCreator, DISABLED_test_tbe_single_layer_norm) {
   auto tbe_json_creator_build = std::make_shared<BuildTbeJsonCreator>();
   nlohmann::json kernel_json;
   EXPECT_TRUE(tbe_json_creator_select->GenJson(layer_norm, &kernel_json));
-  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 6592146268336877821U)
+  EXPECT_EQ(tbe_json_creator_select->GetJsonHash(), 1161191001728520611U)
     << "Error json is:" << 
kernel_json << ", for expected json, see file: tbe_single_layer_norm_select.json";
   EXPECT_TRUE(tbe_json_creator_build->GenJson(layer_norm, &kernel_json));
   EXPECT_EQ(tbe_json_creator_build->GetJsonHash(), 2848618249728529296U)
@@ -306,7 +306,7 @@ TEST_F(TestHWTBEJsonCreator, test_tbe_fusion_common) {
   nlohmann::json fusion_json;
   auto tbe_json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
   EXPECT_TRUE(tbe_json_creator->GenJson(fusion_scope_info, &fusion_json));
-  EXPECT_EQ(tbe_json_creator->GetJsonHash(), 9482071119130243510U)
+  EXPECT_EQ(tbe_json_creator->GetJsonHash(), 18379117451241093022U)
     << "Error json is:" << fusion_json << ", for expected json, see file: tbe_fusion_common.json";
 }
 
@@ -367,7 +367,7 @@ TEST_F(TestHWTBEJsonCreator, test_fusion_add_conv2d) {
   nlohmann::json fusion_json;
   auto tbe_json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
   EXPECT_TRUE(tbe_json_creator->GenJson(fusion_scope_info, &fusion_json));
-  EXPECT_EQ(tbe_json_creator->GetJsonHash(), 1515571995667332418U)
+  EXPECT_EQ(tbe_json_creator->GetJsonHash(), 16132617067967162574U)
     << "Error json is:" << fusion_json << ", for expected json, see file: test_fusion_add_conv2d.json";
 }
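The rewritten hash expectations in `tbe_json_creator_test.cc` are a direct consequence of the `tbe_helper.py` change earlier in this patch: `status_check` is now copied from `SocInfo` instead of being hard-coded to `"false"`, so the generated compile-option JSON, and therefore its hash, differs for every kernel. A toy illustration of that sensitivity, in plain Python rather than the actual MindSpore hasher:

```python
import hashlib
import json

def json_hash(obj):
    # Canonical dump so the digest depends only on content, not key order.
    return hashlib.sha256(json.dumps(obj, sort_keys=True).encode()).hexdigest()

old_options = {"deterministic": "false", "status_check": "false"}
new_options = {"deterministic": "false", "status_check": "true"}
print(json_hash(old_options) == json_hash(new_options))  # False: one flag flips the whole hash
```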