diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc
deleted file mode 100644
index 4ca1f8e505b..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "frontend/parallel/ops_info/matmul_ffn_info.h"
-#include "frontend/parallel/dynamic_creator.h"
-
-namespace mindspore {
-namespace parallel {
-// MatMulQkv has 3 inputs and 2 outputs
-// x: (batch * seq_len (inc is 1), hidden_size)
-// weight_1: (weight_1_hidden_size, hidden_size)
-// weight_2: (weight_2_hidden_size, hidden_size)
-// ------------------------------
-// output_1: (batch * seq_len (inc is 1), weight_1_hidden_size)
-// output_2: (batch * seq_len (inc is 1), weight_2_hidden_size)
-
-constexpr size_t kMatMulFfnOutputSize = 2;
-
-Status MatmulFfnInfo::CheckStrategy(const StrategyPtr &strategy) {
-  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
-    return FAILED;
-  }
-
-  // TODO
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferDevMatrixShape() {
-  auto input_strategies = strategy()->GetInputDim();
-  auto x = input_strategies.at(0);  // (batch * seq_len, hidden_size)
-  auto weight_1 = input_strategies.at(1);
-  // dp  mp
-  //  1   0
-  dev_matrix_shape_ = {x.at(0), weight_1.at(0)};
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferTensorMap() {
-  Shape x_tensor_map{1, -1};
-  Shape weight_1_tensor_map{0, -1};
-  Shape weight_2_tensor_map{0, -1};
-  inputs_tensor_map_.emplace_back(x_tensor_map);
-  inputs_tensor_map_.emplace_back(weight_1_tensor_map);
-  inputs_tensor_map_.emplace_back(weight_2_tensor_map);
-
-  Shape output_q_tensor_map{1, 0};
-  Shape output_k_tensor_map{1, 0};
-  outputs_tensor_map_.emplace_back(output_q_tensor_map);
-  outputs_tensor_map_.emplace_back(output_k_tensor_map);
-
-  return SUCCESS;
-}
-
-Status MatmulFfnInfo::InferAsLossDivisor() {
-  if (outputs_tensor_map_.size() != kMatMulFfnOutputSize) {
-    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 2, but got " << outputs_tensor_map_.size();
-    return FAILED;
-  }
-  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
-  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
-               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
-               << ", as_loss_divisor_ is " << as_loss_divisor_;
-  return SUCCESS;
-}
-REGISTER(MatmulFfnInfo);
-}  // namespace parallel
-}  // namespace mindspore
\ No newline at end of file
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h
deleted file mode 100644
index 1b800b363b5..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_ffn_info.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
-#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "utils/hash_map.h"
-#include "ir/value.h"
-#include "frontend/parallel/auto_parallel/operator_costmodel.h"
-#include "frontend/parallel/ops_info/operator_info.h"
-#include "frontend/parallel/strategy.h"
-
-namespace mindspore {
-namespace parallel {
-class MatmulFfnInfo : public OperatorInfo {
- public:
-  MatmulFfnInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
-                const PrimitiveAttrs &attrs)
-      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<MatMulCost>()) {}
-  ~MatmulFfnInfo() override = default;
-  Status CheckStrategy(const StrategyPtr &strategy) override;
-  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
-  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
-
- protected:
-  Status GetAttrs() override { return SUCCESS; }
-  Status InferForwardCommunication() { return SUCCESS; }
-  Status InferTensorMap() override;
-  Status InferDevMatrixShape() override;
-  Status InferAsLossDivisor() override;
-};
-using MatmulFfnInfoPtr = std::shared_ptr<MatmulFfnInfo>;
-}  // namespace parallel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc
deleted file mode 100644
index 0334fcc6bcb..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "frontend/parallel/ops_info/matmul_qkv_info.h"
-#include "frontend/parallel/dynamic_creator.h"
-
-namespace mindspore {
-namespace parallel {
-// MatMulQkv has 3 inputs and 3 outputs
-// x: (batch * seq_len (inc is 1), query_hidden_size)
-// q: (query_hidden_size, query_hidden_size)
-// k: (key_hidden_size, query_hidden_size)
-// v: (value_hidden_size, query_hidden_size)
-// ------------------------------
-// output_q: (batch * seq_len (inc is 1), query_hidden_size)
-// output_k: (batch * seq_len (inc is 1), key_hidden_size)
-// output_v: (batch * seq_len (inc is 1), value_hidden_size)
-
-// split strategy
-// batch is not able to split
-// seq_len is not able to split
-// query_hidden_size is able to split
-// key_hidden_size is able to split
-// value_hidden_size is able to split
-constexpr size_t kMatMulQkvOutputSize = 3;
-
-Status MatmulQkvInfo::CheckStrategy(const StrategyPtr &strategy) {
-  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
-    return FAILED;
-  }
-
-  // TODO
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferDevMatrixShape() {
-  auto input_strategies = strategy()->GetInputDim();
-  auto x = input_strategies.at(0);  // (batch * seq_len, q_hidden_size)
-  auto q = input_strategies.at(1);
-  // dp  mp
-  //  1   0
-  dev_matrix_shape_ = {x.at(0), q.at(0)};
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferTensorMap() {
-  Shape x_tensor_map{1, -1};
-  Shape q_tensor_map{0, -1};
-  Shape k_tensor_map{0, -1};
-  Shape v_tensor_map{0, -1};
-
-  inputs_tensor_map_.emplace_back(x_tensor_map);
-  inputs_tensor_map_.emplace_back(q_tensor_map);
-  inputs_tensor_map_.emplace_back(k_tensor_map);
-  inputs_tensor_map_.emplace_back(v_tensor_map);
-
-  Shape output_q_tensor_map{1, 0};
-  Shape output_k_tensor_map{1, 0};
-  Shape output_v_tensor_map{1, 0};
-  outputs_tensor_map_.emplace_back(output_q_tensor_map);
-  outputs_tensor_map_.emplace_back(output_k_tensor_map);
-  outputs_tensor_map_.emplace_back(output_v_tensor_map);
-
-  return SUCCESS;
-}
-
-Status MatmulQkvInfo::InferAsLossDivisor() {
-  if (outputs_tensor_map_.size() != kMatMulQkvOutputSize) {
-    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 3, but got " << outputs_tensor_map_.size();
-    return FAILED;
-  }
-  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
-  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
-               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
-               << ", as_loss_divisor_ is " << as_loss_divisor_;
-  return SUCCESS;
-}
-REGISTER(MatmulQkvInfo);
-}  // namespace parallel
-}  // namespace mindspore
\ No newline at end of file
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h b/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h
deleted file mode 100644
index 43e6e8c33e1..00000000000
--- a/mindspore/ccsrc/frontend/parallel/ops_info/matmul_qkv_info.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/**
- * Copyright 2024 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
-#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "utils/hash_map.h"
-#include "ir/value.h"
-#include "frontend/parallel/auto_parallel/operator_costmodel.h"
-#include "frontend/parallel/ops_info/operator_info.h"
-#include "frontend/parallel/strategy.h"
-
-namespace mindspore {
-namespace parallel {
-class MatmulQkvInfo : public OperatorInfo {
- public:
-  MatmulQkvInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
-                const PrimitiveAttrs &attrs)
-      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<MatMulCost>()) {}
-  ~MatmulQkvInfo() override = default;
-  Status CheckStrategy(const StrategyPtr &strategy) override;
-  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
-  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
-
- protected:
-  Status GetAttrs() override { return SUCCESS; }
-  Status InferForwardCommunication() { return SUCCESS; }
-  Status InferTensorMap() override;
-  Status InferDevMatrixShape() override;
-  Status InferAsLossDivisor() override;
-};
-using MatMulQkvInfoPtr = std::shared_ptr<MatmulQkvInfo>;
-}  // namespace parallel
-}  // namespace mindspore
-
-#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
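The tensor maps in the deleted files read right-to-left against a two-axis {dp, mp} device matrix: the activation splits its first dimension over dp, each weight splits its first dimension over mp, and the outputs end up sharded on both. A minimal sketch of the per-rank shapes the deleted comments describe (all sizes below are illustrative, not from the source):

# Hedged sketch of the (dp, mp) layout from the deleted MatmulQkv comments.
dp, mp = 2, 4                                # dev_matrix_shape_ = {dp, mp}
batch_seq, hidden = 16, 4096                 # x: (batch * seq_len, query_hidden_size)

x_shard = (batch_seq // dp, hidden)          # x map {1, -1}: dim 0 on dp, dim 1 replicated
w_shard = (hidden // mp, hidden)             # q/k/v maps {0, -1}: dim 0 on mp
out_shard = (batch_seq // dp, hidden // mp)  # output maps {1, 0}: sharded on both axes

print(x_shard, w_shard, out_shard)           # (8, 4096) (1024, 4096) (8, 1024)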
diff --git a/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc b/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
index 29cd499ee1a..4fdbc0feb09 100644
--- a/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
+++ b/mindspore/ccsrc/frontend/parallel/ops_info/paged_attention_info.cc
@@ -91,10 +91,8 @@ Status PagedAttentionInfo::CheckStrategy(const StrategyPtr &strategy) {
 
 Status PagedAttentionInfo::InferDevMatrixShape() {
   auto input_strategies = strategy()->GetInputDim();
-  auto query = input_strategies.at(0);         // (batch, seq_len, hidden_size)
-  auto cache = input_strategies.at(1);         // (block_size, num_blocks, hidden_size)
-  auto block_tables = input_strategies.at(3);  // (batch, max_num_block_per_batch)
-  auto context_lens = input_strategies.at(4);  // (context_lens)
+  auto query = input_strategies.at(0);  // (batch, seq_len, hidden_size)
+  auto cache = input_strategies.at(1);  // (block_size, num_blocks, hidden_size)
 
   // batch  block_size  num_blocks  seq_len  hidden_size
   //   4        3           2          1          0
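The two unused strategies are dropped because query and cache alone populate the five-axis device matrix named in the comment above. A hedged reading of how those strategies could map onto the axes (the construction below is an assumption; only the axis ordering comes from the comment, and the strategy values are illustrative):

# Axes per the comment: batch block_size num_blocks seq_len hidden_size,
# indexed 4 3 2 1 0.  Strategy tuples and the mapping itself are assumptions.
query_strategy = (2, 1, 4)  # (batch, seq_len, hidden_size)
cache_strategy = (1, 1, 4)  # (block_size, num_blocks, hidden_size)
dev_matrix = (query_strategy[0], cache_strategy[0], cache_strategy[1],
              query_strategy[1], query_strategy[2])
print(dev_matrix)           # (2, 1, 1, 1, 4)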
diff --git a/mindspore/ccsrc/frontend/parallel/step_parallel.cc b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
index 49c9ed6212d..c3a658d25fe 100644
--- a/mindspore/ccsrc/frontend/parallel/step_parallel.cc
+++ b/mindspore/ccsrc/frontend/parallel/step_parallel.cc
@@ -834,12 +834,7 @@ void InsertVirtualOutput(const FuncGraphPtr &root, const std::vector<AnfNodePtr> &all_nodes) {
     OperatorAttrs attrs;
     OperatorArgs args = std::make_pair(attrs, params);
     Operator op = std::make_pair(VIRTUAL_OUTPUT, args);
-    // Temporarily circumvent the MatmulQkv problem, and then modify it
-    auto cnode = dyn_cast_ptr<CNode>(out_node);
-    const auto &input = cnode->input(0);
-    MS_EXCEPTION_IF_NULL(input);
-    auto prim = GetValuePtr<Primitive>(input);
-    if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple) || prim->name() == "MatmulQkv") {
+    if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) {
       auto tuple = out_node->cast<CNodePtr>();
       MS_EXCEPTION_IF_NULL(tuple);
       for (size_t i = 1; i < tuple->size(); ++i) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
index 771483f8977..311263c8ee5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/device/ascend_device_address.cc
@@ -763,9 +763,7 @@ bool AscendDeviceAddress::AsyncDeviceToDevice(const ShapeVector & /* shape */, size_t size,
   return ret;
 }
 
-
-bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
-                                            const void *host_ptr) const {
+bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */, const void *host_ptr) const {
   MS_ERROR_IF_NULL(host_ptr);
   BindDevice();
   if (!MoveToDevice(false)) {
@@ -779,7 +777,8 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
   MS_EXCEPTION_IF_NULL(runtime_instance);
 
-  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE, runtime_instance->compute_stream());
+  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE,
+                             runtime_instance->compute_stream());
   if (ret != ACL_ERROR_NONE) {
     MS_LOG(ERROR) << "Call aclrtMemcpyAsync host to device failed, the error num[" << ret << "]";
     return false;
@@ -787,7 +786,6 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
   return true;
 }
 
-
 bool AscendDeviceAddress::AsyncHostToDevice(const ShapeVector & /* shape */, size_t size, TypeId /* type */,
                                             const void *host_ptr, size_t stream_id) const {
   MS_ERROR_IF_NULL(host_ptr);
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
index cc0fa4c4a69..700987c6606 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/CMakeLists.txt
@@ -17,7 +17,7 @@ if(DEFINED ENV{MS_INTERNAL_KERNEL_HOME})
         ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/collective_communication_lib.cc
         ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/communication_group.cc)
     set_property(SOURCE ${LOWLATENCY_COLLECTIVE_SRCS} PROPERTY COMPILE_DEFINITIONS
-        SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
+                 SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
     add_library(lowlatency_collective SHARED ${LOWLATENCY_COLLECTIVE_SRCS})
     target_link_libraries(lowlatency_collective PRIVATE lcal)
 endif()
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
index 3bc396d539f..ef1dfa7037a 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.cc
@@ -246,12 +246,13 @@ void GeDeviceResManager::CreateSessionAndGraphRunner() {
 }
 
 bool GeDeviceResManager::LoadCollectiveCommLib() {
-  // If this is simulation, don't load any collective communication library.
+  // If this is a simulation, load the dummy collective communication library.
   if (!common::GetEnv(kSimulationLevel).empty()) {
+    collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
    return true;
   }
 
   // Ascend backend supports HCCL and LCCL collective communication libraries.
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     std::string lowlatency_comm_lib_name = "liblowlatency_collective.so";
     auto loader = std::make_shared<CollectiveCommLibLoader>(lowlatency_comm_lib_name);
     MS_EXCEPTION_IF_NULL(loader);
@@ -265,7 +266,7 @@ bool GeDeviceResManager::LoadCollectiveCommLib() {
     auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
     collective_comm_lib_ = instance_func();
     MS_EXCEPTION_IF_NULL(collective_comm_lib_);
-    MS_LOG(WARNING) << "Loading LCCL because env ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
+    MS_LOG(WARNING) << "Loading LCCL because env MS_ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
                        "single-node-multi-card mode in KernelByKernel for now.";
   } else {
     collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
index 5a315db6ba2..9f331919768 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/ge_device_res_manager.h
@@ -73,18 +73,7 @@ class GeDeviceResManager : public DeviceResManager {
 
   static void CreateSessionAndGraphRunner();
 
-<<<<<<< HEAD
-  bool LoadCollectiveCommLib() override {
-    if (common::GetEnv(kSimulationLevel).empty()) {
-      collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
-    } else {
-      collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
-    }
-    return true;
-  }
-=======
   bool LoadCollectiveCommLib() override;
->>>>>>> Add lccl so.
 
   void ResetStreamAndCtx() override;
 
   bool BindDeviceToCurrentThread(bool force_bind) const override;
diff --git a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
index 9d032133cd3..64bed21e6cc 100644
--- a/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
+++ b/mindspore/ccsrc/plugin/device/ascend/hal/hardware/lowlatency_communication_group.h
@@ -44,7 +44,7 @@ class LowlatencyCommunicationGroup : public CommunicationGroup {
 
   void *GenerateRootInfo(size_t *root_info_size) override;
 
-  // Return communicator for collective communcation ops.
+  // Return communicator for collective communication ops.
   const LcclPtr &lccl_communicator() const;
 
   // Return communicator of lcal.
   const LcalCommPtr &lcal_comm() const;
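The opt-in switch is renamed from ENABLE_LCCL to MS_ENABLE_LCCL here and in every kernel below, so launchers must export the new name. A minimal sketch mirroring the updated st tests (the test script name is a placeholder):

import os

# MS_ENABLE_LCCL replaces ENABLE_LCCL; per the warning above, LCCL currently
# supports only single-node-multi-card KernelByKernel execution.
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
# "test_my_lccl_case.py" is a hypothetical script, following tests/st/lccl.
os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_my_lccl_case.py")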
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
index b61676946b3..77873bbb357 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hccl_kernel.cc
@@ -135,7 +135,7 @@ bool HcclKernel::Init(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {
 
   if (common::GetEnv(kSimulationLevel).empty() && !common::IsNeedProfileMemory()) {
 #ifdef ENABLE_INTERNAL_KERNELS
-    if (!common::GetEnv("ENABLE_LCCL").empty()) {
+    if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
       LoadLcclLibrary();
     } else {
       LoadHcclLibrary();
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
index fe84cf9ff4e..162520518f9 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_broadcast.cc
@@ -31,7 +31,7 @@ bool HcomAllBroadCastKernel::Launch(const std::vector<KernelTensor *> &inputs, c
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result =
       lccl_comm_->Broadcast(inputs[0]->device_ptr(), hccl_count_, hccl_data_type_list_[0], root_id_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
index e4c52b2c59c..9061fa4e93e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_gather.cc
@@ -32,7 +32,7 @@ bool HcomAllGatherKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->AllGather(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                              hccl_data_type_list_[0], stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
index c51ae577988..c0ad808303e 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce.cc
@@ -38,7 +38,7 @@ bool HcomAllReduceKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->AllReduce(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                              hccl_data_type_list_[0], op_type_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
index c47383773e6..c65e2d03564 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_all_reduce_scatter.cc
@@ -32,7 +32,7 @@ bool HcomAllReduceScatterKernel::Launch(const std::vector<KernelTensor *> &inputs,
   MS_EXCEPTION_IF_NULL(stream_ptr);
 
 #ifdef ENABLE_INTERNAL_KERNELS
-  if (!common::GetEnv("ENABLE_LCCL").empty()) {
+  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
     auto lccl_result = lccl_comm_->ReduceScatter(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                                  hccl_data_type_list_[0], op_type_, stream_ptr);
     if (lccl_result != Lcal::LCAL_SUCCESS) {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
index 232d34c0c09..03f1b37b0b1 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc
@@ -62,7 +62,7 @@ int HcomMatMulAllReduceKernel::Resize(const std::vector<KernelTensor *> &inputs,
   // The dimensions of left and right matrices.
   matmul_info_.m = hccl_kernel_input_shape_list_[0][0];
   matmul_info_.k = hccl_kernel_input_shape_list_[0][1];
-  matmul_info_.n = hccl_kernel_input_shape_list_[1][0];
+  matmul_info_.n = hccl_kernel_input_shape_list_[1][1];
   matmul_info_.transA = transpose_a_;
   matmul_info_.transB = transpose_b_;
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
index 5ecb10224fc..4cd287222de 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.h
@@ -26,7 +26,6 @@ namespace kernel {
 constexpr uint32_t kMatMulAllReduceInputNum = 2;
 constexpr uint32_t kMatMulAllReduceOutputNum = 1;
 constexpr char kAttrNameTransposeA[] = "transpose_a";
-;
 constexpr char kAttrNameTransposeB[] = "transpose_b";
 
 class HcomMatMulAllReduceKernel : public HcclKernel {
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
index 7f812caab12..947365f97e3 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/elewise_binary.cc
@@ -85,7 +85,7 @@ class InternalSub : public ElewiseBinary {
     param_ptr->input2_dtype_ = InternalKernelUtils::ToInternalDType(inputs[kIndex1]->dtype_id());
     param_ptr->input1_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex0]->GetShapeVector());
     param_ptr->input2_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex1]->GetShapeVector());
-    
+
     return std::static_pointer_cast<internal::OpParam>(param_ptr);
   }
 };
diff --git a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
index 35432c4416f..78929f430f3 100644
--- a/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/kernel/internal/internal_kernel_mod.cc
@@ -105,7 +105,6 @@ int InternalKernelMod::Resize(const std::vector<KernelTensor *> &inputs, const s
     }
   }
   std::vector input_shapes(inputs_.size());
-  std::vector output_shapes;
   for (auto iter = inputsIdxMap_.begin(); iter != inputsIdxMap_.end(); iter++) {
     InternalKernelUtils::ToInternalTensor(inputs_[iter->second], inputs[iter->first]);
     input_shapes[iter->second] = inputs_[iter->second]->desc.dims;
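The [1][0] → [1][1] change in hcom_matmul_all_reduce.cc above fixes which axis of the weight is taken as n: for C = A(m, k) x B(k, n), n is the weight's second dimension, not its first. A quick check of the shape arithmetic, assuming transpose_b is false (sizes are illustrative):

import numpy as np

m, k, n = 8, 32, 16                      # illustrative sizes
a, b = np.ones((m, k)), np.ones((k, n))  # b is the weight, shape (k, n)
# The old code read b.shape[0], which is k; the fix reads b.shape[1], which is n.
assert a.dot(b).shape == (m, b.shape[1])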
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
index 71cce9ec90e..830bb2ac49f 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/backend_common_unify_mindir.cc
@@ -136,13 +136,9 @@ void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
 #ifdef ENABLE_INTERNAL_KERNELS
   (*unify_mindir_pm)->AddPass(std::make_shared());
   (*unify_mindir_pm)->AddPass(std::make_shared());
-  if (common::GetEnv("MS_ENABLE_INTERNAL_KERNELS") == "on") {
-    (*unify_mindir_pm)->AddPass(std::make_shared());
-  }
+  (*unify_mindir_pm)->AddPass(std::make_shared());
   (*unify_mindir_pm)->AddPass(std::make_shared());
-  if (common::GetEnv("ENABLE_MATMUL_ALLREDUCE") == "on") {
-    (*unify_mindir_pm)->AddPass(std::make_shared());
-  }
+  (*unify_mindir_pm)->AddPass(std::make_shared());
 #endif  // ENABLE_INTERNAL_KERNELS
 }
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
index 17a430139fa..d1f591e1bc7 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.cc
@@ -15,11 +15,13 @@
  */
 
 #include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
+#include <string>
 #include "mindspore/core/ops/nn_ops.h"
 #include "mindspore/core/ops/math_ops.h"
 #include "mindspore/core/ops/other_ops.h"
 #include "mindspore/core/ops/lite_ops.h"
+#include "mindspore/core/utils/ms_context.h"
 #include "include/backend/optimizer/helper.h"
 #include "include/backend/anf_runtime_algorithm.h"
 #include "include/common/utils/anfalgo.h"
@@ -101,6 +103,16 @@ AnfNodePtr MatMulAllReduceFusion::CreateMatMulAllReduceNode(const FuncGraphPtr &
 const AnfNodePtr MatMulAllReduceFusion::Process(const mindspore::FuncGraphPtr &func_graph,
                                                 const mindspore::AnfNodePtr &node,
                                                 const mindspore::EquivPtr &equiv) const {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost()) {
+    return nullptr;
+  }
+
+  if (common::GetEnv("DISABLE_MATMULALLREDUCE_FUSION") == "True") {
+    return nullptr;
+  }
+
   if (func_graph == nullptr || node == nullptr) {
     return nullptr;
   }
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
index 7e4c8e8037d..0f12f996fca 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h
@@ -16,6 +16,7 @@
 #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
 
+#include <string>
 #include <memory>
 
 #include "include/backend/optimizer/optimizer.h"
@@ -24,7 +25,7 @@ namespace opt {
 class MatMulAllReduceFusion : public PatternProcessPass {
  public:
   explicit MatMulAllReduceFusion(bool multigraph = true, const string &pass_name = "MatMulAllReduce")
-      : PatternProcessPass(pass_name, multigraph){};
+      : PatternProcessPass(pass_name, multigraph) {}
   ~MatMulAllReduceFusion() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &equiv) const override;
@@ -46,4 +47,4 @@ class MatMulAllReduceFusion : public PatternProcessPass {
 }  // namespace opt
 }  // namespace mindspore
 
-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
\ No newline at end of file
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
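The pass is no longer toggled by the ENABLE_MATMUL_ALLREDUCE env var: it now requires the infer-boost runtime flag and honors a per-process opt-out. A hedged Python restatement of the two early returns added above (both names are taken from the hunk):

import os

def should_run_matmul_allreduce_fusion(infer_boost_enabled):
    # Mirrors the early returns in MatMulAllReduceFusion::Process above.
    if not infer_boost_enabled:
        return False
    if os.environ.get("DISABLE_MATMULALLREDUCE_FUSION") == "True":
        return False
    return True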
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
index e54392bba0c..70a88644ab0 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.cc
@@ -15,13 +15,19 @@
  */
 
 #include "plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.h"
+#include "mindspore/core/utils/ms_context.h"
+
 namespace mindspore {
 namespace opt {
 bool MultiMatmulsFusion::Run(const FuncGraphPtr &graph) {
   bool changed = false;
-  if (common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost() || common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
     return changed;
   }
+
   auto mng = graph->manager();
   MS_EXCEPTION_IF_NULL(mng);
   const auto &node_users_map = mng->node_users();
diff --git a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
index 6bd6cd24338..5cfb7c1a4c5 100644
--- a/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
+++ b/mindspore/ccsrc/plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.cc
@@ -22,6 +22,7 @@
 #include "mindspore/core/ops/reshape_ext.h"
 #include "mindspore/core/ops/scalar_graph_holder.h"
 #include "mindspore/core/ops/array_ops.h"
+#include "mindspore/core/utils/ms_context.h"
 #include "include/common/utils/anfalgo.h"
 #include "mindspore/ccsrc/include/common/utils/utils.h"
 #include "plugin/device/ascend/optimizer/get_value_helper.h"
@@ -200,6 +201,12 @@ const BaseRef ShapeReshapeFusion::DefinePattern() const {
 
 const AnfNodePtr ShapeReshapeFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                              const EquivPtr &equiv) const {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  if (!ms_context->IsEnableInferBoost()) {
+    return nullptr;
+  }
+
   MS_EXCEPTION_IF_NULL(func_graph);
   MS_EXCEPTION_IF_NULL(equiv);
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
index 6e5f39a44eb..c89a758ba9a 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_prepare_actor.cc
@@ -825,9 +825,9 @@ void DataPrepareActor::PrepareDataForHostTensorQueueNew(const VectorRef &args, OpContext<DeviceTensor> *const
                   << " for input parameter:" << origin_parameter->fullname_with_scope();
 
     if (!isDyn) {
-      if(host_tensors_[tensor_position] != input_tensor->shape()) {
-          isDyn = true;
-      }
+      if (host_tensors_[tensor_position] != input_tensor->shape()) {
+        isDyn = true;
+      }
     }
     host_tensors_[tensor_position] = input_tensor->shape();
     host_tensors[tensor_position] = input_tensor;
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
index 627dd4c2d39..24da68b61ff 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/data_source_actor.cc
@@ -23,6 +23,7 @@
 #include "mindrt/include/async/async.h"
 #include "utils/log_adapter.h"
 #include "kernel/common_utils.h"
+#include "mindspore/core/utils/ms_context.h"
 
 namespace mindspore {
 namespace runtime {
@@ -278,6 +279,8 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const context) {
 }
 
 void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
   MS_EXCEPTION_IF_NULL(context);
   if (IsRunningFailed(context)) {
     return;
@@ -328,11 +331,20 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
       continue;
     }
 
-
-    if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
-                                          host_tensor->data_ptr()->data())) {
-      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+    if (ms_context->IsEnableInferBoost()) {
+      if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
+                                            host_tensor->data_ptr()->data())) {
+        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+      }
+    } else {
+      if (!device_tensor->SyncHostToDevice(
+            trans::GetRuntimePaddingShape(data_node_with_indexs_[i].first, data_node_with_indexs_[i].second),
+            LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
+            host_tensor->device_info().host_format_, host_tensor->data_ptr())) {
+        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
+      }
     }
+
     if (IsDynamic(device_tensor->host_shape())) {
       device_tensor->set_host_shape(host_tensor->shape());
     }
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
index 4d3f4eea34c..613f6e2ab82 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/kernel_infer_actor.cc
@@ -27,7 +27,6 @@ void KernelInferActor::Init() {
   if (memory_free_list_.size() > input_num) {
     memory_free_list_.erase(memory_free_list_.begin() + input_num, memory_free_list_.end());
   }
-
 }
 
 void KernelInferActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) {
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
index 57d61aa340a..21c7fc9e444 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/super_kernel_actor.cc
@@ -211,8 +211,6 @@ void SuperKernelActor::Run(OpContext<DeviceTensor> *const context) {
     return RunGraphKernelByKernel(context);
   }
 
-  device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddTask, GetAID().Name(), "", graph_->ToString());
-
   if (device_contexts_.empty() || device_contexts_[0] == nullptr) {
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Invalid device context for super kernel actor:" + GetAID().Name());
   }
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc b/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
index 061181d3496..434ecd79c74 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/graph_scheduler.cc
@@ -662,14 +662,14 @@ void GraphScheduler::SpawnMultiPipelineActor(ActorSet *const actor_set, ActorThreadPool *const thread_pool) {
   }
 
   // If enable runtime multi pipeline, async launch kernel will be enabled.
-<<<<<<< HEAD
-  ActorDispatcher::set_enable_runtime_multi_pipeline(EnableRuntimePipeline() && actor_set->has_dynamic_shape_ &&
-                                                     !actor_set->kernel_actors_.empty() &&
-                                                     default_actor_thread_num_ > kMultiPipelineThreadNum);
->>>>>>> 80d1685cc13... lzy: Support run graph without kernel actor for kbyk mode
+  ActorDispatcher::set_enable_runtime_multi_pipeline(
+    enable_runtime_pipeline && actor_set->has_dynamic_shape_ &&
+    (EnableKbkSubGraphExecute() || !actor_set->kernel_actors_.empty()) &&
+    default_actor_thread_num_ > kMultiPipelineThreadNum);
 
   if (ActorDispatcher::enable_runtime_multi_pipeline() && !already_spawn_kernel_async_infer_resize_actor_) {
     size_t current_actor_thread_num = thread_pool->GetActorThreadNum();
     MS_LOG(INFO) << "Enable runtime multi pipeline, default actor thread num: " << default_actor_thread_num_
+                 << ", current actor thread num: " << current_actor_thread_num;
     if (current_actor_thread_num != default_actor_thread_num_) {
       thread_pool->SetActorThreadNum(default_actor_thread_num_);
       MS_LOG(DEBUG) << "Reset actor thread number to: " << default_actor_thread_num_;
diff --git a/mindspore/core/ops/fusion/matmul_allreduce.cc b/mindspore/core/ops/fusion/matmul_allreduce.cc
index e7176bc2940..d47bd5a6318 100644
--- a/mindspore/core/ops/fusion/matmul_allreduce.cc
+++ b/mindspore/core/ops/fusion/matmul_allreduce.cc
@@ -26,6 +26,5 @@
 namespace mindspore {
 namespace ops {
 MIND_API_OPERATOR_IMPL(MatMulAllReduce, MatMul);
-REGISTER_PRIMITIVE_C(kMatMulAllReduce, MatMulAllReduce);
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/mat_mul.cc b/mindspore/core/ops/mat_mul.cc
index 62066ddeec6..ce4046d56ec 100644
--- a/mindspore/core/ops/mat_mul.cc
+++ b/mindspore/core/ops/mat_mul.cc
@@ -34,8 +34,10 @@
 #include "mindapi/src/helper.h"
 #include "mindspore/core/ops/math_ops.h"
 #include "ops/mat_mul.h"
+#include "ops/fusion/matmul_allreduce.h"
 #include "ops/op_name.h"
 #include "ops/primitive_c.h"
+#include "ops/lite_ops.h"
 #include "utils/check_convert_utils.h"
 #include "utils/convert_utils_base.h"
 #include "utils/log_adapter.h"
@@ -169,5 +171,6 @@ class MatMulInfer : public abstract::OpInferBase {
 };
 
 REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMul, prim::kPrimMatMul, MatMulInfer, false);
+REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMulAllReduce, prim::kPrimMatMulAllReduce, MatMulInfer, false);
 }  // namespace ops
 }  // namespace mindspore
diff --git a/mindspore/core/ops/math_ops.h b/mindspore/core/ops/math_ops.h
index f9e316ace95..cf9c64e3426 100644
--- a/mindspore/core/ops/math_ops.h
+++ b/mindspore/core/ops/math_ops.h
@@ -62,8 +62,10 @@ GVAR_DEF(PrimitivePtr, kPrimTensorAdd, std::make_shared<Primitive>("TensorAdd"))
 GVAR_DEF(PrimitivePtr, kPrimAddV2, std::make_shared<Primitive>(kAddV2OpName));
 GVAR_DEF(PrimitivePtr, kPrimAddLayerNorm, std::make_shared<Primitive>("AddLayerNorm"));
 GVAR_DEF(PrimitivePtr, kPrimAddRmsNorm, std::make_shared<Primitive>("AddRmsNorm"));
+GVAR_DEF(PrimitivePtr, kPrimMatMul, std::make_shared<Primitive>("MatMul"));
 GVAR_DEF(PrimitivePtr, kPrimMatMulV2, std::make_shared<Primitive>("MatMulV2"));
 GVAR_DEF(PrimitivePtr, kPrimMatrixDiag, std::make_shared<Primitive>("MatrixDiag"));
+GVAR_DEF(PrimitivePtr, kPrimBatchMatMul, std::make_shared<Primitive>("BatchMatMul"));
 GVAR_DEF(PrimitivePtr, kPrimBatchMatMulV2, std::make_shared<Primitive>("BatchMatMulV2"));
 GVAR_DEF(PrimitivePtr, kPrimFusedMatMulBiasAdd, std::make_shared<Primitive>("FusedMatMulBiasAdd"));
 GVAR_DEF(PrimitivePtr, kPrimMinimumGradGrad, std::make_shared<Primitive>("MinimumGradGrad"));
diff --git a/mindspore/core/utils/ms_context.h b/mindspore/core/utils/ms_context.h
index 1fc46b3a758..6beb5160a7a 100644
--- a/mindspore/core/utils/ms_context.h
+++ b/mindspore/core/utils/ms_context.h
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "utils/log_adapter.h"
 #include "utils/ms_utils.h"
@@ -402,7 +403,7 @@ inline void MsContext::increase_param<uint32_t>(MsCtxParam param) {
   uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]++;
 }
 
-// decreate method implementation for type uint32_t
+// decrease method implementation for type uint32_t
 template <>
 inline void MsContext::decrease_param<uint32_t>(MsCtxParam param) {
   uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]--;
diff --git a/mindspore/python/mindspore/common/api.py b/mindspore/python/mindspore/common/api.py
index fcf0c7583a1..861b5b6de4b 100644
--- a/mindspore/python/mindspore/common/api.py
+++ b/mindspore/python/mindspore/common/api.py
@@ -302,12 +302,13 @@ def _handle_arg_predict(obj, arg, compile_arg):
         return None
 
     if isinstance(arg, (list, tuple)):
-        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and getattr(compile_arg, "__ms_mutable__"):
+        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and \
+                getattr(compile_arg, "__ms_mutable__"):
             # mutable([]) will be eliminated by FuncGraphSpecializer, and empty list is not supported by backend.
             if isinstance(arg, list) and not arg:
                 return None
             return arg
-        elif hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
+        if hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
                 _check_all_tensor(arg):
             return arg
     return None
diff --git a/mindspore/python/mindspore/common/jit_config.py b/mindspore/python/mindspore/common/jit_config.py
index 1631ab38d7c..0f6db062f5d 100644
--- a/mindspore/python/mindspore/common/jit_config.py
+++ b/mindspore/python/mindspore/common/jit_config.py
@@ -60,7 +60,7 @@ class JitConfig:
         infer_boost (str, optional): enable infer boost mode. The value must be ``"on"`` , ``"off"``.
             Default to an "off", which means that disable infer boost.
-            when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make 
+            when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make
             infer speed is best.
             Note: current infer boost only support jit_level == O0 and device is Ascend910B
             Default: ``"off"`` .
@@ -78,16 +78,17 @@ class JitConfig:
         >>>
         >>> net.set_jit_config(jitconfig)
     """
-    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE", infer_boost="off", **kwargs):
+    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE",
+                 infer_boost="off", **kwargs):
         if jit_level not in ["", "O0", "O1", "O2"]:
             raise ValueError("For 'jit_level' must be one of ['O0', 'O1', 'O2'].")
-        if exc_mode not in ['auto', 'sink', 'no_sink']:
+        if exc_mode not in ["auto", "sink", "no_sink"]:
            raise ValueError("For 'exc_mode' must be one of '['auto', 'sink', 'no_sink']'.")
-        if jit_syntax_level != "" and jit_syntax_level not in ['STRICT', 'COMPATIBLE', 'LAX']:
+        if jit_syntax_level != "" and jit_syntax_level not in ["STRICT", "COMPATIBLE", "LAX"]:
             raise ValueError("For 'jit_syntax_level' must be one of '['STRICT', 'LAX']'.")
-        if debug_level not in ['RELEASE', 'DEBUG']:
+        if debug_level not in ["RELEASE", "DEBUG"]:
             raise ValueError("For 'debug_level' must be one of '['RELEASE', 'DEBUG']'.")
-        if infer_boost != "" and infer_boost not in ['on', 'off']:
+        if infer_boost != "" and infer_boost not in ["on", "off"]:
             raise ValueError("For 'infer_boost' must be one of '['on', 'off']'.")
         self.jit_config_dict = kwargs
         self.jit_config_dict["jit_level"] = jit_level
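With the validation above in place, the new flag is driven from Python as below; per the docstring, infer boost currently requires jit_level "O0" on Ascend910B. A minimal usage sketch (the network class is a hypothetical stand-in, and it assumes a MindSpore build that includes this patch):

from mindspore import nn, JitConfig

class TinyNet(nn.Cell):  # hypothetical stand-in for a predict network
    def construct(self, x):
        return x

net = TinyNet()
net.set_jit_config(JitConfig(jit_level="O0", infer_boost="on"))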
diff --git a/mindspore/python/mindspore/nn/cell.py b/mindspore/python/mindspore/nn/cell.py
index 87cea23061c..41c8df35c73 100755
--- a/mindspore/python/mindspore/nn/cell.py
+++ b/mindspore/python/mindspore/nn/cell.py
@@ -667,6 +667,8 @@ class Cell(Cell_):
 
     def _predict(self, *args, **kwargs):
+        if not hasattr(self, "phase"):
+            return False, None
         if (self.phase == "prefill" or self.phase == 'increment') and self.phase in self.phase_cache:
             new_args = _get_args_for_run_predict(self, args, kwargs, self._compile_args)
             res = _cell_graph_executor._graph_executor(tuple(new_args), self.phase_cache[self.phase])
@@ -687,7 +689,7 @@ class Cell(Cell_):
         if predict_compiled:
             return res
         self._check_construct_args(*args)
-        
+
         if self._hook_fn_registered():
             logger.warning(f"For 'Cell', it's not support hook function in graph mode. If you want to use hook "
                            f"function, please use context.set_context to set pynative mode.")
@@ -993,9 +995,9 @@ class Cell(Cell_):
             kwargs (dict): Kwargs of the Cell object.
         """
         if self.phase == "prefill":
-            os.environ["ENABLE_MATMUL_ALLREDUCE"] = "on"
+            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "False"
         else:
-            os.environ["ENABLE_MATMUL_ALLREDUCE"] = ""
+            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "True"
         self._compile_args = self._get_compile_args(args)
         _cell_graph_executor.compile(self, *self._compile_args, phase=self.phase,
                                      jit_config_dict=self._jit_config_dict, **kwargs)
diff --git a/mindspore/python/mindspore/ops/operations/__init__.py b/mindspore/python/mindspore/ops/operations/__init__.py
index bf78db7aacf..60e17aa02aa 100644
--- a/mindspore/python/mindspore/ops/operations/__init__.py
+++ b/mindspore/python/mindspore/ops/operations/__init__.py
@@ -118,7 +118,7 @@ from .nn_ops import (LSTM, SGD, Adam, AdamWeightDecay, FusedSparseAdam, FusedSpa
                      FractionalMaxPool, FractionalMaxPool3DWithFixedKsize, FractionalMaxPoolWithFixedKsize,
                      GridSampler2D, TripletMarginLoss, UpsampleNearest3D, UpsampleTrilinear3D, PadV3,
                      ChannelShuffle, GLU, MaxUnpool3D, Pdist, RmsNorm, PagedAttention, PagedAttentionMask, ReshapeAndCache,
-                     ApplyRotaryPosEmb, MatmulQkv, MatmulFfn)
+                     ApplyRotaryPosEmb)
 from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode,
                         ConfusionMatrix, UpdateState, Load, StopGradient, Reusing,
                         CheckValid, Partial, Depend, Push, Pull, PyExecute, PyFunc, _DynamicLossScale,
@@ -695,8 +695,6 @@ __all__ = [
     "ReshapeAndCache",
     "ApplyRotaryPosEmb",
     "RmsNorm",
-    "MatmulQkv",
-    "MatmulFfn",
 ]
 
 __custom__ = [
diff --git a/mindspore/python/mindspore/ops/operations/nn_ops.py b/mindspore/python/mindspore/ops/operations/nn_ops.py
index 13303bce8b8..fa26de101a9 100644
--- a/mindspore/python/mindspore/ops/operations/nn_ops.py
+++ b/mindspore/python/mindspore/ops/operations/nn_ops.py
@@ -10196,27 +10196,3 @@ class RmsNorm(Primitive):
         """Initialize Dense."""
         validator.check_value_type("epsilon", epsilon, [float], self.name)
         self.init_prim_io_names(inputs=['x', 'gamma'], outputs=["y", "rstd"])
-
-
-class MatmulQkv(Primitive):
-    r"""
-    Fuse three matmul ops for q k v attention into one
-    """
-
-    @prim_attr_register
-    def __init__(self):
-        """Initialize"""
-        self.init_prim_io_names(inputs=['hidden_states', 'weight_q', 'weight_k', 'weight_v'],
-                                outputs=["output_q", "output_k", "output_v"])
-
-
-class MatmulFfn(Primitive):
-    r"""
-    Fuse two matmul ops for feed forward into one
-    """
-
-    @prim_attr_register
-    def __init__(self):
-        """Initialize"""
-        self.init_prim_io_names(inputs=['hidden_states', 'weight_gate', 'weight_up'],
-                                outputs=["output_gate", "output_up"])
diff --git a/scripts/build/check_and_build_ms_kernels_internal.sh b/scripts/build/check_and_build_ms_kernels_internal.sh
index e38689250ac..8bf33778f72 100644
--- a/scripts/build/check_and_build_ms_kernels_internal.sh
+++ b/scripts/build/check_and_build_ms_kernels_internal.sh
@@ -17,26 +17,26 @@ if [ "$(uname)" == Linux ]; then
   if [ -n "${MS_INTERNAL_KERNEL_HOME}" ]; then
     echo "Use local MS_INTERNAL_KERNEL_HOME : ${MS_INTERNAL_KERNEL_HOME}"
-  else
-    file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
-    lib_file=${file_path}/ms_kernels_internal.tar.gz
-    if [ -f "${lib_file}" ]; then
-      file_lines=`cat "${lib_file}" | wc -l`
-      if [ ${file_lines} -ne 3 ]; then
-        tar -zxf ${lib_file} -C ${file_path}
-        if [ $? -eq 0 ]; then
-          echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
-          export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
-          echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
-        else
-          echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
-        fi
-      else
-        echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
-        echo "[WARNING] `git lfs install` and retry downloading using `git lfs pull`."
-      fi
-    else
-      echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
-    fi
+  # else
+  #   file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
+  #   lib_file=${file_path}/ms_kernels_internal.tar.gz
+  #   if [ -f "${lib_file}" ]; then
+  #     file_lines=`cat "${lib_file}" | wc -l`
+  #     if [ ${file_lines} -ne 3 ]; then
+  #       tar -zxf ${lib_file} -C ${file_path}
+  #       if [ $? -eq 0 ]; then
+  #         echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
+  #         export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
+  #         echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
+  #       else
+  #         echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
+  #       fi
+  #     else
+  #       echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
+  #       echo "[WARNING] 'git lfs install' and retry downloading using 'git lfs pull'."
+  #     fi
+  #   else
+  #     echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
+  #   fi
   fi
 fi
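With the unpack fallback commented out, the prebuilt internal-kernel package must be supplied explicitly via MS_INTERNAL_KERNEL_HOME before building. A hedged sketch of the manual setup (the path is illustrative, and invoking the script directly is an assumption about the build flow):

import os
import subprocess

# The script above now only honors a pre-set MS_INTERNAL_KERNEL_HOME.
os.environ["MS_INTERNAL_KERNEL_HOME"] = "/path/to/ms_kernels_internal"
subprocess.run(["bash", "scripts/build/check_and_build_ms_kernels_internal.sh"], check=True)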
diff --git a/tests/st/lccl/test_all.py b/tests/st/lccl/test_all.py
index 4c55289b88d..94b3b1058d6 100644
--- a/tests/st/lccl/test_all.py
+++ b/tests/st/lccl/test_all.py
@@ -27,7 +27,7 @@ def test_lccl_allreduce():
     Description: msrun lccl all_reduce 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allreduce.py")
@@ -44,7 +44,7 @@ def test_lccl_allgather():
     Description: msrun lccl all_gather 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allgather.py")
@@ -61,7 +61,7 @@ def test_lccl_reducescatter():
     Description: msrun lccl reduce_scatter 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True "
                             "pytest -s test_lccl_reducescatter.py")
@@ -79,7 +79,7 @@ def test_lccl_broadcast():
     Description: msrun lccl broadcast 8P case.
     Expectation: success
     """
-    os.environ['ENABLE_LCCL'] = str(1)
+    os.environ['MS_ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_broadcast.py")
diff --git a/tests/st/lccl/test_lccl_allreduce.py b/tests/st/lccl/test_lccl_allreduce.py
index 8d60fc0a622..d239b6aa2c8 100644
--- a/tests/st/lccl/test_lccl_allreduce.py
+++ b/tests/st/lccl/test_lccl_allreduce.py
@@ -24,7 +24,6 @@ from mindspore.common.initializer import initializer
 from mindspore.common.parameter import Parameter
 from mindspore.communication.management import init, HCCL_WORLD_COMM_GROUP, get_rank, get_group_size
 from mindspore.ops import operations as P
-from mindspore.ops.operations import _inner_ops as inner
 
 context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
diff --git a/tests/st/msrun/test_entry_msrun.py b/tests/st/msrun/test_entry_msrun.py
index c26dd33e010..5ef228678be 100644
--- a/tests/st/msrun/test_entry_msrun.py
+++ b/tests/st/msrun/test_entry_msrun.py
@@ -27,7 +27,6 @@ def test_msrun():
     Description: Launch distributed training job with dynamic cluster using msrun.
     Expectation: All workers are successfully spawned and running training.
     """
-    os.environ['ENABLE_LCCL'] = str(1)
     os.environ['GRAPH_OP_RUN'] = str(1)
     return_code = os.system(
         "msrun --worker_num=4 --local_worker_num=4 --master_addr=127.0.0.1 "\
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index bec86c14f3b..4ff8412bb3a 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -263,6 +263,8 @@
 list(REMOVE_ITEM MINDSPORE_SRC_LIST
      "../../../mindspore/ccsrc/runtime/graph_scheduler/actor/embedding_cache/embedding_cache_prefetch_actor.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST
      "../../../mindspore/ccsrc/plugin/device/ascend/hal/profiler/parallel_strategy_profiling.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST
+     "../../../mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc")
 
 add_library(_ut_mindspore_obj STATIC ${MINDSPORE_SRC_LIST} $ $ $ $