Bugfix after rebase onto 2.3:

1. unify fusion passes
2. fix host-to-device copy path in data_source_actor.cc
3. remove MatmulQkv/MatmulFfn Python nn_ops and parallel info classes
4. codecheck cleanups
5. disable internal kernels compile temporarily
6. use dummy Ascend collective comm lib in simulation mode
7. fix MatMulAllReduce matmul_info_.n dimension
8. rename env var ENABLE_LCCL to MS_ENABLE_LCCL

UT build: exclude hcom_matmul_all_reduce.cc
bugfix in cell.py Cell._predict
re-apply matmul_allreduce fusion
zhengzuohe 2024-04-16 18:21:25 +08:00
parent 66183a6817
commit 6073f75254
44 changed files with 124 additions and 414 deletions

View File

@ -1,82 +0,0 @@
/**
* Copyright 2024 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/ops_info/matmul_ffn_info.h"
#include "frontend/parallel/dynamic_creator.h"
namespace mindspore {
namespace parallel {
// MatMulFfn has 3 inputs and 2 outputs
// x: (batch * seq_len (inc is 1), hidden_size)
// weight_1: (weight_1_hidden_size, hidden_size)
// weight_2: (weight_2_hidden_size, hidden_size)
// ------------------------------
// output_1: (batch * seq_len (inc is 1), weight_1_hidden_size)
// output_2: (batch * seq_len (inc is 1), weight_2_hidden_size)
constexpr size_t kMatMulFfnOutputSize = 2;
Status MatmulFfnInfo::CheckStrategy(const StrategyPtr &strategy) {
if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
return FAILED;
}
// TODO
return SUCCESS;
}
Status MatmulFfnInfo::InferDevMatrixShape() {
auto input_strategies = strategy()->GetInputDim();
auto x = input_strategies.at(0); // (batch * seq_len, hidden_size)
auto weight_1 = input_strategies.at(1);
// dp mp
// 1 0
dev_matrix_shape_ = {x.at(0), weight_1.at(0)};
return SUCCESS;
}
Status MatmulFfnInfo::InferTensorMap() {
Shape x_tensor_map{1, -1};
Shape weight_1_tensor_map{0, -1};
Shape weight_2_tensor_map{0, -1};
inputs_tensor_map_.emplace_back(x_tensor_map);
inputs_tensor_map_.emplace_back(weight_1_tensor_map);
inputs_tensor_map_.emplace_back(weight_2_tensor_map);
Shape output_1_tensor_map{1, 0};
Shape output_2_tensor_map{1, 0};
outputs_tensor_map_.emplace_back(output_1_tensor_map);
outputs_tensor_map_.emplace_back(output_2_tensor_map);
return SUCCESS;
}
Status MatmulFfnInfo::InferAsLossDivisor() {
if (outputs_tensor_map_.size() != kMatMulFfnOutputSize) {
MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 2, but got " << outputs_tensor_map_.size();
return FAILED;
}
as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
<< ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
<< ", as_loss_divisor_ is " << as_loss_divisor_;
return SUCCESS;
}
REGISTER(MatmulFfnInfo);
} // namespace parallel
} // namespace mindspore
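As a rough illustration (not part of this commit) of how the strategy in MatmulFfnInfo shards the fused FFN matmuls, here is a plain-Python sketch with hypothetical dp/mp and shape values; the shard() helper only mirrors the tensor-map convention in the comments above (tensor-map entries index the dev matrix from the right, -1 means not split):

# Hypothetical sketch of MatmulFfnInfo's sharding arithmetic; numbers are made up.
dp, mp = 2, 4
tokens, hidden = 32, 4096          # x: (batch * seq_len, hidden_size)
ffn_hidden = 11008                 # weight_1 / weight_2: (weight_hidden_size, hidden_size)
dev_matrix = [dp, mp]              # mirrors dev_matrix_shape_ = {x[0], weight_1[0]}

def shard(shape, tensor_map, dev_matrix):
    # Divide each dim by the dev-matrix value its tensor-map entry points to (-1 = not split).
    return [d if m == -1 else d // dev_matrix[len(dev_matrix) - 1 - m]
            for d, m in zip(shape, tensor_map)]

print(shard([tokens, hidden], [1, -1], dev_matrix))       # x slice per device:      [16, 4096]
print(shard([ffn_hidden, hidden], [0, -1], dev_matrix))   # weight slice per device: [2752, 4096]
print(shard([tokens, ffn_hidden], [1, 0], dev_matrix))    # output slice per device: [16, 2752]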

View File

@ -1,53 +0,0 @@
/**
* Copyright 2024 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
#include <memory>
#include <string>
#include <vector>
#include "utils/hash_map.h"
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"
namespace mindspore {
namespace parallel {
class MatmulFfnInfo : public OperatorInfo {
public:
MatmulFfnInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<ActivationInfoCost>()) {}
~MatmulFfnInfo() override = default;
Status CheckStrategy(const StrategyPtr &strategy) override;
std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
protected:
Status GetAttrs() override { return SUCCESS; }
Status InferForwardCommunication() { return SUCCESS; }
Status InferTensorMap() override;
Status InferDevMatrixShape() override;
Status InferAsLossDivisor() override;
};
using MatmulFfnInfoPtr = std::shared_ptr<MatmulFfnInfo>;
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_

View File

@ -1,95 +0,0 @@
/**
* Copyright 2024 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/ops_info/matmul_qkv_info.h"
#include "frontend/parallel/dynamic_creator.h"
namespace mindspore {
namespace parallel {
// MatMulQkv has 4 inputs and 3 outputs
// x: (batch * seq_len (inc is 1), query_hidden_size)
// q: (query_hidden_size, query_hidden_size)
// k: (key_hidden_size, query_hidden_size)
// v: (value_hidden_size, query_hidden_size)
// ------------------------------
// output_q: (batch * seq_len (inc is 1), query_hidden_size)
// output_k: (batch * seq_len (inc is 1), key_hidden_size)
// output_v: (batch * seq_len (inc is 1), value_hidden_size)
// split strategy
// batch is not able to split
// seq_len is not able to split
// query_hidden_size is able to split
// key_hidden_size is able to split
// value_hidden_size is able to split
constexpr size_t kMatMulQkvOutputSize = 3;
Status MatmulQkvInfo::CheckStrategy(const StrategyPtr &strategy) {
if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
return FAILED;
}
// TODO
return SUCCESS;
}
Status MatmulQkvInfo::InferDevMatrixShape() {
auto input_strategies = strategy()->GetInputDim();
auto x = input_strategies.at(0); // (batch * seq_len, q_hidden_size)
auto q = input_strategies.at(1);
// dp mp
// 1 0
dev_matrix_shape_ = {x.at(0), q.at(0)};
return SUCCESS;
}
Status MatmulQkvInfo::InferTensorMap() {
Shape x_tensor_map{1, -1};
Shape q_tensor_map{0, -1};
Shape k_tensor_map{0, -1};
Shape v_tensor_map{0, -1};
inputs_tensor_map_.emplace_back(x_tensor_map);
inputs_tensor_map_.emplace_back(q_tensor_map);
inputs_tensor_map_.emplace_back(k_tensor_map);
inputs_tensor_map_.emplace_back(v_tensor_map);
Shape output_q_tensor_map{1, 0};
Shape output_k_tensor_map{1, 0};
Shape output_v_tensor_map{1, 0};
outputs_tensor_map_.emplace_back(output_q_tensor_map);
outputs_tensor_map_.emplace_back(output_k_tensor_map);
outputs_tensor_map_.emplace_back(output_v_tensor_map);
return SUCCESS;
}
Status MatmulQkvInfo::InferAsLossDivisor() {
if (outputs_tensor_map_.size() != kMatMulQkvOutputSize) {
MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 3, but got " << outputs_tensor_map_.size();
return FAILED;
}
as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
<< ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
<< ", as_loss_divisor_ is " << as_loss_divisor_;
return SUCCESS;
}
REGISTER(MatmulQkvInfo);
} // namespace parallel
} // namespace mindspore
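A similarly hedged sketch (hypothetical helper, not MindSpore code) of what InferAsLossDivisor computes for the QKV outputs: the divisor is roughly the number of devices on which output[0] is only replicated, i.e. the product of dev-matrix dims that its tensor map does not use.

from functools import reduce

def repeat_device_num(dev_matrix, tensor_map):
    # Product of dev-matrix dims whose (right-indexed) position is absent from the tensor map.
    used = {len(dev_matrix) - 1 - m for m in tensor_map if m != -1}
    unused = [d for i, d in enumerate(dev_matrix) if i not in used]
    return reduce(lambda a, b: a * b, unused, 1)

dev_matrix = [2, 4]                              # [dp, mp]
print(repeat_device_num(dev_matrix, [1, 0]))     # output_q map uses both dims -> divisor 1
print(repeat_device_num(dev_matrix, [1, -1]))    # x's map leaves mp unused -> would be 4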

View File

@ -1,53 +0,0 @@
/**
* Copyright 2024 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
#include <memory>
#include <string>
#include <vector>
#include "utils/hash_map.h"
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"
namespace mindspore {
namespace parallel {
class MatmulQkvInfo : public OperatorInfo {
public:
MatmulQkvInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<ActivationInfoCost>()) {}
~MatmulQkvInfo() override = default;
Status CheckStrategy(const StrategyPtr &strategy) override;
std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }
protected:
Status GetAttrs() override { return SUCCESS; }
Status InferForwardCommunication() { return SUCCESS; }
Status InferTensorMap() override;
Status InferDevMatrixShape() override;
Status InferAsLossDivisor() override;
};
using MatmulQkvInfoPtr = std::shared_ptr<MatmulQkvInfo>;
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_

View File

@ -91,10 +91,8 @@ Status PagedAttentionInfo::CheckStrategy(const StrategyPtr &strategy) {
Status PagedAttentionInfo::InferDevMatrixShape() {
auto input_strategies = strategy()->GetInputDim();
auto query = input_strategies.at(0); // (batch, seq_len, hidden_size)
auto cache = input_strategies.at(1); // (block_size, num_blocks, hidden_size)
auto block_tables = input_strategies.at(3); // (batch, max_num_block_per_batch)
auto context_lens = input_strategies.at(4); // (context_lens)
auto query = input_strategies.at(0); // (batch, seq_len, hidden_size)
auto cache = input_strategies.at(1); // (block_size, num_blocks, hidden_size)
// batch block_size num_blocks seq_len hidden_size
// 4 3 2 1 0

View File

@ -834,12 +834,7 @@ void InsertVirtualOutput(const FuncGraphPtr &root, const std::vector<AnfNodePtr>
OperatorAttrs attrs;
OperatorArgs args = std::make_pair(attrs, params);
Operator op = std::make_pair(VIRTUAL_OUTPUT, args);
// Temporarily circumvent the MatmulQkv problem, and then modify it
auto cnode = dyn_cast_ptr<CNode>(out_node);
const auto &input = cnode->input(0);
MS_EXCEPTION_IF_NULL(input);
auto prim = GetValuePtr<Primitive>(input);
if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple) || prim->name() == "MatmulQkv") {
if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) {
auto tuple = out_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(tuple);
for (size_t i = 1; i < tuple->size(); ++i) {

View File

@ -763,9 +763,7 @@ bool AscendDeviceAddress::AsyncDeviceToDevice(const ShapeVector & /* shape */, s
return ret;
}
bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
const void *host_ptr) const {
bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */, const void *host_ptr) const {
MS_ERROR_IF_NULL(host_ptr);
BindDevice();
if (!MoveToDevice(false)) {
@ -779,7 +777,8 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
MS_EXCEPTION_IF_NULL(runtime_instance);
auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE, runtime_instance->compute_stream());
auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE,
runtime_instance->compute_stream());
if (ret != ACL_ERROR_NONE) {
MS_LOG(ERROR) << "Call aclrtMemcpyAsync host to device failed, the error num[" << ret << "]";
return false;
@ -787,7 +786,6 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
return true;
}
bool AscendDeviceAddress::AsyncHostToDevice(const ShapeVector & /* shape */, size_t size, TypeId /* type */,
const void *host_ptr, size_t stream_id) const {
MS_ERROR_IF_NULL(host_ptr);

View File

@ -17,7 +17,7 @@ if(DEFINED ENV{MS_INTERNAL_KERNEL_HOME})
${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/collective_communication_lib.cc
${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/communication_group.cc)
set_property(SOURCE ${LOWLATENCY_COLLECTIVE_SRCS} PROPERTY COMPILE_DEFINITIONS
SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
add_library(lowlatency_collective SHARED ${LOWLATENCY_COLLECTIVE_SRCS})
target_link_libraries(lowlatency_collective PRIVATE lcal)
endif()

View File

@ -246,12 +246,13 @@ void GeDeviceResManager::CreateSessionAndGraphRunner() {
}
bool GeDeviceResManager::LoadCollectiveCommLib() {
// If this is simulation, don't load any collective communication library.
// If this is simulation, load dummy collective communication library.
if (!common::GetEnv(kSimulationLevel).empty()) {
collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
return true;
}
// Ascend backend supports HCCL and LCCL collective communication libraries.
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
std::string lowlatency_comm_lib_name = "liblowlatency_collective.so";
auto loader = std::make_shared<CollectiveCommLibLoader>(lowlatency_comm_lib_name);
MS_EXCEPTION_IF_NULL(loader);
@ -265,7 +266,7 @@ bool GeDeviceResManager::LoadCollectiveCommLib() {
auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
collective_comm_lib_ = instance_func();
MS_EXCEPTION_IF_NULL(collective_comm_lib_);
MS_LOG(WARNING) << "Loading LCCL because env ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
MS_LOG(WARNING) << "Loading LCCL because env MS_ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
"single-node-multi-card mode in KernelByKernel for now.";
} else {
collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
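For context, a minimal opt-in sketch mirroring the LCCL test cases later in this commit (the msrun command line is only an example):

import os
from mindspore import context

# The old name ENABLE_LCCL is no longer honoured after this change.
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
# Then launch workers, e.g.:
# msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allreduce.py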

View File

@ -73,18 +73,7 @@ class GeDeviceResManager : public DeviceResManager {
static void CreateSessionAndGraphRunner();
<<<<<<< HEAD
bool LoadCollectiveCommLib() override {
if (common::GetEnv(kSimulationLevel).empty()) {
collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
} else {
collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
}
return true;
}
=======
bool LoadCollectiveCommLib() override;
>>>>>>> Add lccl so.
void ResetStreamAndCtx() override;
bool BindDeviceToCurrentThread(bool force_bind) const override;

View File

@ -44,7 +44,7 @@ class LowlatencyCommunicationGroup : public CommunicationGroup {
void *GenerateRootInfo(size_t *root_info_size) override;
// Return communicator for collective communcation ops.
// Return communicator for collective communication ops.
const LcclPtr &lccl_communicator() const;
// Return communicator of lcal.
const LcalCommPtr &lcal_comm() const;

View File

@ -135,7 +135,7 @@ bool HcclKernel::Init(const std::vector<KernelTensor *> &inputs, const std::vect
if (common::GetEnv(kSimulationLevel).empty() && !common::IsNeedProfileMemory()) {
#ifdef ENABLE_INTERNAL_KERNELS
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
LoadLcclLibrary();
} else {
LoadHcclLibrary();

View File

@ -31,7 +31,7 @@ bool HcomAllBroadCastKernel::Launch(const std::vector<KernelTensor *> &inputs, c
MS_EXCEPTION_IF_NULL(stream_ptr);
#ifdef ENABLE_INTERNAL_KERNELS
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
auto lccl_result =
lccl_comm_->Broadcast(inputs[0]->device_ptr(), hccl_count_, hccl_data_type_list_[0], root_id_, stream_ptr);
if (lccl_result != Lcal::LCAL_SUCCESS) {

View File

@ -32,7 +32,7 @@ bool HcomAllGatherKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
MS_EXCEPTION_IF_NULL(stream_ptr);
#ifdef ENABLE_INTERNAL_KERNELS
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
auto lccl_result = lccl_comm_->AllGather(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
hccl_data_type_list_[0], stream_ptr);
if (lccl_result != Lcal::LCAL_SUCCESS) {

View File

@ -38,7 +38,7 @@ bool HcomAllReduceKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
MS_EXCEPTION_IF_NULL(stream_ptr);
#ifdef ENABLE_INTERNAL_KERNELS
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
auto lccl_result = lccl_comm_->AllReduce(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
hccl_data_type_list_[0], op_type_, stream_ptr);
if (lccl_result != Lcal::LCAL_SUCCESS) {

View File

@ -32,7 +32,7 @@ bool HcomAllReduceScatterKernel::Launch(const std::vector<KernelTensor *> &input
MS_EXCEPTION_IF_NULL(stream_ptr);
#ifdef ENABLE_INTERNAL_KERNELS
if (!common::GetEnv("ENABLE_LCCL").empty()) {
if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
auto lccl_result = lccl_comm_->ReduceScatter(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
hccl_data_type_list_[0], op_type_, stream_ptr);
if (lccl_result != Lcal::LCAL_SUCCESS) {

View File

@ -62,7 +62,7 @@ int HcomMatMulAllReduceKernel::Resize(const std::vector<KernelTensor *> &inputs,
// The dimensions of left and right matrices.
matmul_info_.m = hccl_kernel_input_shape_list_[0][0];
matmul_info_.k = hccl_kernel_input_shape_list_[0][1];
matmul_info_.n = hccl_kernel_input_shape_list_[1][0];
matmul_info_.n = hccl_kernel_input_shape_list_[1][1];
matmul_info_.transA = transpose_a_;
matmul_info_.transB = transpose_b_;
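A quick sanity check of the dimension fix above, written as plain Python with hypothetical shapes and assuming the right-hand matrix is laid out (k, n) when transpose_b is false:

# hccl_kernel_input_shape_list_[0] -> left matrix (m, k); [1] -> right matrix (k, n)
x_shape = (16, 1024)
w_shape = (1024, 4096)
m, k = x_shape[0], x_shape[1]
n = w_shape[1]   # previously w_shape[0] was used, which yields k (1024) instead of n (4096)
assert (m, k, n) == (16, 1024, 4096)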

View File

@ -26,7 +26,6 @@ namespace kernel {
constexpr uint32_t kMatMulAllReduceInputNum = 2;
constexpr uint32_t kMatMulAllReduceOutputNum = 1;
constexpr char kAttrNameTransposeA[] = "transpose_a";
;
constexpr char kAttrNameTransposeB[] = "transpose_b";
class HcomMatMulAllReduceKernel : public HcclKernel {

View File

@ -85,7 +85,7 @@ class InternalSub : public ElewiseBinary {
param_ptr->input2_dtype_ = InternalKernelUtils::ToInternalDType(inputs[kIndex1]->dtype_id());
param_ptr->input1_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex0]->GetShapeVector());
param_ptr->input2_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex1]->GetShapeVector());
return std::static_pointer_cast<internal::OpParam>(param_ptr);
}
};

View File

@ -105,7 +105,6 @@ int InternalKernelMod::Resize(const std::vector<KernelTensor *> &inputs, const s
}
}
std::vector<internal::DIMS> input_shapes(inputs_.size());
std::vector<internal::DIMS> output_shapes;
for (auto iter = inputsIdxMap_.begin(); iter != inputsIdxMap_.end(); iter++) {
InternalKernelUtils::ToInternalTensor(inputs_[iter->second], inputs[iter->first]);
input_shapes[iter->second] = inputs_[iter->second]->desc.dims;

View File

@ -136,13 +136,9 @@ void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
#ifdef ENABLE_INTERNAL_KERNELS
(*unify_mindir_pm)->AddPass(std::make_shared<opt::MultiMatmulsFusion>());
(*unify_mindir_pm)->AddPass(std::make_shared<opt::AddLayernormFusion>());
if (common::GetEnv("MS_ENABLE_INTERNAL_KERNELS") == "on") {
(*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
}
(*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
(*unify_mindir_pm)->AddPass(std::make_shared<opt::AddRmsNormFusion>());
if (common::GetEnv("ENABLE_MATMUL_ALLREDUCE") == "on") {
(*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
}
(*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
#endif // ENABLE_INTERNAL_KERNELS
}

View File

@ -15,11 +15,13 @@
*/
#include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
#include <set>
#include <vector>
#include "mindspore/core/ops/nn_ops.h"
#include "mindspore/core/ops/math_ops.h"
#include "mindspore/core/ops/other_ops.h"
#include "mindspore/core/ops/lite_ops.h"
#include "mindspore/core/utils/ms_context.h"
#include "include/backend/optimizer/helper.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
@ -101,6 +103,16 @@ AnfNodePtr MatMulAllReduceFusion::CreateMatMulAllReduceNode(const FuncGraphPtr &
const AnfNodePtr MatMulAllReduceFusion::Process(const mindspore::FuncGraphPtr &func_graph,
const mindspore::AnfNodePtr &node,
const mindspore::EquivPtr &equiv) const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!ms_context->IsEnableInferBoost()) {
return nullptr;
}
if (common::GetEnv("DISABLE_MATMULALLREDUCE_FUSION") == "True") {
return nullptr;
}
if (func_graph == nullptr || node == nullptr) {
return nullptr;
}
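As a usage note (a sketch, not part of the diff): with the env check moved into Process, the fusion can be switched off per process, which is exactly what cell.py does for the non-prefill phase later in this commit:

import os

# Any value other than "True" leaves the MatMulAllReduce fusion enabled.
os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "True"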

View File

@ -16,6 +16,7 @@
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
#include <string>
#include <memory>
#include "include/backend/optimizer/optimizer.h"
@ -24,7 +25,7 @@ namespace opt {
class MatMulAllReduceFusion : public PatternProcessPass {
public:
explicit MatMulAllReduceFusion(bool multigraph = true, const string &pass_name = "MatMulAllReduce")
: PatternProcessPass(pass_name, multigraph){};
: PatternProcessPass(pass_name, multigraph) {}
~MatMulAllReduceFusion() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &equiv) const override;
@ -46,4 +47,4 @@ class MatMulAllReduceFusion : public PatternProcessPass {
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_

View File

@ -15,13 +15,19 @@
*/
#include "plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.h"
#include "mindspore/core/utils/ms_context.h"
namespace mindspore {
namespace opt {
bool MultiMatmulsFusion::Run(const FuncGraphPtr &graph) {
bool changed = false;
if (common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!ms_context->IsEnableInferBoost() || common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
return changed;
}
auto mng = graph->manager();
MS_EXCEPTION_IF_NULL(mng);
const auto &node_users_map = mng->node_users();

View File

@ -22,6 +22,7 @@
#include "mindspore/core/ops/reshape_ext.h"
#include "mindspore/core/ops/scalar_graph_holder.h"
#include "mindspore/core/ops/array_ops.h"
#include "mindspore/core/utils/ms_context.h"
#include "include/common/utils/anfalgo.h"
#include "mindspore/ccsrc/include/common/utils/utils.h"
#include "plugin/device/ascend/optimizer/get_value_helper.h"
@ -200,6 +201,12 @@ const BaseRef ShapeReshapeFusion::DefinePattern() const {
const AnfNodePtr ShapeReshapeFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if (!ms_context->IsEnableInferBoost()) {
return nullptr;
}
MS_EXCEPTION_IF_NULL(func_graph);
MS_EXCEPTION_IF_NULL(equiv);

View File

@ -825,9 +825,9 @@ void DataPrepareActor::PrepareDataForHostTensorQueueNew(const VectorRef &args, O
<< " for input parameter:" << origin_parameter->fullname_with_scope();
if (!isDyn) {
if(host_tensors_[tensor_position] != input_tensor->shape()) {
isDyn = true;
}
if (host_tensors_[tensor_position] != input_tensor->shape()) {
isDyn = true;
}
}
host_tensors_[tensor_position] = input_tensor->shape();
host_tensors[tensor_position] = input_tensor;
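The reformatted block above only tightens the dynamic-shape check; roughly, in Python terms (hypothetical helper, not a MindSpore API):

def update_is_dyn(cached_shapes, position, new_shape, is_dyn):
    # Mark the input as dynamic once its shape differs from the shape seen last time.
    if not is_dyn and cached_shapes[position] != new_shape:
        is_dyn = True
    cached_shapes[position] = new_shape
    return is_dyn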

View File

@ -23,6 +23,7 @@
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#include "kernel/common_utils.h"
#include "mindspore/core/utils/ms_context.h"
namespace mindspore {
namespace runtime {
@ -278,6 +279,8 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const
}
void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_EXCEPTION_IF_NULL(context);
if (IsRunningFailed(context)) {
return;
@ -328,11 +331,20 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
continue;
}
if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
host_tensor->data_ptr()->data())) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
if (ms_context->IsEnableInferBoost()) {
if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
host_tensor->data_ptr()->data())) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
}
} else {
if (!device_tensor->SyncHostToDevice(
trans::GetRuntimePaddingShape(data_node_with_indexs_[i].first, data_node_with_indexs_[i].second),
LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
host_tensor->device_info().host_format_, host_tensor->data_ptr())) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
}
}
if (IsDynamic(device_tensor->host_shape())) {
device_tensor->set_host_shape(host_tensor->shape());
}

View File

@ -27,7 +27,6 @@ void KernelInferActor::Init() {
if (memory_free_list_.size() > input_num) {
memory_free_list_.erase(memory_free_list_.begin() + input_num, memory_free_list_.end());
}
}
void KernelInferActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) {

View File

@ -211,8 +211,6 @@ void SuperKernelActor::Run(OpContext<DeviceTensor> *const context) {
return RunGraphKernelByKernel(context);
}
device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddTask, GetAID().Name(), "", graph_->ToString());
if (device_contexts_.empty() || device_contexts_[0] == nullptr) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Invalid device context for super kernel actor:" + GetAID().Name());
}

View File

@ -662,14 +662,14 @@ void GraphScheduler::SpawnMultiPipelineActor(ActorSet *const actor_set, ActorThr
}
// If enable runtime multi pipeline, async launch kernel will be enabled.
<<<<<<< HEAD
ActorDispatcher::set_enable_runtime_multi_pipeline(EnableRuntimePipeline() && actor_set->has_dynamic_shape_ &&
!actor_set->kernel_actors_.empty() &&
default_actor_thread_num_ > kMultiPipelineThreadNum);
>>>>>>> 80d1685cc13... lzy: Support run graph without kernel actor for kbyk mode
ActorDispatcher::set_enable_runtime_multi_pipeline(
enable_runtime_pipeline && actor_set->has_dynamic_shape_ &&
(EnableKbkSubGraphExecute() || !actor_set->kernel_actors_.empty()) &&
default_actor_thread_num_ > kMultiPipelineThreadNum);
if (ActorDispatcher::enable_runtime_multi_pipeline() && !already_spawn_kernel_async_infer_resize_actor_) {
size_t current_actor_thread_num = thread_pool->GetActorThreadNum();
MS_LOG(INFO) << "Enable runtime multi pipeline, default actor thread num: " << default_actor_thread_num_
<< ", current actor thread num: " << current_actor_thread_num;
if (current_actor_thread_num != default_actor_thread_num_) {
thread_pool->SetActorThreadNum(default_actor_thread_num_);
MS_LOG(DEBUG) << "Reset actor thread number to: " << default_actor_thread_num_;

View File

@ -26,6 +26,5 @@
namespace mindspore {
namespace ops {
MIND_API_OPERATOR_IMPL(MatMulAllReduce, MatMul);
REGISTER_PRIMITIVE_C(kMatMulAllReduce, MatMulAllReduce);
} // namespace ops
} // namespace mindspore

View File

@ -34,8 +34,10 @@
#include "mindapi/src/helper.h"
#include "mindspore/core/ops/math_ops.h"
#include "ops/mat_mul.h"
#include "ops/fusion/matmul_allreduce.h"
#include "ops/op_name.h"
#include "ops/primitive_c.h"
#include "ops/lite_ops.h"
#include "utils/check_convert_utils.h"
#include "utils/convert_utils_base.h"
#include "utils/log_adapter.h"
@ -169,5 +171,6 @@ class MatMulInfer : public abstract::OpInferBase {
};
REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMul, prim::kPrimMatMul, MatMulInfer, false);
REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMulAllReduce, prim::kPrimMatMulAllReduce, MatMulInfer, false);
} // namespace ops
} // namespace mindspore

View File

@ -62,8 +62,10 @@ GVAR_DEF(PrimitivePtr, kPrimTensorAdd, std::make_shared<Primitive>("TensorAdd"))
GVAR_DEF(PrimitivePtr, kPrimAddV2, std::make_shared<Primitive>(kAddV2OpName));
GVAR_DEF(PrimitivePtr, kPrimAddLayerNorm, std::make_shared<Primitive>("AddLayerNorm"));
GVAR_DEF(PrimitivePtr, kPrimAddRmsNorm, std::make_shared<Primitive>("AddRmsNorm"));
GVAR_DEF(PrimitivePtr, kPrimMatMul, std::make_shared<Primitive>("MatMul"));
GVAR_DEF(PrimitivePtr, kPrimMatMulV2, std::make_shared<Primitive>("MatMulV2"));
GVAR_DEF(PrimitivePtr, kPrimMatrixDiag, std::make_shared<Primitive>("MatrixDiag"));
GVAR_DEF(PrimitivePtr, kPrimBatchMatMul, std::make_shared<Primitive>("BatchMatMul"));
GVAR_DEF(PrimitivePtr, kPrimBatchMatMulV2, std::make_shared<Primitive>("BatchMatMulV2"));
GVAR_DEF(PrimitivePtr, kPrimFusedMatMulBiasAdd, std::make_shared<Primitive>("FusedMatMulBiasAdd"));
GVAR_DEF(PrimitivePtr, kPrimMinimumGradGrad, std::make_shared<Primitive>("MinimumGradGrad"));

View File

@ -24,6 +24,7 @@
#include <functional>
#include <mutex>
#include <vector>
#include <optional>
#include "utils/log_adapter.h"
#include "utils/ms_utils.h"
@ -402,7 +403,7 @@ inline void MsContext::increase_param<uint32_t>(MsCtxParam param) {
uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]++;
}
// decreate method implementation for type uint32_t
// decrease method implementation for type uint32_t
template <>
inline void MsContext::decrease_param<uint32_t>(MsCtxParam param) {
uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]--;

View File

@ -302,12 +302,13 @@ def _handle_arg_predict(obj, arg, compile_arg):
return None
if isinstance(arg, (list, tuple)):
if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and getattr(compile_arg, "__ms_mutable__"):
if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and \
getattr(compile_arg, "__ms_mutable__"):
# mutable([]) will be eliminated by FuncGraphSpecializer, and empty list is not supported by backend.
if isinstance(arg, list) and not arg:
return None
return arg
elif hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
if hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
_check_all_tensor(arg):
return arg
return None

View File

@ -60,7 +60,7 @@ class JitConfig:
infer_boost (str, optional): enable infer boost mode.
The value must be ``"on"`` or ``"off"``. Defaults to ``"off"``, which disables infer boost.
when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make
When infer boost mode is enabled, MindSpore uses a high-performance kernel library and a faster runtime
so that inference speed is maximized.
Note: currently infer boost only supports jit_level == O0 and Ascend 910B devices.
Default: ``"off"`` .
@ -78,16 +78,17 @@ class JitConfig:
>>>
>>> net.set_jit_config(jitconfig)
"""
def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE", infer_boost="off", **kwargs):
def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE",
infer_boost="off", **kwargs):
if jit_level not in ["", "O0", "O1", "O2"]:
raise ValueError("For 'jit_level' must be one of ['O0', 'O1', 'O2'].")
if exc_mode not in ['auto', 'sink', 'no_sink']:
if exc_mode not in ["auto", "sink", "no_sink"]:
raise ValueError("For 'exc_mode' must be one of '['auto', 'sink', 'no_sink']'.")
if jit_syntax_level != "" and jit_syntax_level not in ['STRICT', 'COMPATIBLE', 'LAX']:
if jit_syntax_level != "" and jit_syntax_level not in ["STRICT", "COMPATIBLE", "LAX"]:
raise ValueError("For 'jit_syntax_level' must be one of '['STRICT', 'LAX']'.")
if debug_level not in ['RELEASE', 'DEBUG']:
if debug_level not in ["RELEASE", "DEBUG"]:
raise ValueError("For 'debug_level' must be one of '['RELEASE', 'DEBUG']'.")
if infer_boost != "" and infer_boost not in ['on', 'off']:
if infer_boost != "" and infer_boost not in ["on", "off"]:
raise ValueError("For 'infer_boost' must be one of '['on', 'off']'.")
self.jit_config_dict = kwargs
self.jit_config_dict["jit_level"] = jit_level
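A short usage sketch of the validated options (values taken from the checks above; the network is a placeholder):

from mindspore import JitConfig, nn

class SimpleNet(nn.Cell):
    def construct(self, x):
        return x

net = SimpleNet()
# Per the docstring note, infer_boost is only expected to work with jit_level "O0" on Ascend 910B.
jitconfig = JitConfig(jit_level="O0", exc_mode="auto", infer_boost="on")
net.set_jit_config(jitconfig)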

View File

@ -667,6 +667,8 @@ class Cell(Cell_):
def _predict(self, *args, **kwargs):
if not hasattr(self, "phase"):
return False, None
if (self.phase == "prefill" or self.phase == "increment") and self.phase in self.phase_cache:
new_args = _get_args_for_run_predict(self, args, kwargs, self._compile_args)
res = _cell_graph_executor._graph_executor(tuple(new_args), self.phase_cache[self.phase])
@ -687,7 +689,7 @@ class Cell(Cell_):
if predict_compiled:
return res
self._check_construct_args(*args)
if self._hook_fn_registered():
logger.warning(f"For 'Cell', it's not support hook function in graph mode. If you want to use hook "
f"function, please use context.set_context to set pynative mode.")
@ -993,9 +995,9 @@ class Cell(Cell_):
kwargs (dict): Kwargs of the Cell object.
"""
if self.phase == "prefill":
os.environ["ENABLE_MATMUL_ALLREDUCE"] = "on"
os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "False"
else:
os.environ["ENABLE_MATMUL_ALLREDUCE"] = ""
os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "True"
self._compile_args = self._get_compile_args(args)
_cell_graph_executor.compile(self, *self._compile_args, phase=self.phase,
jit_config_dict=self._jit_config_dict, **kwargs)

View File

@ -118,7 +118,7 @@ from .nn_ops import (LSTM, SGD, Adam, AdamWeightDecay, FusedSparseAdam, FusedSpa
FractionalMaxPool, FractionalMaxPool3DWithFixedKsize, FractionalMaxPoolWithFixedKsize,
GridSampler2D, TripletMarginLoss, UpsampleNearest3D, UpsampleTrilinear3D, PadV3, ChannelShuffle,
GLU, MaxUnpool3D, Pdist, RmsNorm, PagedAttention, PagedAttentionMask, ReshapeAndCache,
ApplyRotaryPosEmb, MatmulQkv, MatmulFfn)
ApplyRotaryPosEmb)
from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode,
ConfusionMatrix, UpdateState, Load, StopGradient, Reusing,
CheckValid, Partial, Depend, Push, Pull, PyExecute, PyFunc, _DynamicLossScale,
@ -695,8 +695,6 @@ __all__ = [
"ReshapeAndCache",
"ApplyRotaryPosEmb",
"RmsNorm",
"MatmulQkv",
"MatmulFfn",
]
__custom__ = [

View File

@ -10196,27 +10196,3 @@ class RmsNorm(Primitive):
"""Initialize Dense."""
validator.check_value_type("epsilon", epsilon, [float], self.name)
self.init_prim_io_names(inputs=['x', 'gamma'], outputs=["y", "rstd"])
class MatmulQkv(Primitive):
r"""
Fuse three matmul ops for q k v attention into one
"""
@prim_attr_register
def __init__(self):
"""Initialize"""
self.init_prim_io_names(inputs=['hidden_states', 'weight_q', 'weight_k', 'weight_v'],
outputs=["output_q", "output_k", "output_v"])
class MatmulFfn(Primitive):
r"""
Fuse two matmul ops for feed forward into one
"""
@prim_attr_register
def __init__(self):
"""Initialize"""
self.init_prim_io_names(inputs=['hidden_states', 'weight_gate', 'weight_up'],
outputs=["output_gate", "output_up"])

View File

@ -17,26 +17,26 @@
if [ "$(uname)" == Linux ]; then
if [ -n "${MS_INTERNAL_KERNEL_HOME}" ]; then
echo "Use local MS_INTERNAL_KERNEL_HOME : ${MS_INTERNAL_KERNEL_HOME}"
else
file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
lib_file=${file_path}/ms_kernels_internal.tar.gz
if [ -f "${lib_file}" ]; then
file_lines=`cat "${lib_file}" | wc -l`
if [ ${file_lines} -ne 3 ]; then
tar -zxf ${lib_file} -C ${file_path}
if [ $? -eq 0 ]; then
echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
else
echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
fi
else
echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
echo "[WARNING] `git lfs install` and retry downloading using `git lfs pull`."
fi
else
echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
fi
# else
# file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
# lib_file=${file_path}/ms_kernels_internal.tar.gz
# if [ -f "${lib_file}" ]; then
# file_lines=`cat "${lib_file}" | wc -l`
# if [ ${file_lines} -ne 3 ]; then
# tar -zxf ${lib_file} -C ${file_path}
# if [ $? -eq 0 ]; then
# echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
# export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
# echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
# else
# echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
# fi
# else
# echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
# echo "[WARNING] 'git lfs install' and retry downloading using 'git lfs pull'."
# fi
# else
# echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
# fi
fi
fi

View File

@ -27,7 +27,7 @@ def test_lccl_allreduce():
Description: msrun lccl all_reduce 8P case.
Expectation: success
"""
os.environ['ENABLE_LCCL'] = str(1)
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allreduce.py")
@ -44,7 +44,7 @@ def test_lccl_allgather():
Description: msrun lccl all_gather 8P case.
Expectation: success
"""
os.environ['ENABLE_LCCL'] = str(1)
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allgather.py")
@ -61,7 +61,7 @@ def test_lccl_reducescatter():
Description: msrun lccl reduce_scatter 8P case.
Expectation: success
"""
os.environ['ENABLE_LCCL'] = str(1)
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True "
@ -79,7 +79,7 @@ def test_lccl_broadcast():
Description: msrun lccl broadcast 8P case.
Expectation: success
"""
os.environ['ENABLE_LCCL'] = str(1)
os.environ['MS_ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_broadcast.py")

View File

@ -24,7 +24,6 @@ from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.communication.management import init, HCCL_WORLD_COMM_GROUP, get_rank, get_group_size
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner
context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')

View File

@ -27,7 +27,6 @@ def test_msrun():
Description: Launch distributed training job with dynamic cluster using msrun.
Expectation: All workers are successfully spawned and running training.
"""
os.environ['ENABLE_LCCL'] = str(1)
os.environ['GRAPH_OP_RUN'] = str(1)
return_code = os.system(
"msrun --worker_num=4 --local_worker_num=4 --master_addr=127.0.0.1 "\

View File

@ -263,6 +263,8 @@ list(REMOVE_ITEM MINDSPORE_SRC_LIST
"../../../mindspore/ccsrc/runtime/graph_scheduler/actor/embedding_cache/embedding_cache_prefetch_actor.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST
"../../../mindspore/ccsrc/plugin/device/ascend/hal/profiler/parallel_strategy_profiling.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST
"../../../mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc")
add_library(_ut_mindspore_obj STATIC ${MINDSPORE_SRC_LIST} $<TARGET_OBJECTS:core_proto_obj> $<TARGET_OBJECTS:mindrt_mid>
$<TARGET_OBJECTS:common_shared_lib_obj> $<TARGET_OBJECTS:_mindspore_utils_obj>