Bugfix after rebase onto 2.3:
1. Unify the fusion pass.
2. Fix data_source_actor.cc.
3. Remove the MatmulQkv/MatmulFfn Python nn_op definitions and their parallel info.
4. Code check cleanup.
5. Temporarily disable the internal kernels compile.
6. Use the dummy Ascend collective communication library in simulation.
7. Fix the MatMulAllReduce shape correction.
8. Rename the env var to MS_ENABLE_LCCL.
9. Remove hcom_matmul_all_reduce.cc from the UT build.
10. Bugfix of cell.py::predict.
11. Re-apply MatMulAllReduce.
This commit is contained in:
parent 66183a6817
commit 6073f75254
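Several of the changes below are gated on infer-boost mode (the ms_context->IsEnableInferBoost() checks). As a reference point, here is a minimal sketch of how that mode is switched on from Python using the JitConfig signature updated in this commit; TinyNet is a placeholder network and not part of the diff:

from mindspore import nn
from mindspore import JitConfig

class TinyNet(nn.Cell):
    # Placeholder network; any inference Cell is configured the same way.
    def construct(self, x):
        return x

net = TinyNet()
# infer_boost="on" enables the infer-boost-only passes touched below; per the docstring
# in this diff it currently requires jit_level "O0" and an Ascend 910B device.
net.set_jit_config(JitConfig(jit_level="O0", infer_boost="on"))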
@@ -1,82 +0,0 @@
/**
 * Copyright 2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "frontend/parallel/ops_info/matmul_ffn_info.h"
#include "frontend/parallel/dynamic_creator.h"

namespace mindspore {
namespace parallel {
// MatmulFfn has 3 inputs and 2 outputs
// x: (batch * seq_len (inc is 1), hidden_size)
// weight_1: (weight_1_hidden_size, hidden_size)
// weight_2: (weight_2_hidden_size, hidden_size)
// ------------------------------
// output_1: (batch * seq_len (inc is 1), weight_1_hidden_size)
// output_2: (batch * seq_len (inc is 1), weight_2_hidden_size)

constexpr size_t kMatMulFfnOutputSize = 2;

Status MatmulFfnInfo::CheckStrategy(const StrategyPtr &strategy) {
  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
    return FAILED;
  }

  // TODO

  return SUCCESS;
}

Status MatmulFfnInfo::InferDevMatrixShape() {
  auto input_strategies = strategy()->GetInputDim();
  auto x = input_strategies.at(0);  // (batch * seq_len, hidden_size)
  auto weight_1 = input_strategies.at(1);
  // dp  mp
  //  1   0
  dev_matrix_shape_ = {x.at(0), weight_1.at(0)};

  return SUCCESS;
}

Status MatmulFfnInfo::InferTensorMap() {
  Shape x_tensor_map{1, -1};
  Shape weight_1_tensor_map{0, -1};
  Shape weight_2_tensor_map{0, -1};
  inputs_tensor_map_.emplace_back(x_tensor_map);
  inputs_tensor_map_.emplace_back(weight_1_tensor_map);
  inputs_tensor_map_.emplace_back(weight_2_tensor_map);

  Shape output_q_tensor_map{1, 0};
  Shape output_k_tensor_map{1, 0};
  outputs_tensor_map_.emplace_back(output_q_tensor_map);
  outputs_tensor_map_.emplace_back(output_k_tensor_map);

  return SUCCESS;
}

Status MatmulFfnInfo::InferAsLossDivisor() {
  if (outputs_tensor_map_.size() != kMatMulFfnOutputSize) {
    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 2, but got " << outputs_tensor_map_.size();
    return FAILED;
  }
  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
               << ", as_loss_divisor_ is " << as_loss_divisor_;
  return SUCCESS;
}
REGISTER(MatmulFfnInfo);
}  // namespace parallel
}  // namespace mindspore
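As an aside on the dev-matrix/tensor-map notation used in the deleted file above (my illustration with assumed shard numbers, not code from this commit): each tensor-map entry names a device-matrix axis counted from the right, and -1 means that dimension is replicated. A small self-contained check:

import math

dev_matrix = [2, 4]            # assumed {dp, mp}
x_shape = [8, 1024]            # (batch * seq_len, hidden_size)
x_map = [1, -1]                # dim 0 split along dp (axis index 1 from the right), dim 1 replicated

def sharded_shape(shape, tensor_map, dev_mat):
    """Per-device slice shape implied by a tensor map."""
    out = []
    for dim, m in zip(shape, tensor_map):
        out.append(dim if m == -1 else dim // dev_mat[len(dev_mat) - 1 - m])
    return out

print(sharded_shape(x_shape, x_map, dev_matrix))  # [4, 1024]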
@@ -1,53 +0,0 @@
/**
 * Copyright 2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_

#include <memory>
#include <string>
#include <vector>

#include "utils/hash_map.h"
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"

namespace mindspore {
namespace parallel {
class MatmulFfnInfo : public OperatorInfo {
 public:
  MatmulFfnInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
                const PrimitiveAttrs &attrs)
      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<ActivationInfoCost>()) {}
  ~MatmulFfnInfo() override = default;
  Status CheckStrategy(const StrategyPtr &strategy) override;
  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }

 protected:
  Status GetAttrs() override { return SUCCESS; }
  Status InferForwardCommunication() { return SUCCESS; }
  Status InferTensorMap() override;
  Status InferDevMatrixShape() override;
  Status InferAsLossDivisor() override;
};
using MatmulFfnInfoPtr = std::shared_ptr<MatmulFfnInfo>;
}  // namespace parallel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_FFN_INFO_H_
@@ -1,95 +0,0 @@
/**
 * Copyright 2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "frontend/parallel/ops_info/matmul_qkv_info.h"
#include "frontend/parallel/dynamic_creator.h"

namespace mindspore {
namespace parallel {
// MatmulQkv has 4 inputs and 3 outputs
// x: (batch * seq_len (inc is 1), query_hidden_size)
// q: (query_hidden_size, query_hidden_size)
// k: (key_hidden_size, query_hidden_size)
// v: (value_hidden_size, query_hidden_size)
// ------------------------------
// output_q: (batch * seq_len (inc is 1), query_hidden_size)
// output_k: (batch * seq_len (inc is 1), key_hidden_size)
// output_v: (batch * seq_len (inc is 1), value_hidden_size)

// split strategy
// batch is not able to split
// seq_len is not able to split
// query_hidden_size is able to split
// key_hidden_size is able to split
// value_hidden_size is able to split
constexpr size_t kMatMulQkvOutputSize = 3;

Status MatmulQkvInfo::CheckStrategy(const StrategyPtr &strategy) {
  if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
    return FAILED;
  }

  // TODO

  return SUCCESS;
}

Status MatmulQkvInfo::InferDevMatrixShape() {
  auto input_strategies = strategy()->GetInputDim();
  auto x = input_strategies.at(0);  // (batch * seq_len, q_hidden_size)
  auto q = input_strategies.at(1);
  // dp  mp
  //  1   0
  dev_matrix_shape_ = {x.at(0), q.at(0)};

  return SUCCESS;
}

Status MatmulQkvInfo::InferTensorMap() {
  Shape x_tensor_map{1, -1};
  Shape q_tensor_map{0, -1};
  Shape k_tensor_map{0, -1};
  Shape v_tensor_map{0, -1};

  inputs_tensor_map_.emplace_back(x_tensor_map);
  inputs_tensor_map_.emplace_back(q_tensor_map);
  inputs_tensor_map_.emplace_back(k_tensor_map);
  inputs_tensor_map_.emplace_back(v_tensor_map);

  Shape output_q_tensor_map{1, 0};
  Shape output_k_tensor_map{1, 0};
  Shape output_v_tensor_map{1, 0};
  outputs_tensor_map_.emplace_back(output_q_tensor_map);
  outputs_tensor_map_.emplace_back(output_k_tensor_map);
  outputs_tensor_map_.emplace_back(output_v_tensor_map);

  return SUCCESS;
}

Status MatmulQkvInfo::InferAsLossDivisor() {
  if (outputs_tensor_map_.size() != kMatMulQkvOutputSize) {
    MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 3, but got " << outputs_tensor_map_.size();
    return FAILED;
  }
  as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
  MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
               << ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
               << ", as_loss_divisor_ is " << as_loss_divisor_;
  return SUCCESS;
}
REGISTER(MatmulQkvInfo);
}  // namespace parallel
}  // namespace mindspore
@@ -1,53 +0,0 @@
/**
 * Copyright 2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_

#include <memory>
#include <string>
#include <vector>

#include "utils/hash_map.h"
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"

namespace mindspore {
namespace parallel {
class MatmulQkvInfo : public OperatorInfo {
 public:
  MatmulQkvInfo(const std::string &name, const Shapes &inputs_shape, const Shapes &outputs_shape,
                const PrimitiveAttrs &attrs)
      : OperatorInfo(name, inputs_shape, outputs_shape, attrs, std::make_shared<ActivationInfoCost>()) {}
  ~MatmulQkvInfo() override = default;
  Status CheckStrategy(const StrategyPtr &strategy) override;
  std::vector<StrategyPtr> GenerateOpStrategies(int64_t stage_id) override { return {}; }
  Status SetCostUnderStrategy(const StrategyPtr &strategy) override { return SetCostUnderStrategyBase(strategy); }

 protected:
  Status GetAttrs() override { return SUCCESS; }
  Status InferForwardCommunication() { return SUCCESS; }
  Status InferTensorMap() override;
  Status InferDevMatrixShape() override;
  Status InferAsLossDivisor() override;
};
using MatMulQkvInfoPtr = std::shared_ptr<MatmulQkvInfo>;
}  // namespace parallel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_MATMUL_QKV_INFO_H_
@@ -91,10 +91,8 @@ Status PagedAttentionInfo::CheckStrategy(const StrategyPtr &strategy) {

Status PagedAttentionInfo::InferDevMatrixShape() {
  auto input_strategies = strategy()->GetInputDim();
  auto query = input_strategies.at(0);         // (batch, seq_len, hidden_size)
  auto cache = input_strategies.at(1);         // (block_size, num_blocks, hidden_size)
  auto block_tables = input_strategies.at(3);  // (batch, max_num_block_per_batch)
  auto context_lens = input_strategies.at(4);  // (context_lens)
  auto query = input_strategies.at(0);  // (batch, seq_len, hidden_size)
  auto cache = input_strategies.at(1);  // (block_size, num_blocks, hidden_size)

  // batch  block_size  num_blocks  seq_len  hidden_size
  //   4        3           2          1          0
@@ -834,12 +834,7 @@ void InsertVirtualOutput(const FuncGraphPtr &root, const std::vector<AnfNodePtr>
  OperatorAttrs attrs;
  OperatorArgs args = std::make_pair(attrs, params);
  Operator op = std::make_pair(VIRTUAL_OUTPUT, args);
  // Temporarily circumvent the MatmulQkv problem, and then modify it
  auto cnode = dyn_cast_ptr<CNode>(out_node);
  const auto &input = cnode->input(0);
  MS_EXCEPTION_IF_NULL(input);
  auto prim = GetValuePtr<Primitive>(input);
  if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple) || prim->name() == "MatmulQkv") {
  if (IsPrimitiveCNode(out_node, prim::kPrimMakeTuple)) {
    auto tuple = out_node->cast<CNodePtr>();
    MS_EXCEPTION_IF_NULL(tuple);
    for (size_t i = 1; i < tuple->size(); ++i) {
@@ -763,9 +763,7 @@ bool AscendDeviceAddress::AsyncDeviceToDevice(const ShapeVector & /* shape */, s
  return ret;
}

bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
                                            const void *host_ptr) const {
bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */, const void *host_ptr) const {
  MS_ERROR_IF_NULL(host_ptr);
  BindDevice();
  if (!MoveToDevice(false)) {

@@ -779,7 +777,8 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
  auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
  MS_EXCEPTION_IF_NULL(runtime_instance);

  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE, runtime_instance->compute_stream());
  auto ret = CALL_ASCEND_API(aclrtMemcpyAsync, GetDevicePtr(), size, host_ptr, size, ACL_MEMCPY_HOST_TO_DEVICE,
                             runtime_instance->compute_stream());
  if (ret != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "Call aclrtMemcpyAsync host to device failed, the error num[" << ret << "]";
    return false;

@@ -787,7 +786,6 @@ bool AscendDeviceAddress::AsyncHostToDevice(size_t size, TypeId /* type */,
  return true;
}

bool AscendDeviceAddress::AsyncHostToDevice(const ShapeVector & /* shape */, size_t size, TypeId /* type */,
                                            const void *host_ptr, size_t stream_id) const {
  MS_ERROR_IF_NULL(host_ptr);
@@ -17,7 +17,7 @@ if(DEFINED ENV{MS_INTERNAL_KERNEL_HOME})
        ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/collective_communication_lib.cc
        ${CMAKE_SOURCE_DIR}/mindspore/ccsrc/runtime/collective/communication_group.cc)
    set_property(SOURCE ${LOWLATENCY_COLLECTIVE_SRCS} PROPERTY COMPILE_DEFINITIONS
                 SUBMODULE_ID=mindspore::SubModuleId::SM_DEVICE)
    add_library(lowlatency_collective SHARED ${LOWLATENCY_COLLECTIVE_SRCS})
    target_link_libraries(lowlatency_collective PRIVATE lcal)
endif()
@@ -246,12 +246,13 @@ void GeDeviceResManager::CreateSessionAndGraphRunner() {
}

bool GeDeviceResManager::LoadCollectiveCommLib() {
  // If this is simulation, don't load any collective communication library.
  // If this is simulation, load dummy collective communication library.
  if (!common::GetEnv(kSimulationLevel).empty()) {
    collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
    return true;
  }
  // Ascend backend supports HCCL and LCCL collective communication libraries.
  if (!common::GetEnv("ENABLE_LCCL").empty()) {
  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
    std::string lowlatency_comm_lib_name = "liblowlatency_collective.so";
    auto loader = std::make_shared<CollectiveCommLibLoader>(lowlatency_comm_lib_name);
    MS_EXCEPTION_IF_NULL(loader);

@@ -265,7 +266,7 @@ bool GeDeviceResManager::LoadCollectiveCommLib() {
    auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
    collective_comm_lib_ = instance_func();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
    MS_LOG(WARNING) << "Loading LCCL because env ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
    MS_LOG(WARNING) << "Loading LCCL because env MS_ENABLE_LCCL is set to 1. Pay attention that LCCL only supports "
                       "single-node-multi-card mode in KernelByKernel for now.";
  } else {
    collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
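Usage note, not part of the diff: after this rename, the low-latency collective path is selected the same way the updated tests at the end of this commit do, for example:

import os

# MS_ENABLE_LCCL (renamed from ENABLE_LCCL in this commit) makes the Ascend backend load
# liblowlatency_collective.so instead of HCCL; the tests also keep GRAPH_OP_RUN=1 for
# KernelByKernel execution, the only mode LCCL supports per the warning above.
os.environ['MS_ENABLE_LCCL'] = '1'
os.environ['GRAPH_OP_RUN'] = '1'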
@@ -73,18 +73,7 @@ class GeDeviceResManager : public DeviceResManager {

  static void CreateSessionAndGraphRunner();

<<<<<<< HEAD
  bool LoadCollectiveCommLib() override {
    if (common::GetEnv(kSimulationLevel).empty()) {
      collective_comm_lib_ = &AscendCollectiveCommLib::GetInstance();
    } else {
      collective_comm_lib_ = &DummyAscendCollectiveCommLib::GetInstance();
    }
    return true;
  }
=======
  bool LoadCollectiveCommLib() override;
>>>>>>> Add lccl so.

  void ResetStreamAndCtx() override;
  bool BindDeviceToCurrentThread(bool force_bind) const override;
@@ -44,7 +44,7 @@ class LowlatencyCommunicationGroup : public CommunicationGroup {

  void *GenerateRootInfo(size_t *root_info_size) override;

  // Return communicator for collective communcation ops.
  // Return communicator for collective communication ops.
  const LcclPtr &lccl_communicator() const;
  // Return communicator of lcal.
  const LcalCommPtr &lcal_comm() const;
@@ -135,7 +135,7 @@ bool HcclKernel::Init(const std::vector<KernelTensor *> &inputs, const std::vect

  if (common::GetEnv(kSimulationLevel).empty() && !common::IsNeedProfileMemory()) {
#ifdef ENABLE_INTERNAL_KERNELS
    if (!common::GetEnv("ENABLE_LCCL").empty()) {
    if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
      LoadLcclLibrary();
    } else {
      LoadHcclLibrary();
@@ -31,7 +31,7 @@ bool HcomAllBroadCastKernel::Launch(const std::vector<KernelTensor *> &inputs, c
  MS_EXCEPTION_IF_NULL(stream_ptr);

#ifdef ENABLE_INTERNAL_KERNELS
  if (!common::GetEnv("ENABLE_LCCL").empty()) {
  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
    auto lccl_result =
      lccl_comm_->Broadcast(inputs[0]->device_ptr(), hccl_count_, hccl_data_type_list_[0], root_id_, stream_ptr);
    if (lccl_result != Lcal::LCAL_SUCCESS) {
@@ -32,7 +32,7 @@ bool HcomAllGatherKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
  MS_EXCEPTION_IF_NULL(stream_ptr);

#ifdef ENABLE_INTERNAL_KERNELS
  if (!common::GetEnv("ENABLE_LCCL").empty()) {
  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
    auto lccl_result = lccl_comm_->AllGather(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                             hccl_data_type_list_[0], stream_ptr);
    if (lccl_result != Lcal::LCAL_SUCCESS) {
@@ -38,7 +38,7 @@ bool HcomAllReduceKernel::Launch(const std::vector<KernelTensor *> &inputs, cons
  MS_EXCEPTION_IF_NULL(stream_ptr);

#ifdef ENABLE_INTERNAL_KERNELS
  if (!common::GetEnv("ENABLE_LCCL").empty()) {
  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
    auto lccl_result = lccl_comm_->AllReduce(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                             hccl_data_type_list_[0], op_type_, stream_ptr);
    if (lccl_result != Lcal::LCAL_SUCCESS) {
@@ -32,7 +32,7 @@ bool HcomAllReduceScatterKernel::Launch(const std::vector<KernelTensor *> &input
  MS_EXCEPTION_IF_NULL(stream_ptr);

#ifdef ENABLE_INTERNAL_KERNELS
  if (!common::GetEnv("ENABLE_LCCL").empty()) {
  if (!common::GetEnv("MS_ENABLE_LCCL").empty()) {
    auto lccl_result = lccl_comm_->ReduceScatter(inputs[0]->device_ptr(), outputs[0]->device_ptr(), hccl_count_,
                                                 hccl_data_type_list_[0], op_type_, stream_ptr);
    if (lccl_result != Lcal::LCAL_SUCCESS) {
@@ -62,7 +62,7 @@ int HcomMatMulAllReduceKernel::Resize(const std::vector<KernelTensor *> &inputs,
  // The dimensions of left and right matrices.
  matmul_info_.m = hccl_kernel_input_shape_list_[0][0];
  matmul_info_.k = hccl_kernel_input_shape_list_[0][1];
  matmul_info_.n = hccl_kernel_input_shape_list_[1][0];
  matmul_info_.n = hccl_kernel_input_shape_list_[1][1];
  matmul_info_.transA = transpose_a_;
  matmul_info_.transB = transpose_b_;
@@ -26,7 +26,6 @@ namespace kernel {
constexpr uint32_t kMatMulAllReduceInputNum = 2;
constexpr uint32_t kMatMulAllReduceOutputNum = 1;
constexpr char kAttrNameTransposeA[] = "transpose_a";
;
constexpr char kAttrNameTransposeB[] = "transpose_b";

class HcomMatMulAllReduceKernel : public HcclKernel {
@@ -85,7 +85,7 @@ class InternalSub : public ElewiseBinary {
    param_ptr->input2_dtype_ = InternalKernelUtils::ToInternalDType(inputs[kIndex1]->dtype_id());
    param_ptr->input1_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex0]->GetShapeVector());
    param_ptr->input2_dims_ = internal::VecToSVec<int64_t>(inputs[kIndex1]->GetShapeVector());

    return std::static_pointer_cast<internal::OpParam>(param_ptr);
  }
};
@@ -105,7 +105,6 @@ int InternalKernelMod::Resize(const std::vector<KernelTensor *> &inputs, const s
    }
  }
  std::vector<internal::DIMS> input_shapes(inputs_.size());
  std::vector<internal::DIMS> output_shapes;
  for (auto iter = inputsIdxMap_.begin(); iter != inputsIdxMap_.end(); iter++) {
    InternalKernelUtils::ToInternalTensor(inputs_[iter->second], inputs[iter->first]);
    input_shapes[iter->second] = inputs_[iter->second]->desc.dims;
@@ -136,13 +136,9 @@ void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
#ifdef ENABLE_INTERNAL_KERNELS
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::MultiMatmulsFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddLayernormFusion>());
  if (common::GetEnv("MS_ENABLE_INTERNAL_KERNELS") == "on") {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
  }
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddRmsNormFusion>());
  if (common::GetEnv("ENABLE_MATMUL_ALLREDUCE") == "on") {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
  }
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
#endif  // ENABLE_INTERNAL_KERNELS
}
@@ -15,11 +15,13 @@
 */

#include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
#include <set>
#include <vector>
#include "mindspore/core/ops/nn_ops.h"
#include "mindspore/core/ops/math_ops.h"
#include "mindspore/core/ops/other_ops.h"
#include "mindspore/core/ops/lite_ops.h"
#include "mindspore/core/utils/ms_context.h"
#include "include/backend/optimizer/helper.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"

@@ -101,6 +103,16 @@ AnfNodePtr MatMulAllReduceFusion::CreateMatMulAllReduceNode(const FuncGraphPtr &
const AnfNodePtr MatMulAllReduceFusion::Process(const mindspore::FuncGraphPtr &func_graph,
                                                const mindspore::AnfNodePtr &node,
                                                const mindspore::EquivPtr &equiv) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (!ms_context->IsEnableInferBoost()) {
    return nullptr;
  }

  if (common::GetEnv("DISABLE_MATMULALLREDUCE_FUSION") == "True") {
    return nullptr;
  }

  if (func_graph == nullptr || node == nullptr) {
    return nullptr;
  }
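A brief note on how this gate is driven (sketch only; the env-var flip is taken from the cell.py change later in this diff):

import os

def _toggle_matmul_allreduce_fusion(phase):
    # Mirrors what cell.py does in this commit: allow the fusion only while compiling the prefill phase.
    os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "False" if phase == "prefill" else "True"

_toggle_matmul_allreduce_fusion("prefill")    # MatMulAllReduceFusion may create fused nodes
_toggle_matmul_allreduce_fusion("increment")  # Process() returns nullptr early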
@@ -16,6 +16,7 @@
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_

#include <string>
#include <memory>
#include "include/backend/optimizer/optimizer.h"

@@ -24,7 +25,7 @@ namespace opt {
class MatMulAllReduceFusion : public PatternProcessPass {
 public:
  explicit MatMulAllReduceFusion(bool multigraph = true, const string &pass_name = "MatMulAllReduce")
      : PatternProcessPass(pass_name, multigraph){};
      : PatternProcessPass(pass_name, multigraph) {}
  ~MatMulAllReduceFusion() override = default;
  const BaseRef DefinePattern() const override;
  const AnfNodePtr Process(const FuncGraphPtr &graph, const AnfNodePtr &node, const EquivPtr &equiv) const override;

@@ -46,4 +47,4 @@ class MatMulAllReduceFusion : public PatternProcessPass {
}  // namespace opt
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_IR_FUSION_MATMUL_ALLREDUCE_FUSION_H_
@@ -15,13 +15,19 @@
 */
#include "plugin/device/ascend/optimizer/ir_fusion/multi_matmuls_fusion.h"

#include "mindspore/core/utils/ms_context.h"

namespace mindspore {
namespace opt {
bool MultiMatmulsFusion::Run(const FuncGraphPtr &graph) {
  bool changed = false;
  if (common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {

  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (!ms_context->IsEnableInferBoost() || common::GetEnv("ENABLE_MATMUL_FUSION") != "on") {
    return changed;
  }

  auto mng = graph->manager();
  MS_EXCEPTION_IF_NULL(mng);
  const auto &node_users_map = mng->node_users();
@@ -22,6 +22,7 @@
#include "mindspore/core/ops/reshape_ext.h"
#include "mindspore/core/ops/scalar_graph_holder.h"
#include "mindspore/core/ops/array_ops.h"
#include "mindspore/core/utils/ms_context.h"
#include "include/common/utils/anfalgo.h"
#include "mindspore/ccsrc/include/common/utils/utils.h"
#include "plugin/device/ascend/optimizer/get_value_helper.h"

@@ -200,6 +201,12 @@ const BaseRef ShapeReshapeFusion::DefinePattern() const {

const AnfNodePtr ShapeReshapeFusion::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                             const EquivPtr &equiv) const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if (!ms_context->IsEnableInferBoost()) {
    return nullptr;
  }

  MS_EXCEPTION_IF_NULL(func_graph);
  MS_EXCEPTION_IF_NULL(equiv);
@@ -825,9 +825,9 @@ void DataPrepareActor::PrepareDataForHostTensorQueueNew(const VectorRef &args, O
                  << " for input parameter:" << origin_parameter->fullname_with_scope();

    if (!isDyn) {
      if(host_tensors_[tensor_position] != input_tensor->shape()) {
        isDyn = true;
      }
      if (host_tensors_[tensor_position] != input_tensor->shape()) {
        isDyn = true;
      }
    }
    host_tensors_[tensor_position] = input_tensor->shape();
    host_tensors[tensor_position] = input_tensor;
@@ -23,6 +23,7 @@
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#include "kernel/common_utils.h"
#include "mindspore/core/utils/ms_context.h"

namespace mindspore {
namespace runtime {

@@ -278,6 +279,8 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const
}

void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_EXCEPTION_IF_NULL(context);
  if (IsRunningFailed(context)) {
    return;

@@ -328,11 +331,20 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *cons
      continue;
    }

    if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
                                          host_tensor->data_ptr()->data())) {
      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
    if (ms_context->IsEnableInferBoost()) {
      if (!device_tensor->AsyncHostToDevice(LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
                                            host_tensor->data_ptr()->data())) {
        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
      }
    } else {
      if (!device_tensor->SyncHostToDevice(
            trans::GetRuntimePaddingShape(data_node_with_indexs_[i].first, data_node_with_indexs_[i].second),
            LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(),
            host_tensor->device_info().host_format_, host_tensor->data_ptr())) {
        SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "SyncHostToDevice failed.");
      }
    }

    if (IsDynamic(device_tensor->host_shape())) {
      device_tensor->set_host_shape(host_tensor->shape());
    }
@@ -27,7 +27,6 @@ void KernelInferActor::Init() {
  if (memory_free_list_.size() > input_num) {
    memory_free_list_.erase(memory_free_list_.begin() + input_num, memory_free_list_.end());
  }

}

void KernelInferActor::RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) {
@@ -211,8 +211,6 @@ void SuperKernelActor::Run(OpContext<DeviceTensor> *const context) {
    return RunGraphKernelByKernel(context);
  }

  device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(AddTask, GetAID().Name(), "", graph_->ToString());

  if (device_contexts_.empty() || device_contexts_[0] == nullptr) {
    SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Invalid device context for super kernel actor:" + GetAID().Name());
  }
@@ -662,14 +662,14 @@ void GraphScheduler::SpawnMultiPipelineActor(ActorSet *const actor_set, ActorThr
  }

  // If enable runtime multi pipeline, async launch kernel will be enabled.
<<<<<<< HEAD
  ActorDispatcher::set_enable_runtime_multi_pipeline(EnableRuntimePipeline() && actor_set->has_dynamic_shape_ &&
                                                     !actor_set->kernel_actors_.empty() &&
                                                     default_actor_thread_num_ > kMultiPipelineThreadNum);
>>>>>>> 80d1685cc13... lzy: Support run graph without kernel actor for kbyk mode
  ActorDispatcher::set_enable_runtime_multi_pipeline(
    enable_runtime_pipeline && actor_set->has_dynamic_shape_ &&
    (EnableKbkSubGraphExecute() || !actor_set->kernel_actors_.empty()) &&
    default_actor_thread_num_ > kMultiPipelineThreadNum);
  if (ActorDispatcher::enable_runtime_multi_pipeline() && !already_spawn_kernel_async_infer_resize_actor_) {
    size_t current_actor_thread_num = thread_pool->GetActorThreadNum();
    MS_LOG(INFO) << "Enable runtime multi pipeline, default actor thread num: " << default_actor_thread_num_
                 << ", current actor thread num: " << current_actor_thread_num;
    if (current_actor_thread_num != default_actor_thread_num_) {
      thread_pool->SetActorThreadNum(default_actor_thread_num_);
      MS_LOG(DEBUG) << "Reset actor thread number to: " << default_actor_thread_num_;
@@ -26,6 +26,5 @@
namespace mindspore {
namespace ops {
MIND_API_OPERATOR_IMPL(MatMulAllReduce, MatMul);
REGISTER_PRIMITIVE_C(kMatMulAllReduce, MatMulAllReduce);
}  // namespace ops
}  // namespace mindspore
@@ -34,8 +34,10 @@
#include "mindapi/src/helper.h"
#include "mindspore/core/ops/math_ops.h"
#include "ops/mat_mul.h"
#include "ops/fusion/matmul_allreduce.h"
#include "ops/op_name.h"
#include "ops/primitive_c.h"
#include "ops/lite_ops.h"
#include "utils/check_convert_utils.h"
#include "utils/convert_utils_base.h"
#include "utils/log_adapter.h"

@@ -169,5 +171,6 @@ class MatMulInfer : public abstract::OpInferBase {
};

REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMul, prim::kPrimMatMul, MatMulInfer, false);
REGISTER_PRIMITIVE_OP_INFER_IMPL(MatMulAllReduce, prim::kPrimMatMulAllReduce, MatMulInfer, false);
}  // namespace ops
}  // namespace mindspore
@@ -62,8 +62,10 @@ GVAR_DEF(PrimitivePtr, kPrimTensorAdd, std::make_shared<Primitive>("TensorAdd"))
GVAR_DEF(PrimitivePtr, kPrimAddV2, std::make_shared<Primitive>(kAddV2OpName));
GVAR_DEF(PrimitivePtr, kPrimAddLayerNorm, std::make_shared<Primitive>("AddLayerNorm"));
GVAR_DEF(PrimitivePtr, kPrimAddRmsNorm, std::make_shared<Primitive>("AddRmsNorm"));
GVAR_DEF(PrimitivePtr, kPrimMatMul, std::make_shared<Primitive>("MatMul"));
GVAR_DEF(PrimitivePtr, kPrimMatMulV2, std::make_shared<Primitive>("MatMulV2"));
GVAR_DEF(PrimitivePtr, kPrimMatrixDiag, std::make_shared<Primitive>("MatrixDiag"));
GVAR_DEF(PrimitivePtr, kPrimBatchMatMul, std::make_shared<Primitive>("BatchMatMul"));
GVAR_DEF(PrimitivePtr, kPrimBatchMatMulV2, std::make_shared<Primitive>("BatchMatMulV2"));
GVAR_DEF(PrimitivePtr, kPrimFusedMatMulBiasAdd, std::make_shared<Primitive>("FusedMatMulBiasAdd"));
GVAR_DEF(PrimitivePtr, kPrimMinimumGradGrad, std::make_shared<Primitive>("MinimumGradGrad"));
@@ -24,6 +24,7 @@
#include <functional>
#include <mutex>
#include <vector>
#include <optional>
#include "utils/log_adapter.h"
#include "utils/ms_utils.h"

@@ -402,7 +403,7 @@ inline void MsContext::increase_param<uint32_t>(MsCtxParam param) {
  uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]++;
}

// decreate method implementation for type uint32_t
// decrease method implementation for type uint32_t
template <>
inline void MsContext::decrease_param<uint32_t>(MsCtxParam param) {
  uint32_params_[param - MS_CTX_TYPE_UINT32_BEGIN]--;
@@ -302,12 +302,13 @@ def _handle_arg_predict(obj, arg, compile_arg):
        return None

    if isinstance(arg, (list, tuple)):
        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and getattr(compile_arg, "__ms_mutable__"):
        if compile_arg is not None and hasattr(compile_arg, "__ms_mutable__") and \
                getattr(compile_arg, "__ms_mutable__"):
            # mutable([]) will be eliminated by FuncGraphSpecializer, and empty list is not supported by backend.
            if isinstance(arg, list) and not arg:
                return None
            return arg
        elif hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
        if hasattr(obj, "enable_tuple_broaden") and obj.enable_tuple_broaden and isinstance(arg, tuple) and \
                _check_all_tensor(arg):
            return arg
    return None
@@ -60,7 +60,7 @@ class JitConfig:

        infer_boost (str, optional): enable infer boost mode.
            The value must be ``"on"`` , ``"off"``. Default to an "off", which means that disable infer boost.
            when infer boost mode is enabled, mindspore will use high perf kernel lib, use faster runtime make
            infer speed is best.
            Note: current infer boost only support jit_level == O0 and device is Ascend910B
            Default: ``"off"`` .

@@ -78,16 +78,17 @@ class JitConfig:
        >>>
        >>> net.set_jit_config(jitconfig)
    """
    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE", infer_boost="off", **kwargs):
    def __init__(self, jit_level="", exc_mode="auto", jit_syntax_level="", debug_level="RELEASE",
                 infer_boost="off", **kwargs):
        if jit_level not in ["", "O0", "O1", "O2"]:
            raise ValueError("For 'jit_level' must be one of ['O0', 'O1', 'O2'].")
        if exc_mode not in ['auto', 'sink', 'no_sink']:
        if exc_mode not in ["auto", "sink", "no_sink"]:
            raise ValueError("For 'exc_mode' must be one of '['auto', 'sink', 'no_sink']'.")
        if jit_syntax_level != "" and jit_syntax_level not in ['STRICT', 'COMPATIBLE', 'LAX']:
        if jit_syntax_level != "" and jit_syntax_level not in ["STRICT", "COMPATIBLE", "LAX"]:
            raise ValueError("For 'jit_syntax_level' must be one of '['STRICT', 'LAX']'.")
        if debug_level not in ['RELEASE', 'DEBUG']:
        if debug_level not in ["RELEASE", "DEBUG"]:
            raise ValueError("For 'debug_level' must be one of '['RELEASE', 'DEBUG']'.")
        if infer_boost != "" and infer_boost not in ['on', 'off']:
        if infer_boost != "" and infer_boost not in ["on", "off"]:
            raise ValueError("For 'infer_boost' must be one of '['on', 'off']'.")
        self.jit_config_dict = kwargs
        self.jit_config_dict["jit_level"] = jit_level
@@ -667,6 +667,8 @@ class Cell(Cell_):


    def _predict(self, *args, **kwargs):
        if not hasattr(self, "phase"):
            return False, None
        if (self.phase == "prefill" or self.phase == 'increment') and self.phase in self.phase_cache:
            new_args = _get_args_for_run_predict(self, args, kwargs, self._compile_args)
            res = _cell_graph_executor._graph_executor(tuple(new_args), self.phase_cache[self.phase])

@@ -687,7 +689,7 @@ class Cell(Cell_):
        if predict_compiled:
            return res
        self._check_construct_args(*args)

        if self._hook_fn_registered():
            logger.warning(f"For 'Cell', it's not support hook function in graph mode. If you want to use hook "
                           f"function, please use context.set_context to set pynative mode.")

@@ -993,9 +995,9 @@ class Cell(Cell_):
            kwargs (dict): Kwargs of the Cell object.
        """
        if self.phase == "prefill":
            os.environ["ENABLE_MATMUL_ALLREDUCE"] = "on"
            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "False"
        else:
            os.environ["ENABLE_MATMUL_ALLREDUCE"] = ""
            os.environ["DISABLE_MATMULALLREDUCE_FUSION"] = "True"
        self._compile_args = self._get_compile_args(args)
        _cell_graph_executor.compile(self, *self._compile_args, phase=self.phase,
                                     jit_config_dict=self._jit_config_dict, **kwargs)
@@ -118,7 +118,7 @@ from .nn_ops import (LSTM, SGD, Adam, AdamWeightDecay, FusedSparseAdam, FusedSpa
                     FractionalMaxPool, FractionalMaxPool3DWithFixedKsize, FractionalMaxPoolWithFixedKsize,
                     GridSampler2D, TripletMarginLoss, UpsampleNearest3D, UpsampleTrilinear3D, PadV3, ChannelShuffle,
                     GLU, MaxUnpool3D, Pdist, RmsNorm, PagedAttention, PagedAttentionMask, ReshapeAndCache,
                     ApplyRotaryPosEmb, MatmulQkv, MatmulFfn)
                     ApplyRotaryPosEmb)
from .other_ops import (Assign, IOU, BoundingBoxDecode, BoundingBoxEncode,
                        ConfusionMatrix, UpdateState, Load, StopGradient, Reusing,
                        CheckValid, Partial, Depend, Push, Pull, PyExecute, PyFunc, _DynamicLossScale,

@@ -695,8 +695,6 @@ __all__ = [
    "ReshapeAndCache",
    "ApplyRotaryPosEmb",
    "RmsNorm",
    "MatmulQkv",
    "MatmulFfn",
]

__custom__ = [
@@ -10196,27 +10196,3 @@ class RmsNorm(Primitive):
        """Initialize Dense."""
        validator.check_value_type("epsilon", epsilon, [float], self.name)
        self.init_prim_io_names(inputs=['x', 'gamma'], outputs=["y", "rstd"])


class MatmulQkv(Primitive):
    r"""
    Fuse three matmul ops for q k v attention into one
    """

    @prim_attr_register
    def __init__(self):
        """Initialize"""
        self.init_prim_io_names(inputs=['hidden_states', 'weight_q', 'weight_k', 'weight_v'],
                                outputs=["output_q", "output_k", "output_v"])


class MatmulFfn(Primitive):
    r"""
    Fuse two matmul ops for feed forward into one
    """

    @prim_attr_register
    def __init__(self):
        """Initialize"""
        self.init_prim_io_names(inputs=['hidden_states', 'weight_gate', 'weight_up'],
                                outputs=["output_gate", "output_up"])
@@ -17,26 +17,26 @@
if [ "$(uname)" == Linux ]; then
  if [ -n "${MS_INTERNAL_KERNEL_HOME}" ]; then
    echo "Use local MS_INTERNAL_KERNEL_HOME : ${MS_INTERNAL_KERNEL_HOME}"
  else
    file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
    lib_file=${file_path}/ms_kernels_internal.tar.gz
    if [ -f "${lib_file}" ]; then
      file_lines=`cat "${lib_file}" | wc -l`
      if [ ${file_lines} -ne 3 ]; then
        tar -zxf ${lib_file} -C ${file_path}
        if [ $? -eq 0 ]; then
          echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
          export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
          echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
        else
          echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
        fi
      else
        echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
        echo "[WARNING] `git lfs install` and retry downloading using `git lfs pull`."
      fi
    else
      echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
    fi
# else
#   file_path=${BASEPATH}/mindspore/ccsrc/plugin/device/ascend/kernel/internal/prebuild
#   lib_file=${file_path}/ms_kernels_internal.tar.gz
#   if [ -f "${lib_file}" ]; then
#     file_lines=`cat "${lib_file}" | wc -l`
#     if [ ${file_lines} -ne 3 ]; then
#       tar -zxf ${lib_file} -C ${file_path}
#       if [ $? -eq 0 ]; then
#         echo "Unzip ms_kernel_internal.tar.gz SUCCESS!"
#         export MS_INTERNAL_KERNEL_HOME="${file_path}/ms_kernels_internal"
#         echo "MS_INTERNAL_KERNEL_HOME = ${MS_INTERNAL_KERNEL_HOME}"
#       else
#         echo "[WARNING] Unzip ms_kernel_internal.tar.gz FAILED!"
#       fi
#     else
#       echo "[WARNING] The file ms_kernel_internal.tar.gz is not pulled. Please ensure git-lfs is installed by"
#       echo "[WARNING] 'git lfs install' and retry downloading using 'git lfs pull'."
#     fi
#   else
#     echo "[WARNING] The file ms_kernel_internal.tar.gz does NOT EXIST."
#   fi
  fi
fi
@@ -27,7 +27,7 @@ def test_lccl_allreduce():
    Description: msrun lccl all_reduce 8P case.
    Expectation: success
    """
    os.environ['ENABLE_LCCL'] = str(1)
    os.environ['MS_ENABLE_LCCL'] = str(1)
    os.environ['GRAPH_OP_RUN'] = str(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allreduce.py")

@@ -44,7 +44,7 @@ def test_lccl_allgather():
    Description: msrun lccl all_gather 8P case.
    Expectation: success
    """
    os.environ['ENABLE_LCCL'] = str(1)
    os.environ['MS_ENABLE_LCCL'] = str(1)
    os.environ['GRAPH_OP_RUN'] = str(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_allgather.py")

@@ -61,7 +61,7 @@ def test_lccl_reducescatter():
    Description: msrun lccl reduce_scatter 8P case.
    Expectation: success
    """
    os.environ['ENABLE_LCCL'] = str(1)
    os.environ['MS_ENABLE_LCCL'] = str(1)
    os.environ['GRAPH_OP_RUN'] = str(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True "

@@ -79,7 +79,7 @@ def test_lccl_broadcast():
    Description: msrun lccl broadcast 8P case.
    Expectation: success
    """
    os.environ['ENABLE_LCCL'] = str(1)
    os.environ['MS_ENABLE_LCCL'] = str(1)
    os.environ['GRAPH_OP_RUN'] = str(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    return_code = os.system("msrun --worker_num=8 --local_worker_num=8 --join=True pytest -s test_lccl_broadcast.py")
@@ -24,7 +24,6 @@ from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.communication.management import init, HCCL_WORLD_COMM_GROUP, get_rank, get_group_size
from mindspore.ops import operations as P
from mindspore.ops.operations import _inner_ops as inner

context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
@@ -27,7 +27,6 @@ def test_msrun():
    Description: Launch distributed training job with dynamic cluster using msrun.
    Expectation: All workers are successfully spawned and running training.
    """
    os.environ['ENABLE_LCCL'] = str(1)
    os.environ['GRAPH_OP_RUN'] = str(1)
    return_code = os.system(
        "msrun --worker_num=4 --local_worker_num=4 --master_addr=127.0.0.1 "\
@@ -263,6 +263,8 @@ list(REMOVE_ITEM MINDSPORE_SRC_LIST
    "../../../mindspore/ccsrc/runtime/graph_scheduler/actor/embedding_cache/embedding_cache_prefetch_actor.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST
    "../../../mindspore/ccsrc/plugin/device/ascend/hal/profiler/parallel_strategy_profiling.cc")
list(REMOVE_ITEM MINDSPORE_SRC_LIST
    "../../../mindspore/ccsrc/plugin/device/ascend/kernel/hccl/hcom_matmul_all_reduce.cc")

add_library(_ut_mindspore_obj STATIC ${MINDSPORE_SRC_LIST} $<TARGET_OBJECTS:core_proto_obj> $<TARGET_OBJECTS:mindrt_mid>
            $<TARGET_OBJECTS:common_shared_lib_obj> $<TARGET_OBJECTS:_mindspore_utils_obj>