commit b1c4038a5b

@@ -322,7 +322,7 @@ bool HcclKernel::Launch(const std::vector<AddressPtr> &inputs, const std::vector
   op_info.outputPtr = outputs[0]->addr;
   op_info.dataType = static_cast<HcclDataType>(data_type);
   op_info.opType = static_cast<HcclReduceOp>(op_type_);
-  op_info.root = IntToUint(root_id_);
+  op_info.root = root_id_;
   op_info.count = hccl_count_;

   auto callback = [this](HcclResult status) {
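
Note: if root_id_ is already an unsigned 32-bit member (its declaration is not part of this hunk, so this is an assumption), the IntToUint helper adds nothing and a plain assignment is enough. A minimal standalone sketch of that situation, with made-up names:

    #include <cstdint>
    #include <iostream>

    // Hypothetical stand-in for the HCCL op descriptor; the field name is illustrative only.
    struct OpInfo {
      uint32_t root = 0;
    };

    int main() {
      const uint32_t root_id = 3;  // already unsigned, so no int-to-uint conversion helper is needed
      OpInfo op_info;
      op_info.root = root_id;      // direct assignment: same type, no narrowing
      std::cout << op_info.root << std::endl;
      return 0;
    }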

@@ -19,6 +19,7 @@
 #include <memory>
 #include <vector>

 #include "plugin/device/ascend/kernel/hccl/hccl_kernel.h"

 namespace mindspore {

@@ -158,7 +158,7 @@ bool HcomUtil::GetHcomCount(const AnfNodePtr &anf_node, const vector<HcclDataTyp
     }
     size_t actual_input_size = input_size;
     if (common::AnfAlgo::HasNodeAttr(kAttrFusion, cnode) &&
-        common::AnfAlgo::GetNodeAttr<int64_t>(anf_node, kAttrFusion)) {
+        common::AnfAlgo::GetNodeAttr<int64_t>(anf_node, kAttrFusion) != 0) {
       actual_input_size = (input_size + align_size - 1 + filled_size) / align_size * align_size;
     }
     block_size = static_cast<uint64_t>(actual_input_size / LongToSize(rank_size));
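
Note: kAttrFusion is read as an int64_t, so using the attribute value directly as an operand of && relies on an implicit integer-to-bool conversion; comparing against 0 states the intent explicitly. A minimal sketch of the pattern, with a made-up getter standing in for GetNodeAttr<int64_t>:

    #include <cstdint>
    #include <iostream>

    // Hypothetical attribute getter standing in for common::AnfAlgo::GetNodeAttr<int64_t>.
    int64_t GetFusionAttr() { return 1; }

    int main() {
      const bool has_attr = true;  // stands in for HasNodeAttr(kAttrFusion, cnode)
      // The explicit "!= 0" reads as "the fusion attribute is set and non-zero"
      // instead of relying on an implicit int64_t -> bool conversion.
      if (has_attr && GetFusionAttr() != 0) {
        std::cout << "fusion path taken" << std::endl;
      }
      return 0;
    }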

@@ -219,7 +219,7 @@ bool HcomUtil::GetHcomRootId(const AnfNodePtr &anf_node, uint32_t *root_id) {
   auto primitive = common::AnfAlgo::GetCNodePrimitive(anf_node);
   MS_EXCEPTION_IF_NULL(primitive);
   if (primitive->GetAttr(kAttrRootRank) != nullptr) {
-    *root_id = (uint32_t)GetValue<int64_t>(primitive->GetAttr(kAttrRootRank));
+    *root_id = static_cast<uint32_t>(GetValue<int64_t>(primitive->GetAttr(kAttrRootRank)));
   } else {
     MS_LOG(ERROR) << "HcomUtil::Get HCOM_ATTR_ROOT_INDEX fail, not support!";
     return false;
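
Note: the C-style cast is replaced with static_cast, which documents the deliberate int64_t-to-uint32_t narrowing and prevents the cast from silently degrading into a reinterpret_cast or const_cast. A minimal sketch with illustrative names:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t root_rank_attr = 7;  // stands in for GetValue<int64_t>(attr)
      // static_cast makes the narrowing conversion explicit; a C-style cast would
      // also accept unrelated cast kinds without any diagnostic.
      const uint32_t root_id = static_cast<uint32_t>(root_rank_attr);
      std::cout << root_id << std::endl;
      return 0;
    }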

@@ -58,7 +58,7 @@ static map<HcclDataType, uint32_t> kConstOpHcomDataTypeSizeMap = {

 class HcomUtil {
  public:
-  static bool GetKernelInputShape(const AnfNodePtr &anf_node, vector<ShapeVector> *hccl_kernel_shape_list);
+  static bool GetKernelInputShape(const AnfNodePtr &anf_node, vector<ShapeVector> *hccl_kernel_intput_shape_list);
   static bool GetKernelOutputShape(const AnfNodePtr &anf_node, vector<ShapeVector> *hccl_kernel_shape_list);
   static ::HcclDataType ConvertHcclType(TypeId type_id);
   static bool GetHcomDataType(const AnfNodePtr &anf_node, vector<HcclDataType> *data_type_list);

@@ -144,14 +144,13 @@ std::vector<int64_t> GetInputShape(const CNodePtr &cnode, size_t index) {
   x_shape_value->set_device_address(address_x, false);
   x_shape_value->data_sync();

-  auto x_value = reinterpret_cast<int64_t *>(x_shape_value->data_c());
+  auto x_value = static_cast<int64_t *>(x_shape_value->data_c());
   MS_EXCEPTION_IF_NULL(x_value);
   std::vector<int64_t> input_shape = {x_value, x_value + x_num};
   return input_shape;
 }

-size_t SetOutputValue(const CNodePtr &cnode, const std::vector<std::vector<int64_t>> &grad_reduce_idx, size_t index,
-                      size_t input_num) {
+size_t SetOutputValue(const CNodePtr &cnode, const std::vector<std::vector<int64_t>> &grad_reduce_idx, size_t index) {
   std::vector<int64_t> output;
   size_t out_size = grad_reduce_idx[index].size();
   for (size_t k = 0; k < out_size; ++k) {
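
Note: assuming data_c() returns void* (which is what the static_cast implies), static_cast<int64_t *> is the correct and sufficient conversion; reinterpret_cast is not needed for void*. The same hunk also drops the trailing input_num parameter, which the visible body does not reference and which the call sites below stop passing. A small standalone sketch of both points, with invented names:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Hypothetical buffer whose data_c() mimics a void* accessor such as a tensor's data_c().
    struct Buffer {
      std::vector<int64_t> storage{4, 5, 6};
      void *data_c() { return storage.data(); }
    };

    // The unused trailing parameter has been removed; the remaining arguments are all the
    // function needs.
    size_t SumValues(Buffer *buf, size_t count) {
      // static_cast is sufficient (and preferred) for converting from void*.
      auto *values = static_cast<int64_t *>(buf->data_c());
      size_t total = 0;
      for (size_t i = 0; i < count; ++i) {
        total += static_cast<size_t>(values[i]);
      }
      return total;
    }

    int main() {
      Buffer buf;
      std::cout << SumValues(&buf, buf.storage.size()) << std::endl;  // prints 15
      return 0;
    }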

@@ -188,7 +187,7 @@ size_t SetOutputValue(const CNodePtr &cnode, const std::vector<std::vector<int64
 }
 }  // namespace

-void DynamicBroadcastGradientArgsKernelMod::Execute() {
+void DynamicBroadcastGradientArgsKernelMod::Execute() const {
   MS_LOG(INFO) << "Execute DynamicBroadcastGradientArgsKernel Start";
   auto node = anf_node_.lock();
   MS_EXCEPTION_IF_NULL(node);
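
Note: Execute() only reads members such as anf_node_ here, so it can be const-qualified; the matching declaration changes appear in the header hunks below, and the same treatment is applied to TensorShapeKernelMod::Execute(). A minimal sketch of the pattern with made-up class and member names:

    #include <iostream>
    #include <string>
    #include <utility>

    // Illustrative class only; not the MindSpore kernel interface.
    class ShapeReporter {
     public:
      explicit ShapeReporter(std::string name) : name_(std::move(name)) {}

      // const: the method only inspects members, so it can be called on const objects
      // and the compiler rejects any accidental mutation inside it.
      void Execute() const { std::cout << "executing " << name_ << std::endl; }

     private:
      std::string name_;
    };

    int main() {
      const ShapeReporter reporter("TensorShape");
      reporter.Execute();  // legal because Execute() is const-qualified
      return 0;
    }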

@@ -205,8 +204,8 @@ void DynamicBroadcastGradientArgsKernelMod::Execute() {
   input_shapes[1] = GetInputShape(cnode, 1);
   auto grad_reduce_idx = CalculateOutput(input_shapes);

-  auto r0_size = SetOutputValue(cnode, grad_reduce_idx, 0, input_shapes[0].size());
-  auto r1_size = SetOutputValue(cnode, grad_reduce_idx, 1, input_shapes[1].size());
+  auto r0_size = SetOutputValue(cnode, grad_reduce_idx, 0);
+  auto r1_size = SetOutputValue(cnode, grad_reduce_idx, 1);

   ShapeVector r0_shp{SizeToLong(r0_size)};
   ShapeVector r1_shp{SizeToLong(r1_size)};

@@ -15,9 +15,10 @@
  */
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_HOST_DYNAMIC_BROADCAST_GRADIENT_ARGS_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_HOST_DYNAMIC_BROADCAST_GRADIENT_ARGS_KERNEL_H_
-#include <vector>
 #include <memory>
+#include <string>
+#include <vector>

 #include "plugin/device/ascend/kernel/host/host_kernel_mod.h"

 namespace mindspore {

@@ -30,7 +31,7 @@ class DynamicBroadcastGradientArgsKernelMod : public HostKernelMod {
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override;

  private:
-  void Execute();
+  void Execute() const;
 };
 MS_HOST_REG_KERNEL(DynamicBroadcastGradientArgs, DynamicBroadcastGradientArgsKernelMod);
 }  // namespace kernel

@@ -23,7 +23,7 @@

 namespace mindspore {
 namespace kernel {
-void TensorShapeKernelMod::Execute() {
+void TensorShapeKernelMod::Execute() const {
   MS_LOG(INFO) << "Execute TensorShapeKernel Start";
   auto node = anf_node_.lock();
   MS_EXCEPTION_IF_NULL(node);

@@ -64,50 +64,16 @@ void TensorShapeKernelMod::Execute() {
     if (!ret) {
       MS_LOG(EXCEPTION) << "Sync stream error!";
     }
-    output_addr->SyncHostToDevice(output_shape, LongToSize(output_tensor_for_sync->data().nbytes()),
-                                  output_tensor_for_sync->data_type(), output_tensor_for_sync->data_c(),
-                                  output_tensor_for_sync->device_info().host_format_);
+    if (!output_addr->SyncHostToDevice(output_shape, LongToSize(output_tensor_for_sync->data().nbytes()),
+                                       output_tensor_for_sync->data_type(), output_tensor_for_sync->data_c(),
+                                       output_tensor_for_sync->device_info().host_format_)) {
+      MS_LOG(EXCEPTION) << "TensorShapeKernel SyncHostToDevice failed.";
+    }
   }

   MS_LOG(INFO) << "Execute TensorShapeKernel End";
 }

-void TensorShapeKernelMod::Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
-  MS_LOG(INFO) << "Execute TensorShapeKernel Start";
-  auto node = anf_node_.lock();
-  MS_EXCEPTION_IF_NULL(node);
-  auto cnode = node->cast<CNodePtr>();
-  MS_EXCEPTION_IF_NULL(cnode);
-  auto input_num = common::AnfAlgo::GetInputTensorNum(cnode);
-  if (input_num != 1) {
-    MS_LOG(EXCEPTION) << "Op [" << cnode->DebugString() << "] has invalid input num, should be 1, but got " << input_num
-                      << trace::DumpSourceLines(cnode);
-  }
-
-  auto prev_output_shape = common::AnfAlgo::GetPrevNodeOutputInferShape(cnode, 0);
-  std::vector<int64_t> output_shape = {SizeToLong(prev_output_shape.size())};
-
-  auto output_type = TypeId::kNumberTypeInt64;
-
-  auto output_tensor_for_sync = std::make_shared<tensor::Tensor>(output_type, output_shape);
-  MS_EXCEPTION_IF_NULL(output_tensor_for_sync);
-  auto data_ptr = static_cast<int64_t *>(output_tensor_for_sync->data_c());
-  for (size_t i = 0; i < prev_output_shape.size(); ++i) {
-    MS_LOG(INFO) << "DEBUG prev_output_shape[" << i << "]:" << prev_output_shape[i];
-    *(data_ptr + i) = prev_output_shape[i];
-  }
-
-  if (outputs.empty()) {
-    MS_LOG(EXCEPTION) << "Output address of DynamicShape is empty";
-  }
-  auto status = rtMemcpyAsync(outputs[0]->addr, outputs[0]->size, output_tensor_for_sync->data_c(),
-                              LongToSize(output_tensor_for_sync->data().nbytes()), RT_MEMCPY_HOST_TO_DEVICE, stream_);
-  if (status != RT_ERROR_NONE) {
-    MS_LOG(EXCEPTION) << "Execute TensorShapeKernel rtMemcpyAsync failed!";
-  }
-  MS_LOG(INFO) << "Execute TensorShapeKernel End";
-}
-
 bool TensorShapeKernelMod::Launch(const std::vector<AddressPtr> &, const std::vector<AddressPtr> &,
                                   const std::vector<AddressPtr> &, void *stream_ptr) {
   auto node = anf_node_.lock();
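
Note: SyncHostToDevice evidently returns a success flag, and the new code stops discarding it, turning a silent copy failure into an exception; the now-redundant Execute(inputs, outputs) overload is deleted outright. A minimal sketch of the "check the boolean result and fail loudly" pattern, using invented names rather than the real device-address API:

    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    // Hypothetical stand-in for a host-to-device copy that reports success as a bool.
    bool SyncHostToDevice(const std::vector<int64_t> &host, std::vector<int64_t> *device) {
      if (device == nullptr) {
        return false;  // simulated failure path
      }
      *device = host;
      return true;
    }

    int main() {
      const std::vector<int64_t> host_shape{2, 3, 4};
      std::vector<int64_t> device_shape;
      // The return value is checked instead of discarded, so a failed copy cannot be
      // mistaken for success by later code that consumes device_shape.
      if (!SyncHostToDevice(host_shape, &device_shape)) {
        throw std::runtime_error("SyncHostToDevice failed");
      }
      std::cout << "copied " << device_shape.size() << " dims" << std::endl;
      return 0;
    }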

@@ -30,8 +30,7 @@ class TensorShapeKernelMod : public HostKernelMod {
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override;

  private:
-  void Execute();
-  void Execute(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
+  void Execute() const;
 };
 MS_HOST_REG_KERNEL(DynamicShape, TensorShapeKernelMod);
 MS_HOST_REG_KERNEL(TensorShape, TensorShapeKernelMod);

@@ -17,7 +17,6 @@
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_HOST_HOST_KERNEL_META_DATA_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_HOST_HOST_KERNEL_META_DATA_H_

 #include <string>
 #include <vector>
 #include <memory>
 #include "kernel/kernel_build_info.h"