!26682 GraphKernel For CPU

Merge pull request !26682 from ZengZitao/gk_cpu
i-robot 2021-11-27 06:23:01 +00:00 committed by Gitee
commit 2bd6deaeeb
12 changed files with 267 additions and 33 deletions
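In brief: this change turns on the GraphKernel fusion pipeline for the CPU backend. AKG CPU kernels are now launched through an OpenMP parallel region instead of the common ThreadPool, the kernel launch function is resolved once at construction time instead of on every Launch, C++ expanders are added for Adam, Softplus and SoftplusGrad, and CPU entries are added to the clusterable and expandable operator lists.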

View File

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""expanders init"""
+"""expanders init. Deprecated, please add the new operators in the c++ file"""
 from .addn import AddN
 from .assign_add import AssignAdd

View File

@@ -1198,8 +1198,8 @@ class GraphSplitCpu(GraphSplitByPattern):
     def get_default_mode(self, op):
         """Get default mode in CPU"""
-        pattern = PrimLib.iter_type(op)
-        return self.Area.MODE_BASIC if pattern == PrimLib.RESHAPE else self.Area.MODE_COMPOSITE
+        del op
+        return self.Area.MODE_COMPOSITE
 
     def pattern_fuse(self, fuse_func=None):
         """fuse Areas by pattern"""

View File

@@ -114,6 +114,7 @@ if(ENABLE_AKG AND ${CMAKE_SYSTEM_NAME} MATCHES "Linux")
     list(APPEND AKG_SRC_LIST ${AKG_D_SRC_LIST})
 endif()
 if(ENABLE_CPU)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
     file(GLOB_RECURSE AKG_CPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "akg/cpu/*.cc"
     )
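
Note: appending -fopenmp to CMAKE_CXX_FLAGS enables OpenMP for everything compiled in this scope and works for GCC and Clang. A more portable alternative (not used by this patch) is find_package(OpenMP) plus linking the OpenMP::OpenMP_CXX imported target, which scopes both the compile flag and the runtime library to the targets that need them.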

View File

@@ -17,6 +17,8 @@
 #include "backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.h"
 #include <dlfcn.h>
+#include <omp.h>
+#include <thread>
 #include <algorithm>
 #include <memory>
 #include <utility>
@@ -28,24 +30,29 @@
 namespace mindspore {
 namespace kernel {
 namespace {
-using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata);
-int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) {
-  size_t num_workers =
-    std::min(mindspore::common::ThreadPool::GetInstance().GetSyncRunThreadNum(), static_cast<size_t>(num_task));
-  std::vector<mindspore::common::Task> tasks;
-  size_t thread_index = 0;
-  while (thread_index < num_workers) {
-    auto block = [&, thread_index]() {
-      flambda(thread_index, num_workers, cdata);
-      return mindspore::common::SUCCESS;
-    };
-    tasks.emplace_back(block);
-    thread_index++;
+class AkgParallelLaunch {
+ public:
+  using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata);
+  static size_t num_workers;
+  static int AkgLaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) {
+#pragma omp parallel num_threads(num_workers)
+    { flambda(omp_get_thread_num(), num_workers, cdata); }
+    return 0;
   }
-  mindspore::common::ThreadPool::GetInstance().SyncRun(tasks);
-  return 0;
-}
+  // the GetFunc should be called only once.
+  static void *GetFunc() {
+    const char *omp_num_threads = getenv("OMP_NUM_THREADS");
+    if (omp_num_threads != nullptr) {
+      auto env_thread = std::stoi(std::string(omp_num_threads));
+      if (env_thread > 0) {
+        AkgParallelLaunch::num_workers = static_cast<size_t>(env_thread);
+      }
+    }
+    MS_LOG(INFO) << "AKG threads is : " << AkgParallelLaunch::num_workers;
+    return reinterpret_cast<void *>(&AkgParallelLaunch::AkgLaunchFunc);
+  }
+};
+size_t AkgParallelLaunch::num_workers = 1;
 
 struct AkgCallBack {
   void *parallel_launch_func;
@@ -53,13 +60,13 @@ struct AkgCallBack {
   void (*free_func)(void *);
   AkgCallBack() {
-    parallel_launch_func = reinterpret_cast<void *>(&AkgLaunchFunc);
+    parallel_launch_func = AkgParallelLaunch::GetFunc();
     malloc_func = &malloc;
     free_func = &free;
   }
   ~AkgCallBack() = default;
 };
 }  // namespace
 
 CpuKernelManagerPtr CpuKernelMod::kernelmanager_ = std::make_shared<CpuKernelManager>();
 
 CpuKernelManager::~CpuKernelManager() {
@@ -120,14 +127,16 @@ void *CpuKernelManager::GetFunction(const std::string &kernel_name) {
   return launch_func;
 }
 
+CpuKernelMod::CpuKernelMod(const KernelPackPtr &kp) {
+  auto js = nlohmann::json::parse(kp->GetJson()->contents, kp->GetJson()->contents + kp->GetJson()->len);
+  kernel_name_ = js["kernelName"];
+  launch_func_ = kernelmanager_->GetFunction(kernel_name_);
+}
+
 bool CpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &,
                           const std::vector<AddressPtr> &outputs, void *stream_ptr) {
-  auto js = nlohmann::json::parse(kernel_pack_->GetJson()->contents,
-                                  kernel_pack_->GetJson()->contents + kernel_pack_->GetJson()->len);
-  std::string kernel_name = js["kernelName"];
-  auto launch_func = kernelmanager_->GetFunction(kernel_name);
-  if (launch_func == nullptr) {
-    MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name;
+  if (launch_func_ == nullptr) {
+    MS_LOG(ERROR) << "GetFunction failed. kernel: " << kernel_name_;
     return false;
   }
   std::vector<void *> runtimeargs;
@@ -135,10 +144,10 @@ bool CpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vect
                        [](const AddressPtr &input) -> void * { return input->addr; });
   (void)std::transform(std::begin(outputs), std::end(outputs), std::back_inserter(runtimeargs),
                        [](const AddressPtr &output) -> void * { return output->addr; });
-  AkgCallBack akg_callback;
+  static AkgCallBack akg_callback;
   runtimeargs.emplace_back(reinterpret_cast<void *>(&akg_callback));
   using AkgCpuKernelFunction = void (*)(void *);
-  reinterpret_cast<AkgCpuKernelFunction>(launch_func)(reinterpret_cast<void *>(runtimeargs.data()));
+  reinterpret_cast<AkgCpuKernelFunction>(launch_func_)(reinterpret_cast<void *>(runtimeargs.data()));
   return true;
 }
 }  // namespace kernel
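
Two things change in this file: the ThreadPool fan-out is replaced by a single OpenMP parallel region, and Launch no longer re-parses the kernel JSON on every call (the constructor now caches launch_func_ and kernel_name_, and the AkgCallBack table becomes a function-local static built once). A minimal standalone sketch of the launch contract, with illustrative names only (LaunchFunc and FillSlice are not MindSpore APIs):

#include <omp.h>
#include <cstdio>

using AkgParallelLambda = int (*)(int task_id, int num_task, void *cdata);

// Same shape as AkgParallelLaunch::AkgLaunchFunc: every OpenMP thread calls the
// kernel-side lambda with its own id, and the kernel splits the work by that id.
int LaunchFunc(AkgParallelLambda flambda, void *cdata, int num_task) {
  (void)num_task;             // like the real launcher, parallelism is sized by workers
  const int num_workers = 4;  // the real code derives this from OMP_NUM_THREADS
#pragma omp parallel num_threads(num_workers)
  { flambda(omp_get_thread_num(), num_workers, cdata); }
  return 0;
}

// A toy "kernel": task i fills the i-th element.
int FillSlice(int task_id, int num_task, void *cdata) {
  if (task_id >= num_task) return -1;  // defensive; ids range over [0, num_workers)
  static_cast<int *>(cdata)[task_id] = task_id * task_id;
  return 0;
}

int main() {  // compile with: g++ -fopenmp
  int out[4] = {0, 0, 0, 0};
  LaunchFunc(&FillSlice, out, 4);
  for (int i = 0; i < 4; ++i) std::printf("%d ", out[i]);  // prints: 0 1 4 9
  return 0;
}

Since num_workers defaults to 1 and GetFunc only reads OMP_NUM_THREADS, exporting that variable is what actually enables multi-threaded AKG CPU kernels.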

View File

@@ -43,10 +43,9 @@ class CpuKernelManager {
   mutable std::shared_mutex mutex_;
 };
 using CpuKernelManagerPtr = std::shared_ptr<CpuKernelManager>;
 
 class CpuKernelMod : public KernelMod {
  public:
-  explicit CpuKernelMod(const KernelPackPtr &kp) : kernel_pack_(kp) {}
+  explicit CpuKernelMod(const KernelPackPtr &kp);
   ~CpuKernelMod() = default;
 
   void SetInputSizeList(const std::vector<size_t> &size_list) { input_size_list_ = size_list; }
@@ -61,10 +60,11 @@ class CpuKernelMod : public KernelMod {
   static CpuKernelManagerPtr kernelmanager_;
 
  private:
-  KernelPackPtr kernel_pack_;
   std::vector<size_t> input_size_list_;
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;  // workspace is not used in cpu kernel.
+  void *launch_func_;
+  std::string kernel_name_;
 };
 
 using CpuKernelModPtr = std::shared_ptr<CpuKernelMod>;

View File

@@ -0,0 +1,99 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <vector>
#include "backend/optimizer/graph_kernel/expanders/expander_factory.h"
#include "ir/dtype.h"
namespace mindspore::graphkernel::expanders {
class Adam : public OpExpander {
 public:
  Adam() {
    std::initializer_list<std::string> attrs{"use_nesterov"};
    (void)validators_.emplace_back(std::make_unique<CheckAttr>(attrs));
  }
  ~Adam() = default;

 protected:
  bool CheckInputs() override {
    const auto &var = inputs_info_[0];
    if (var.type != kNumberTypeFloat32 && var.type != kNumberTypeFloat16) {
      MS_LOG(INFO) << "In Adam, var's dtype must be float16 or float32";
      return false;
    }
    return true;
  }

  NodePtrList Expand() override {
    const auto &inputs = gb.Get()->inputs();
    const auto &var = inputs[0];
    const auto &m = inputs[1];
    const auto &v = inputs[2];
    const auto &beta1_power = inputs[3];
    const auto &beta2_power = inputs[4];
    const auto &lr = inputs[5];
    const auto &beta1 = inputs[6];
    const auto &beta2 = inputs[7];
    const auto &epsilon = inputs[8];
    const auto &grad = inputs[9];
    // m_new <- beta1 * m + (1 - beta1) * grad
    auto m_b = gb.Emit("Mul", {beta1, m});
    tensor::TensorPtr data = std::make_shared<tensor::Tensor>(static_cast<double>(1.0), TypeIdToType(var->type));
    auto const_one = gb.Value(data);
    auto m1_beta1 = gb.Emit("Sub", {const_one, beta1});
    auto m_g = gb.Emit("Mul", {m1_beta1, grad});
    auto m_new = gb.Emit("Add", {m_b, m_g});
    // v_new <- beta2 * v + (1 - beta2) * grad * grad
    auto v_b = gb.Emit("Mul", {beta2, v});
    auto m1_beta2 = gb.Emit("Sub", {const_one, beta2});
    auto grad_mul = gb.Emit("Mul", {grad, grad});
    auto v_g = gb.Emit("Mul", {m1_beta2, grad_mul});
    auto v_new = gb.Emit("Add", {v_b, v_g});
    // lr_t <- lr * sqrt(1 - beta2_power) / (1 - beta1_power)
    auto m1_beta2_power = gb.Emit("Sub", {const_one, beta2_power});
    auto m1_beta2_power_sqrt = gb.Emit("Sqrt", {m1_beta2_power});
    auto m1_beta1_power = gb.Emit("Sub", {const_one, beta1_power});
    auto power_div = gb.Emit("RealDiv", {m1_beta2_power_sqrt, m1_beta1_power});
    auto lr_t = gb.Emit("Mul", {lr, power_div});
    // if use_nesterov: var_new <- var - lr_t * (m_new * beta1 + (1 - beta1) * grad) / (epsilon + sqrt(v_new))
    // if not use_nesterov: var_new <- var - lr_t * m_new / (epsilon + sqrt(v_new))
    auto v_new_sqrt = gb.Emit("Sqrt", {v_new});
    auto v_new_sqrt_e = gb.Emit("Add", {epsilon, v_new_sqrt});
    auto lr_t_div = gb.Emit("RealDiv", {lr_t, v_new_sqrt_e});
    mindspore::graphkernel::inner::NodePtr var_sub;
    if (GetValue<bool>(attrs_["use_nesterov"])) {
      auto m_new_mul = gb.Emit("Mul", {m_new, beta1});
      auto m_new_mul_add = gb.Emit("Add", {m_new_mul, m_g});
      var_sub = gb.Emit("Mul", {lr_t_div, m_new_mul_add});
    } else {
      var_sub = gb.Emit("Mul", {lr_t_div, m_new});
    }
    auto var_new = gb.Emit("Sub", {var, var_sub});
    auto var_result = gb.Emit("InplaceAssign", {var, var_new, var_new}, {{"fake_output", MakeValue(true)}});
    auto m_result = gb.Emit("InplaceAssign", {m, m_new, var_result}, {{"fake_output", MakeValue(true)}});
    auto v_result = gb.Emit("InplaceAssign", {v, v_new, m_result}, {{"fake_output", MakeValue(true)}});
    auto result = {var_result, m_result, v_result};
    return result;
  }
};
OP_EXPANDER_REGISTER("Adam", Adam);
}  // namespace mindspore::graphkernel::expanders
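
For reference, the emitted graph is the standard fused Adam step, and the three chained InplaceAssign nodes at the end serialize the writes (var, then m, then v) while exposing all three as outputs. A scalar sketch of the same dataflow (plain C++ floats, no MindSpore types; AdamStep is an illustrative name) makes the math easy to check against the comments above:

#include <cmath>
#include <cstdio>

// Scalar version of the graph emitted by the Adam expander.
void AdamStep(float &var, float &m, float &v, float beta1_power, float beta2_power, float lr, float beta1,
              float beta2, float epsilon, float grad, bool use_nesterov) {
  float m_new = beta1 * m + (1.0f - beta1) * grad;
  float v_new = beta2 * v + (1.0f - beta2) * grad * grad;
  float lr_t = lr * std::sqrt(1.0f - beta2_power) / (1.0f - beta1_power);
  float update = use_nesterov ? m_new * beta1 + (1.0f - beta1) * grad : m_new;
  var -= lr_t * update / (epsilon + std::sqrt(v_new));
  m = m_new;
  v = v_new;
}

int main() {
  float var = 1.0f, m = 0.0f, v = 0.0f;
  AdamStep(var, m, v, 0.9f, 0.999f, 0.001f, 0.9f, 0.999f, 1e-8f, 0.5f, false);
  std::printf("var=%f m=%f v=%f\n", var, m, v);  // var moves against the gradient
  return 0;
}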

View File

@@ -0,0 +1,51 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <vector>
#include "backend/optimizer/graph_kernel/expanders/expander_factory.h"
#include "ir/dtype.h"
namespace mindspore::graphkernel::expanders {
class Softplus : public OpExpander {
 public:
  Softplus() {}
  ~Softplus() = default;

 protected:
  bool CheckInputs() override {
    const auto &input_x = inputs_info_[0];
    if (input_x.type != kNumberTypeFloat32 && input_x.type != kNumberTypeFloat16) {
      MS_LOG(INFO) << "In Softplus, input_x's dtype must be float16 or float32";
      return false;
    }
    return true;
  }

  NodePtrList Expand() override {
    const auto &inputs = gb.Get()->inputs();
    const auto &input_x = inputs[0];
    auto exp_x = gb.Emit("Exp", {input_x});
    tensor::TensorPtr data = std::make_shared<tensor::Tensor>(static_cast<double>(1.0), TypeIdToType(input_x->type));
    auto const_one = gb.Value(data);
    auto exp_x_add_one = gb.Emit("Add", {exp_x, const_one});
    auto result = gb.Emit("Log", {exp_x_add_one});
    return {result};
  }
};
OP_EXPANDER_REGISTER("Softplus", Softplus);
}  // namespace mindspore::graphkernel::expanders
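
The expansion is the textbook softplus, log(1 + exp(x)). One caveat worth noting (not addressed by this patch): exp(x) overflows float32 for x above roughly 88, so the direct form yields inf where a numerically stable formulation stays finite. A sketch of the stable variant for comparison (SoftplusStable is an illustrative name, not part of the patch):

#include <cmath>
#include <cstdio>

// Stable softplus: max(x, 0) + log1p(exp(-|x|)).
// Algebraically equal to log(1 + exp(x)), but exp never sees a large argument.
float SoftplusStable(float x) { return std::fmax(x, 0.0f) + std::log1p(std::exp(-std::fabs(x))); }

int main() {
  std::printf("%f\n", SoftplusStable(100.0f));  // ~100.0; naive log(1 + exp(x)) overflows to inf
  std::printf("%f\n", SoftplusStable(0.0f));    // log(2) = 0.693147
  return 0;
}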

View File

@@ -0,0 +1,53 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <memory>
#include <vector>
#include "backend/optimizer/graph_kernel/expanders/expander_factory.h"
#include "ir/dtype.h"
namespace mindspore::graphkernel::expanders {
class SoftplusGrad : public OpExpander {
 public:
  SoftplusGrad() {}
  ~SoftplusGrad() = default;

 protected:
  bool CheckInputs() override {
    const auto &input_x = inputs_info_[1];
    if (input_x.type != kNumberTypeFloat32 && input_x.type != kNumberTypeFloat16) {
      MS_LOG(INFO) << "In SoftplusGrad, input_x's dtype must be float16 or float32";
      return false;
    }
    return true;
  }

  NodePtrList Expand() override {
    const auto &inputs = gb.Get()->inputs();
    const auto &input_dy = inputs[0];
    const auto &input_x = inputs[1];
    auto exp_x = gb.Emit("Exp", {input_x});
    tensor::TensorPtr data = std::make_shared<tensor::Tensor>(static_cast<double>(1.0), TypeIdToType(input_x->type));
    auto const_one = gb.Value(data);
    auto exp_x_add_one = gb.Emit("Add", {exp_x, const_one});
    auto dy_mul_exp_x = gb.Emit("Mul", {input_dy, exp_x});
    auto result = gb.Emit("RealDiv", {dy_mul_exp_x, exp_x_add_one});
    return {result};
  }
};
OP_EXPANDER_REGISTER("SoftplusGrad", SoftplusGrad);
}  // namespace mindspore::graphkernel::expanders
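
The expansion follows directly from the derivative of softplus:

    d/dx log(1 + exp(x)) = exp(x) / (1 + exp(x)) = sigmoid(x)

so the emitted graph computes dy * exp(x) / (1 + exp(x)). The same large-x overflow caveat as in softplus.cc applies, since exp(x) is materialized directly.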

View File

@@ -27,6 +27,8 @@ class StandardNormal : public OpExpander {
     (void)validators_.emplace_back(std::make_unique<CheckAttr>(attrs));
   }
   ~StandardNormal() {}
+
+ protected:
   NodePtrList Expand() override {
     const auto &inputs = gb.Get()->inputs();
     const auto &input_x = inputs[0];

View File

@@ -100,6 +100,13 @@ std::vector<PrimitivePtr> GraphKernelCluster::GetClusterableOpList() {
     {kGPUDevice, OpLevel_0, prim::kPrimSign},
     {kGPUDevice, OpLevel_0, prim::kPrimSin},
     {kGPUDevice, OpLevel_0, prim::kPrimStridedSlice},
+    // cpu
+    {kCPUDevice, OpLevel_0, prim::kPrimLogicalNot},
+    {kCPUDevice, OpLevel_0, prim::kPrimMod},
+    {kCPUDevice, OpLevel_1, prim::kPrimReduceMax},
+    {kCPUDevice, OpLevel_0, prim::kPrimSelect},
+    {kCPUDevice, OpLevel_0, prim::kPrimLess},
+    {kCPUDevice, OpLevel_0, prim::kPrimLessEqual},
   };
   const auto &flags = GraphKernelFlags::GetInstance();
   return GkUtils::GetValidOps(clusterable_ops_with_level, flags.fusion_ops_level, flags.enable_cluster_ops_only,
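
The new entries mix OpLevel_0 and OpLevel_1. Judging from the GetValidOps call, the registered level acts as a gate against flags.fusion_ops_level; a sketch of that gate (an assumption about GkUtils::GetValidOps internals, not a copy of them):

#include <string>
#include <vector>

struct OpWithLevel {
  std::string device;
  unsigned level;
  std::string prim;
};

// Keep ops registered for this device whose level does not exceed the flag,
// so (presumably) OpLevel_1 entries such as ReduceMax need the level raised.
std::vector<std::string> FilterByLevel(const std::vector<OpWithLevel> &ops, const std::string &device,
                                       unsigned fusion_ops_level) {
  std::vector<std::string> result;
  for (const auto &op : ops) {
    if (op.device == device && op.level <= fusion_ops_level) {
      result.push_back(op.prim);
    }
  }
  return result;
}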

View File

@@ -46,6 +46,7 @@ constexpr size_t kAssignInputIdx = 1;
 constexpr size_t kLambOptimizerInputIdx = 12;
 constexpr size_t kLambWeightInputIdx = 4;
 constexpr size_t kRandomInputIdx = 1;
+constexpr size_t kAdamInputIdx = 10;
 
 std::vector<PrimitivePtr> GetExpandOps() {
   std::vector<OpWithLevel> expand_ops_with_level = {
@@ -94,6 +95,13 @@ std::vector<PrimitivePtr> GetExpandOps() {
     {kGPUDevice, OpLevel_0, prim::kPrimIdentityMath},
     {kGPUDevice, OpLevel_0, prim::kPrimOnesLike},
     {kGPUDevice, OpLevel_0, prim::kPrimStandardNormal},
+    {kCPUDevice, OpLevel_0, prim::kPrimOnesLike},
+    {kCPUDevice, OpLevel_0, prim::kPrimBiasAdd},
+    {kCPUDevice, OpLevel_1, prim::kPrimBiasAddGrad},
+    {kCPUDevice, OpLevel_0, prim::kPrimRelu},
+    {kCPUDevice, OpLevel_1, prim::kPrimMaximumGrad},
+    {kCPUDevice, OpLevel_1, prim::kPrimMinimumGrad},
+    {kCPUDevice, OpLevel_1, prim::kPrimAdam},
   };
   const auto &flags = GraphKernelFlags::GetInstance();
   return GkUtils::GetValidOps(expand_ops_with_level, flags.fusion_ops_level, flags.enable_expand_ops_only,
@@ -199,6 +207,7 @@ ExpanderPtr GraphKernelExpander::GetExpander(const AnfNodePtr &node) {
     {prim::kLambApplyOptimizerAssign, std::make_shared<OpUMonadExpander>(kLambOptimizerInputIdx)},
     {prim::kLambApplyWeightAssign, std::make_shared<OpUMonadExpander>(kLambWeightInputIdx)},
     {prim::kPrimStandardNormal, std::make_shared<OpUMonadExpander>(kRandomInputIdx)},
+    {prim::kPrimAdam, std::make_shared<OpUMonadExpander>(kAdamInputIdx)},
   };
 
   for (auto &e : expanders) {
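
kAdamInputIdx = 10 matches the ten data inputs the Adam expander reads (var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad); the OpUMonadExpander registration presumably strips the trailing monad input beyond that index before expansion, as with the other in-place ops in this map.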

View File

@@ -204,6 +204,8 @@ list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/optimizer/
 list(REMOVE_ITEM MINDSPORE_SRC_LIST
     "../../../mindspore/ccsrc/backend/optimizer/graph_kernel/lite_adapter/callback_impl.cc")
 list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_kernel_compile.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_mod.cc")
+list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/backend/kernel_compiler/akg/cpu/akg_cpu_kernel_build.cc")
 if(ENABLE_SECURITY)
     list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/profiling.cc")
     list(REMOVE_ITEM MINDSPORE_SRC_LIST "../../../mindspore/ccsrc/profiler/device/ascend/memory_profiling.cc")