forked from mindspore-Ecosystem/mindspore
!4205 transpose output set to image2d and add mobilenetv2 GPU testcase
Merge pull request !4205 from chenzupeng/master-lite
This commit is contained in:
commit
ce2e8839a6
|
@ -1,8 +1,9 @@
|
||||||
#define FLT half
|
#define FLT half
|
||||||
#define FLT4 half4
|
#define FLT4 half4
|
||||||
#define READ_IMAGE read_imageh
|
#define READ_IMAGE read_imageh
|
||||||
|
#define WRITE_IMAGE write_imageh
|
||||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||||
__kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) {
|
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
int Y = get_global_id(1);
|
int Y = get_global_id(1);
|
||||||
if (X >= HW.y || Y >= C.y) {
|
if (X >= HW.y || Y >= C.y) {
|
||||||
|
@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat
|
||||||
result[3].z = x2.w;
|
result[3].z = x2.w;
|
||||||
result[3].w = x3.w;
|
result[3].w = x3.w;
|
||||||
|
|
||||||
dst_data[4 * Y * HW.y + X] = result[0];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
|
||||||
dst_data[(4 * Y + 1) * HW.y + X] = result[1];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
|
||||||
dst_data[(4 * Y + 2) * HW.y + X] = result[2];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
|
||||||
dst_data[(4 * Y + 3) * HW.y + X] = result[3];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
#define FLT float
|
#define FLT float
|
||||||
#define FLT4 float4
|
#define FLT4 float4
|
||||||
#define READ_IMAGE read_imagef
|
#define READ_IMAGE read_imagef
|
||||||
|
#define WRITE_IMAGE write_imagef
|
||||||
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
|
||||||
__kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_data, int2 HW, int2 C) {
|
__kernel void transpose(__read_only image2d_t src_data, __write_only image2d_t dst_data, int2 HW, int2 C) {
|
||||||
int X = get_global_id(0);
|
int X = get_global_id(0);
|
||||||
int Y = get_global_id(1);
|
int Y = get_global_id(1);
|
||||||
if (X >= HW.y || Y >= C.y) {
|
if (X >= HW.y || Y >= C.y) {
|
||||||
|
@ -37,8 +38,8 @@ __kernel void transpose(__read_only image2d_t src_data, __global float4 *dst_dat
|
||||||
result[3].z = x2.w;
|
result[3].z = x2.w;
|
||||||
result[3].w = x3.w;
|
result[3].w = x3.w;
|
||||||
|
|
||||||
dst_data[4 * Y * HW.y + X] = result[0];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y), result[0]);
|
||||||
dst_data[(4 * Y + 1) * HW.y + X] = result[1];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 1), result[1]);
|
||||||
dst_data[(4 * Y + 2) * HW.y + X] = result[2];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 2), result[2]);
|
||||||
dst_data[(4 * Y + 3) * HW.y + X] = result[3];
|
WRITE_IMAGE(dst_data, (int2)(X, 4 * Y + 3), result[3]);
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,7 +43,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
|
||||||
size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
|
size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
|
||||||
size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
|
size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
|
||||||
local_size_ = {16, 16};
|
local_size_ = {16, 16};
|
||||||
global_size_ = {H, W};
|
global_size_ = {W, H};
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
|
||||||
|
@ -140,7 +140,7 @@ int ArithmeticOpenCLKernel::Run() {
|
||||||
bias_ = -1 * value;
|
bias_ = -1 * value;
|
||||||
break;
|
break;
|
||||||
case PrimitiveType_Div:
|
case PrimitiveType_Div:
|
||||||
bias_ = 1 / value;
|
weight_ = 1 / value;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
|
MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
|
||||||
|
@ -152,7 +152,7 @@ int ArithmeticOpenCLKernel::Run() {
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
|
runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
|
||||||
int H = outputs_[0]->Batch() * outputs_[0]->Height();
|
int H = outputs_[0]->Batch() * outputs_[0]->Height();
|
||||||
int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
|
int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
|
||||||
cl_int2 output_shape{H, W};
|
cl_int2 output_shape{W, H};
|
||||||
runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
|
||||||
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -67,6 +67,21 @@ int TransposeOpenCLKernel::Init() {
|
||||||
|
|
||||||
int TransposeOpenCLKernel::ReSize() { return 0; }
|
int TransposeOpenCLKernel::ReSize() { return 0; }
|
||||||
|
|
||||||
|
int TransposeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
|
||||||
|
size_t im_dst_x, im_dst_y;
|
||||||
|
im_dst_x = UP_DIV(outputs_[0]->Height() * outputs_[0]->Width(), C4NUM);
|
||||||
|
im_dst_y = outputs_[0]->Channel();
|
||||||
|
#ifdef ENABLE_FP16
|
||||||
|
size_t img_dtype = CL_HALF_FLOAT;
|
||||||
|
#else
|
||||||
|
size_t img_dtype = CL_FLOAT;
|
||||||
|
#endif
|
||||||
|
img_size->clear();
|
||||||
|
std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
|
||||||
|
*img_size = vec;
|
||||||
|
return RET_OK;
|
||||||
|
}
|
||||||
|
|
||||||
int TransposeOpenCLKernel::Run() {
|
int TransposeOpenCLKernel::Run() {
|
||||||
MS_LOG(DEBUG) << this->Name() << " Running!";
|
MS_LOG(DEBUG) << this->Name() << " Running!";
|
||||||
std::vector<int> shapex = inputs_[0]->shape();
|
std::vector<int> shapex = inputs_[0]->shape();
|
||||||
|
|
|
@ -35,7 +35,7 @@ class TransposeOpenCLKernel : public OpenCLKernel {
|
||||||
int Init() override;
|
int Init() override;
|
||||||
int ReSize() override;
|
int ReSize() override;
|
||||||
int Run() override;
|
int Run() override;
|
||||||
|
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
|
||||||
private:
|
private:
|
||||||
cl::Kernel kernel_;
|
cl::Kernel kernel_;
|
||||||
};
|
};
|
||||||
|
|
|
@ -50,6 +50,15 @@ auto status = RunBenchmark(5, argv);
|
||||||
ASSERT_EQ(status, RET_OK);
|
ASSERT_EQ(status, RET_OK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(BenchmarkTest, Test_MV2_GPU) {
|
||||||
|
const char *argv[] = {"./benchmark", "--modelPath=./hiai/mobilenet_v2.ms",
|
||||||
|
"--inDataPath=./hiai/mobilenet_v2_in.bin",
|
||||||
|
"--calibDataPath=./hiai/mobilenet_v2_out.bin",
|
||||||
|
"--device=GPU"};
|
||||||
|
auto status = RunBenchmark(5, argv);
|
||||||
|
ASSERT_EQ(status, RET_OK);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(BenchmarkTest, TestHebing) {
|
TEST_F(BenchmarkTest, TestHebing) {
|
||||||
const char *argv[] = {"./benchmark", "--modelPath=./hiai/model_hebing_3branch.ms",
|
const char *argv[] = {"./benchmark", "--modelPath=./hiai/model_hebing_3branch.ms",
|
||||||
"--inDataPath=./hiai/model_hebing_3branch.bin",
|
"--inDataPath=./hiai/model_hebing_3branch.bin",
|
||||||
|
|
|
@ -66,6 +66,9 @@ TEST_F(TestTransposeOpenCL, TransposeFp32) {
|
||||||
size_n = size_n > 100 ? 100 : size_n;
|
size_n = size_n > 100 ? 100 : size_n;
|
||||||
for (int i = 0; i < size_n; i++) {
|
for (int i = 0; i < size_n; i++) {
|
||||||
std::cout << output_data[i] << " ";
|
std::cout << output_data[i] << " ";
|
||||||
|
if ((i + 1) % c == 0) {
|
||||||
|
std::cout << std::endl;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue