diff --git a/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h b/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h
index e2c0a965102..e1e3b92620d 100644
--- a/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h
+++ b/mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h
@@ -67,7 +67,7 @@ class MatMulGpuKernel : public GpuKernel {
     CHECK_CUBLAS_RET_WITH_EXCEPT(
       cublasGemmStridedBatchedEx(handle_, transpose_x2_, transpose_x1_, SizeToInt(n_), SizeToInt(m_), SizeToInt(k_),
                                  &alpha, input2_addr, dtype_b_, ldb, stride_b, input1_addr, dtype_a_, lda, stride_a,
-                                 &beta, output_addr, dtype_c_, ldc, stride_c, batch_, dtype_c_, algo_),
+                                 &beta, output_addr, dtype_c_, ldc, stride_c, batch_, CUDA_R_32F, algo_),
       "cublasSgemm Call Fail");
     return true;
   }
diff --git a/tests/st/ops/gpu/test_batch_matmul.py b/tests/st/ops/gpu/test_batch_matmul.py
index 4e357095c57..ebf7bb397ba 100644
--- a/tests/st/ops/gpu/test_batch_matmul.py
+++ b/tests/st/ops/gpu/test_batch_matmul.py
@@ -60,7 +60,7 @@ def test_4D():
 def test_4D_transpose_a():
     input_x = Tensor(np.arange(2*4*3*1).reshape(2,4,3,1), mstype.float32)
     input_y = Tensor(np.arange(2*4*3*4).reshape(2,4,3,4), mstype.float32)
-    
+
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
     net = BatchMatMulNet(transpose_a=True)
     output = net(input_x, input_y)
@@ -82,7 +82,7 @@ def test_4D_transpose_a():
 def test_4D_transpose_b():
     input_x = Tensor(np.arange(2*4*1*3).reshape(2,4,1,3), mstype.float32)
     input_y = Tensor(np.arange(2*4*4*3).reshape(2,4,4,3), mstype.float32)
-    
+
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
     net = BatchMatMulNet(transpose_b=True)
     output = net(input_x, input_y)
@@ -104,7 +104,7 @@ def test_4D_transpose_b():
 def test_4D_transpose_ab():
     input_x = Tensor(np.arange(2*4*3*1).reshape(2,4,3,1), mstype.float32)
     input_y = Tensor(np.arange(2*4*4*3).reshape(2,4,4,3), mstype.float32)
-    
+
     context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
     net = BatchMatMulNet(transpose_a=True, transpose_b=True)
     output = net(input_x, input_y)
@@ -118,3 +118,29 @@ def test_4D_transpose_ab():
                [[4163, 4334, 4505, 4676]],
                [[5612, 5810, 6008, 6206]]]]
     assert (output.asnumpy() == expect).all()
+
+class BatchMatMulNet(nn.Cell):
+    def __init__(self, transpose_a=False, transpose_b=False):
+        super(BatchMatMulNet, self).__init__()
+        self.batch_matmul = P.BatchMatMul(transpose_a, transpose_b)
+
+    def construct(self, x, y):
+        return self.batch_matmul(x, y)
+
+def test_4D_fp16():
+    input_x = Tensor(np.arange(2 * 4 * 1 * 3).reshape(2, 4, 1, 3), mstype.float16)
+    input_y = Tensor(np.arange(2 * 4 * 3 * 4).reshape(2, 4, 3, 4), mstype.float16)
+
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    net = BatchMatMulNet()
+    output = net(input_x, input_y)
+    expect = [[[[ 20, 23, 26, 29]],
+               [[ 200, 212, 224, 236]],
+               [[ 596, 617, 638, 659]],
+               [[1208, 1238, 1268, 1298]]],
+
+              [[[2036, 2075, 2114, 2153]],
+               [[3080, 3128, 3176, 3224]],
+               [[4340, 4397, 4454, 4511]],
+               [[5816, 5882, 5948, 6014]]]]
+    assert (output.asnumpy() == expect).all()
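A note on the one-line kernel change: in cublasGemmStridedBatchedEx, the computeType argument controls both the accumulation precision and the type cuBLAS expects for the alpha/beta scaling factors. Reusing dtype_c_ there was benign in the fp32 path, where dtype_c_ is already CUDA_R_32F, but with the new fp16 inputs it would request half-precision compute while the kernel (judging from the float literals implied by &alpha/&beta) still passes float scaling factors. Pinning the compute type to CUDA_R_32F keeps fp32 accumulation for both dtypes. The sketch below is a hypothetical standalone reproduction of the same call pattern, not the MindSpore kernel itself: the shapes mirror the first fp16 test case, and the CUBLAS_GEMM_DEFAULT algorithm and the pre-cuBLAS-11 signature (cuBLAS 11 renames the enum to CUBLAS_COMPUTE_32F) are assumptions.

```cpp
// Hypothetical standalone sketch, not the MindSpore kernel: an fp16
// strided-batched GEMM that accumulates in fp32, mirroring the computeType fix.
#include <cassert>
#include <cstdio>
#include <vector>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

int main() {
  // Same shapes as the first fp16 test case: per batch, A is 1x3 and B is 3x4.
  const int batch = 2, m = 1, n = 4, k = 3;
  std::vector<__half> ha(batch * m * k), hb(batch * k * n);
  for (size_t i = 0; i < ha.size(); ++i) ha[i] = __float2half(float(i));
  for (size_t i = 0; i < hb.size(); ++i) hb[i] = __float2half(float(i));

  __half *da, *db, *dc;
  cudaMalloc(&da, ha.size() * sizeof(__half));
  cudaMalloc(&db, hb.size() * sizeof(__half));
  cudaMalloc(&dc, batch * m * n * sizeof(__half));
  cudaMemcpy(da, ha.data(), ha.size() * sizeof(__half), cudaMemcpyHostToDevice);
  cudaMemcpy(db, hb.data(), hb.size() * sizeof(__half), cudaMemcpyHostToDevice);

  cublasHandle_t handle;
  cublasCreate(&handle);

  // With computeType = CUDA_R_32F, cuBLAS accumulates in fp32 and expects
  // float alpha/beta even though A/B/C are CUDA_R_16F. Passing CUDA_R_16F
  // here would instead require __half scaling factors.
  const float alpha = 1.0f, beta = 0.0f;

  // Row-major C = A*B expressed through column-major cuBLAS by computing
  // C^T = B^T * A^T, the same trick the kernel uses (input2 before input1).
  cublasStatus_t ret = cublasGemmStridedBatchedEx(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
      db, CUDA_R_16F, n, k * n,  // B: leading dim n, k*n elements per batch
      da, CUDA_R_16F, k, m * k,  // A: leading dim k, m*k elements per batch
      &beta, dc, CUDA_R_16F, n, m * n, batch, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
  assert(ret == CUBLAS_STATUS_SUCCESS);

  std::vector<__half> hc(batch * m * n);
  cudaMemcpy(hc.data(), dc, hc.size() * sizeof(__half), cudaMemcpyDeviceToHost);
  // First batch of the arange inputs should give [20, 23, 26, 29].
  printf("%g %g %g %g\n", __half2float(hc[0]), __half2float(hc[1]),
         __half2float(hc[2]), __half2float(hc[3]));

  cublasDestroy(handle);
  cudaFree(da);
  cudaFree(db);
  cudaFree(dc);
  return 0;
}
```

The fp32 accumulation is also what makes test_4D_fp16 viable: the dot products of the integer-valued arange inputs stay exact through the accumulation, with rounding occurring only in the single final cast of each result back to fp16 on output.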