[RISCV] Match vwmulsu_vx with scalar splat input.

This is a more generic version of D119110 that uses MaskedValueIsZero
to do the matching and SimplifyDemandedBits to remove any unneeded
AND instructions.

Tests were taken from D119110.

Reviewed By: Chenbing.Zheng

Differential Revision: https://reviews.llvm.org/D119622
This commit is contained in:
Craig Topper 2022-02-15 08:45:21 -08:00
parent d132b47bb9
commit ab6e02dded
2 changed files with 263 additions and 6 deletions

View File

@ -7786,12 +7786,15 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
if (ScalarBits < EltBits)
return SDValue();
if (IsSignExt) {
if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
return SDValue();
// If the LHS is a sign extend, try to use vwmul.
if (IsSignExt && DAG.ComputeNumSignBits(Op1) > (ScalarBits - NarrowSize)) {
// Can use vwmul.
} else {
// Otherwise try to use vwmulu or vwmulsu.
APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
if (!DAG.MaskedValueIsZero(Op1, Mask))
if (DAG.MaskedValueIsZero(Op1, Mask))
IsVWMULSU = IsSignExt;
else
return SDValue();
}
@ -8438,6 +8441,16 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return Gather;
break;
}
case RISCVISD::VMV_V_X_VL: {
// VMV.V.X only demands the vector element bitwidth from the scalar input.
unsigned ScalarSize = N->getOperand(0).getValueSizeInBits();
unsigned EltWidth = N->getValueType(0).getScalarSizeInBits();
if (ScalarSize > EltWidth)
if (SimplifyDemandedLowBitsHelper(0, EltWidth))
return SDValue(N, 0);
break;
}
}
return SDValue();

View File

@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
define <2 x i16> @vwmulsu_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
; CHECK-LABEL: vwmulsu_v2i16:
@ -681,3 +681,247 @@ define <16 x i64> @vwmulsu_vx_v16i64(<16 x i32>* %x, i32 %y) {
%f = mul <16 x i64> %d, %e
ret <16 x i64> %f
}
; Splat of an i8 scalar zero-extended to i16, multiplied by a sign-extended
; <8 x i8> vector. The checks expect the unsigned byte load (lbu) to feed
; vwmulsu.vx directly, with no separate extend or splat of the scalar.
define <8 x i16> @vwmulsu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
; CHECK-LABEL: vwmulsu_vx_v8i16_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: lbu a0, 0(a1)
; CHECK-NEXT: vwmulsu.vx v8, v9, a0
; CHECK-NEXT: ret
%a = load <8 x i8>, <8 x i8>* %x
%b = load i8, i8* %y
%c = zext i8 %b to i16
%d = insertelement <8 x i16> poison, i16 %c, i32 0
%e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
%f = sext <8 x i8> %a to <8 x i16>
%g = mul <8 x i16> %e, %f
ret <8 x i16> %g
}
; Signedness swapped relative to vwmulsu_vx_v8i16_i8: the scalar is
; sign-extended and the vector is zero-extended. The checks expect no widening
; multiply here; the vector is widened with vzext.vf2 and multiplied by the
; signed scalar with a plain vmul.vx.
; NOTE(review): presumably because vwmulsu.vx takes the vector operand as
; signed and the scalar as unsigned - confirm against the V spec.
define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(<8 x i8>* %x, i8* %y) {
; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: lb a0, 0(a1)
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu
; CHECK-NEXT: vzext.vf2 v9, v8
; CHECK-NEXT: vmul.vx v8, v9, a0
; CHECK-NEXT: ret
%a = load <8 x i8>, <8 x i8>* %x
%b = load i8, i8* %y
%c = sext i8 %b to i16
%d = insertelement <8 x i16> poison, i16 %c, i32 0
%e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
%f = zext <8 x i8> %a to <8 x i16>
%g = mul <8 x i16> %e, %f
ret <8 x i16> %g
}
; Splat of an i8 scalar zero-extended to i32, multiplied by a sign-extended
; <4 x i16> vector. A zero-extended i8 is also representable as a non-negative
; signed 16-bit value, and the checks expect the signed form vwmul.vx to be
; selected rather than vwmulsu.vx.
define <4 x i32> @vwmulsu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
; CHECK-LABEL: vwmulsu_vx_v4i32_i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: lbu a0, 0(a1)
; CHECK-NEXT: vwmul.vx v8, v9, a0
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %x
%b = load i8, i8* %y
%c = zext i8 %b to i32
%d = insertelement <4 x i32> poison, i32 %c, i32 0
%e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
%f = sext <4 x i16> %a to <4 x i32>
%g = mul <4 x i32> %e, %f
ret <4 x i32> %g
}
; Splat of an i16 scalar zero-extended to i32, multiplied by a sign-extended
; <4 x i16> vector. The scalar uses the full narrow element width, and the
; checks expect an unsigned halfword load (lhu) feeding vwmulsu.vx.
define <4 x i32> @vwmulsu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
; CHECK-LABEL: vwmulsu_vx_v4i32_i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: lhu a0, 0(a1)
; CHECK-NEXT: vwmulsu.vx v8, v9, a0
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %x
%b = load i16, i16* %y
%c = zext i16 %b to i32
%d = insertelement <4 x i32> poison, i32 %c, i32 0
%e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
%f = sext <4 x i16> %a to <4 x i32>
%g = mul <4 x i32> %e, %f
ret <4 x i32> %g
}
; Splat of an i8 scalar zero-extended to i64, multiplied by a sign-extended
; <2 x i32> vector. Expected output differs per target:
;  - On RV32 no widening multiply is expected. The 64-bit scalar is written to
;    the stack as two words (high word zero), splatted with a zero-stride
;    vlse64.v, and multiplied with vmul.vv after vsext.vf2 widens the vector.
;  - On RV64 the scalar is loaded with lbu and the signed vwmul.vx is expected.
define <2 x i64> @vwmulsu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
; RV32-LABEL: vwmulsu_vx_v2i64_i8:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV32-NEXT: lbu a1, 0(a1)
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV32-NEXT: vsext.vf2 v10, v8
; RV32-NEXT: vmul.vv v8, v9, v10
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vwmulsu_vx_v2i64_i8:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV64-NEXT: vle32.v v9, (a0)
; RV64-NEXT: lbu a0, 0(a1)
; RV64-NEXT: vwmul.vx v8, v9, a0
; RV64-NEXT: ret
%a = load <2 x i32>, <2 x i32>* %x
%b = load i8, i8* %y
%c = zext i8 %b to i64
%d = insertelement <2 x i64> poison, i64 %c, i64 0
%e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
%f = sext <2 x i32> %a to <2 x i64>
%g = mul <2 x i64> %e, %f
ret <2 x i64> %g
}
; Splat of an i16 scalar zero-extended to i64, multiplied by a sign-extended
; <2 x i32> vector. Expected output differs per target:
;  - On RV32 no widening multiply is expected. The 64-bit scalar is written to
;    the stack as two words (high word zero), splatted with a zero-stride
;    vlse64.v, and multiplied with vmul.vv after vsext.vf2 widens the vector.
;  - On RV64 the scalar is loaded with lhu and the signed vwmul.vx is expected.
define <2 x i64> @vwmulsu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
; RV32-LABEL: vwmulsu_vx_v2i64_i16:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV32-NEXT: lhu a1, 0(a1)
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV32-NEXT: vsext.vf2 v10, v8
; RV32-NEXT: vmul.vv v8, v9, v10
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vwmulsu_vx_v2i64_i16:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV64-NEXT: vle32.v v9, (a0)
; RV64-NEXT: lhu a0, 0(a1)
; RV64-NEXT: vwmul.vx v8, v9, a0
; RV64-NEXT: ret
%a = load <2 x i32>, <2 x i32>* %x
%b = load i16, i16* %y
%c = zext i16 %b to i64
%d = insertelement <2 x i64> poison, i64 %c, i64 0
%e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
%f = sext <2 x i32> %a to <2 x i64>
%g = mul <2 x i64> %e, %f
ret <2 x i64> %g
}
; Splat of an i32 scalar zero-extended to i64, multiplied by a sign-extended
; <2 x i32> vector. The scalar uses the full narrow element width.
; Expected output differs per target:
;  - On RV32 no widening multiply is expected. The 64-bit scalar is written to
;    the stack as two words (high word zero), splatted with a zero-stride
;    vlse64.v, and multiplied with vmul.vv after vsext.vf2 widens the vector.
;  - On RV64 the scalar is loaded with lwu and vwmulsu.vx is expected.
define <2 x i64> @vwmulsu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
; RV32-LABEL: vwmulsu_vx_v2i64_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: .cfi_def_cfa_offset 16
; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV32-NEXT: lw a1, 0(a1)
; RV32-NEXT: vle32.v v8, (a0)
; RV32-NEXT: sw zero, 12(sp)
; RV32-NEXT: sw a1, 8(sp)
; RV32-NEXT: addi a0, sp, 8
; RV32-NEXT: vlse64.v v9, (a0), zero
; RV32-NEXT: vsetvli zero, zero, e64, m1, ta, mu
; RV32-NEXT: vsext.vf2 v10, v8
; RV32-NEXT: vmul.vv v8, v9, v10
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
;
; RV64-LABEL: vwmulsu_vx_v2i64_i32:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
; RV64-NEXT: vle32.v v9, (a0)
; RV64-NEXT: lwu a0, 0(a1)
; RV64-NEXT: vwmulsu.vx v8, v9, a0
; RV64-NEXT: ret
%a = load <2 x i32>, <2 x i32>* %x
%b = load i32, i32* %y
%c = zext i32 %b to i64
%d = insertelement <2 x i64> poison, i64 %c, i64 0
%e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
%f = sext <2 x i32> %a to <2 x i64>
%g = mul <2 x i64> %e, %f
ret <2 x i64> %g
}
; The i16 scalar argument is masked to its low 8 bits (and 255) before being
; splatted and multiplied by a sign-extended <8 x i8> vector. The checks expect
; vwmulsu.vx to consume the incoming register a1 directly, with no andi emitted
; for the mask.
define <8 x i16> @vwmulsu_vx_v8i16_i8_and(<8 x i8>* %x, i16 %y) {
; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vwmulsu.vx v8, v9, a1
; CHECK-NEXT: ret
%a = load <8 x i8>, <8 x i8>* %x
%b = and i16 %y, 255
%c = insertelement <8 x i16> poison, i16 %b, i32 0
%d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
%e = sext <8 x i8> %a to <8 x i16>
%f = mul <8 x i16> %d, %e
ret <8 x i16> %f
}
; Variant of vwmulsu_vx_v8i16_i8_and with mask 254 (0xFE). The mask still
; clears the upper 8 bits, so vwmulsu.vx is expected, but it also clears bit 0
; of the low byte, so the checks expect the andi to be kept ahead of the
; multiply.
define <8 x i16> @vwmulsu_vx_v8i16_i8_and1(<8 x i8>* %x, i16 %y) {
; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: andi a0, a1, 254
; CHECK-NEXT: vwmulsu.vx v8, v9, a0
; CHECK-NEXT: ret
%a = load <8 x i8>, <8 x i8>* %x
%b = and i16 %y, 254
%c = insertelement <8 x i16> poison, i16 %b, i32 0
%d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
%e = sext <8 x i8> %a to <8 x i16>
%f = mul <8 x i16> %d, %e
ret <8 x i16> %f
}
; The i32 scalar argument is masked to its low 16 bits (and 65535) before being
; splatted and multiplied by a sign-extended <4 x i16> vector. The checks expect
; vwmulsu.vx to consume the incoming register a1 directly, with no instruction
; emitted for the mask.
define <4 x i32> @vwmulsu_vx_v4i32_i16_and(<4 x i16>* %x, i32 %y) {
; CHECK-LABEL: vwmulsu_vx_v4i32_i16_and:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vwmulsu.vx v8, v9, a1
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %x
%b = and i32 %y, 65535
%c = insertelement <4 x i32> poison, i32 %b, i32 0
%d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
%e = sext <4 x i16> %a to <4 x i32>
%f = mul <4 x i32> %d, %e
ret <4 x i32> %f
}
; The i16 scalar argument is zero-extended to i32 in IR before being splatted
; and multiplied by a sign-extended <4 x i16> vector. The checks expect
; vwmulsu.vx to consume the incoming register a1 directly, with no separate
; zero-extension instruction emitted.
define <4 x i32> @vwmulsu_vx_v4i32_i16_zext(<4 x i16>* %x, i16 %y) {
; CHECK-LABEL: vwmulsu_vx_v4i32_i16_zext:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu
; CHECK-NEXT: vle16.v v9, (a0)
; CHECK-NEXT: vwmulsu.vx v8, v9, a1
; CHECK-NEXT: ret
%a = load <4 x i16>, <4 x i16>* %x
%b = zext i16 %y to i32
%c = insertelement <4 x i32> poison, i32 %b, i32 0
%d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
%e = sext <4 x i16> %a to <4 x i32>
%f = mul <4 x i32> %d, %e
ret <4 x i32> %f
}