forked from OSchip/llvm-project
[AArch64] Fold a floating-point multiply by power of two into fp conversion.
Part of http://reviews.llvm.org/D13442 llvm-svn: 249576
This commit is contained in:
parent
0015e5a088
commit
fa30c9b436
|
@ -478,6 +478,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
|
|||
setTargetDAGCombine(ISD::SINT_TO_FP);
|
||||
setTargetDAGCombine(ISD::UINT_TO_FP);
|
||||
|
||||
setTargetDAGCombine(ISD::FP_TO_SINT);
|
||||
setTargetDAGCombine(ISD::FP_TO_UINT);
|
||||
|
||||
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
|
||||
|
||||
setTargetDAGCombine(ISD::ANY_EXTEND);
|
||||
|
@ -7529,6 +7532,70 @@ static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
/// Fold a floating-point multiply by power of two into floating-point to
|
||||
/// fixed-point conversion.
|
||||
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
|
||||
const AArch64Subtarget *Subtarget) {
|
||||
if (!Subtarget->hasNEON())
|
||||
return SDValue();
|
||||
|
||||
SDValue Op = N->getOperand(0);
|
||||
if (!Op.getValueType().isVector() || Op.getOpcode() != ISD::FMUL)
|
||||
return SDValue();
|
||||
|
||||
SDValue ConstVec = Op->getOperand(1);
|
||||
if (!isa<BuildVectorSDNode>(ConstVec))
|
||||
return SDValue();
|
||||
|
||||
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
|
||||
uint32_t FloatBits = FloatTy.getSizeInBits();
|
||||
if (FloatBits != 32 && FloatBits != 64)
|
||||
return SDValue();
|
||||
|
||||
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
|
||||
uint32_t IntBits = IntTy.getSizeInBits();
|
||||
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
|
||||
return SDValue();
|
||||
|
||||
// Avoid conversions where iN is larger than the float (e.g., float -> i64).
|
||||
if (IntBits > FloatBits)
|
||||
return SDValue();
|
||||
|
||||
BitVector UndefElements;
|
||||
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
|
||||
int32_t Bits = IntBits == 64 ? 64 : 32;
|
||||
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
|
||||
if (C == -1 || C == 0 || C > Bits)
|
||||
return SDValue();
|
||||
|
||||
MVT ResTy;
|
||||
unsigned NumLanes = Op.getValueType().getVectorNumElements();
|
||||
switch (NumLanes) {
|
||||
default:
|
||||
return SDValue();
|
||||
case 2:
|
||||
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
|
||||
break;
|
||||
case 4:
|
||||
ResTy = MVT::v4i32;
|
||||
break;
|
||||
}
|
||||
|
||||
SDLoc DL(N);
|
||||
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
|
||||
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
|
||||
: Intrinsic::aarch64_neon_vcvtfp2fxu;
|
||||
SDValue FixConv =
|
||||
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
|
||||
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
|
||||
Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
|
||||
// We can handle smaller integers by generating an extra trunc.
|
||||
if (IntBits < FloatBits)
|
||||
FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
|
||||
|
||||
return FixConv;
|
||||
}
|
||||
|
||||
/// An EXTR instruction is made up of two shifts, ORed together. This helper
|
||||
/// searches for and classifies those shifts.
|
||||
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
|
||||
|
@ -9400,6 +9467,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
|
|||
case ISD::SINT_TO_FP:
|
||||
case ISD::UINT_TO_FP:
|
||||
return performIntToFpCombine(N, DAG, Subtarget);
|
||||
case ISD::FP_TO_SINT:
|
||||
case ISD::FP_TO_UINT:
|
||||
return performFpToIntCombine(N, DAG, Subtarget);
|
||||
case ISD::OR:
|
||||
return performORCombine(N, DCI, Subtarget);
|
||||
case ISD::INTRINSIC_WO_CHAIN:
|
||||
|
|
|
@ -0,0 +1,154 @@
|
|||
; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-neon-syntax=apple -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
; 2 x float scaled by 16.0 (2^4), signed: the multiply is expected to fold
; into the conversion's #4 immediate.
; CHECK-LABEL: test1
; CHECK-NOT: fmul.2s
; CHECK: fcvtzs.2s v0, v0, #4
; CHECK: ret
define <2 x i32> @test1(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
  %conv = fptosi <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; 4 x float scaled by 8.0 (2^3), signed: the multiply is expected to fold
; into the conversion's #3 immediate.
; CHECK-LABEL: test2
; CHECK-NOT: fmul.4s
; CHECK: fcvtzs.4s v0, v0, #3
; CHECK: ret
define <4 x i32> @test2(<4 x float> %f) {
  %scaled = fmul <4 x float> %f, <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
  %conv = fptosi <4 x float> %scaled to <4 x i32>
  ret <4 x i32> %conv
}
|
||||
|
||||
; 2 x double scaled by 32.0 (2^5), signed: the multiply is expected to fold
; into the conversion's #5 immediate.
; CHECK-LABEL: test3
; CHECK-NOT: fmul.2d
; CHECK: fcvtzs.2d v0, v0, #5
; CHECK: ret
define <2 x i64> @test3(<2 x double> %d) {
  %scaled = fmul <2 x double> %d, <double 3.200000e+01, double 3.200000e+01>
  %conv = fptosi <2 x double> %scaled to <2 x i64>
  ret <2 x i64> %conv
}
|
||||
|
||||
; Truncate double to i32: the fold converts at 64 bits with the multiply by
; 16.0 (2^4) absorbed as the #4 immediate, then narrows the result with xtn.
; The checks require the immediate form so that an unfolded fmul + plain
; fcvtzs sequence no longer satisfies them.
; CHECK-LABEL: test4
; CHECK-NOT: fmul.2d
; CHECK: fcvtzs.2d v0, v0, #4
; CHECK: xtn.2s
; CHECK: ret
define <2 x i32> @test4(<2 x double> %d) {
  %mul.i = fmul <2 x double> %d, <double 16.000000e+00, double 16.000000e+00>
  %vcvt.i = fptosi <2 x double> %mul.i to <2 x i32>
  ret <2 x i32> %vcvt.i
}
|
||||
|
||||
; Float to i16: the conversion is still expected at 32 bits with the multiply
; by 16.0 (2^4) folded into the #4 immediate.
; CHECK-LABEL: test5
; CHECK-NOT: fmul.2s
; CHECK: fcvtzs.2s v0, v0, #4
; CHECK: ret
define <2 x i16> @test5(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
  %conv = fptosi <2 x float> %scaled to <2 x i16>
  ret <2 x i16> %conv
}
|
||||
|
||||
; Negative test: float to i64 widens the element, so the multiply must stay
; and the conversion goes through fcvtl first.
; CHECK-LABEL: test6
; CHECK: fmov.2s v1, #16.00000000
; CHECK: fmul.2s v0, v0, v1
; CHECK: fcvtl v0.2d, v0.2s
; CHECK: fcvtzs.2d v0, v0
; CHECK: ret
define <2 x i64> @test6(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
  %conv = fptosi <2 x float> %scaled to <2 x i64>
  ret <2 x i64> %conv
}
|
||||
|
||||
; Unsigned variant of the fold: fptoui of a multiply by 16.0 (2^4) is expected
; to become fcvtzu with a #4 immediate.
; CHECK-LABEL: test7
; CHECK-NOT: fmul.2s
; CHECK: fcvtzu.2s v0, v0, #4
; CHECK: ret
define <2 x i32> @test7(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.600000e+01, float 1.600000e+01>
  %conv = fptoui <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Negative test: 17.0 is not a power of two, so the multiply must remain.
; CHECK-LABEL: test8
; CHECK: fmov.2s v1, #17.00000000
; CHECK: fmul.2s v0, v0, v1
; CHECK: fcvtzu.2s v0, v0
; CHECK: ret
define <2 x i32> @test8(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.700000e+01, float 1.700000e+01>
  %conv = fptoui <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Negative test: the lanes hold different powers of two (16.0 and 8.0), so
; there is no single immediate and the multiply must remain.
; CHECK-LABEL: test9
; CHECK: fmul.2s v0, v0, v1
; CHECK: fcvtzu.2s v0, v0
; CHECK: ret
define <2 x i32> @test9(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 1.600000e+01, float 8.000000e+00>
  %conv = fptoui <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Negative test: a multiplier made only of undef lanes must not be folded.
; CHECK-LABEL: test10
; CHECK: fmul.2s v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CHECK: fcvtzu.2s v{{[0-9]+}}, v{{[0-9]+}}
; CHECK: ret
define <2 x i32> @test10(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float undef, float undef>
  %conv = fptoui <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; An undef lane next to 8.0 (2^3) still counts as a splat, so the fold is
; expected with a #3 immediate.
; CHECK-LABEL: test11
; CHECK: fcvtzu.2s v0, v0, #3
; CHECK: ret
define <2 x i32> @test11(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float undef, float 8.000000e+00>
  %conv = fptoui <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Negative test: a multiply by 0.0 is not a power-of-two scale and must remain.
; CHECK-LABEL: test12
; CHECK: fmul.2s v0, v0, v1
; CHECK: fcvtzs.2s v0, v0
; CHECK: ret
define <2 x i32> @test12(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 0.000000e+00, float 0.000000e+00>
  %conv = fptosi <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Negative test: the multiplier is 2^33, one past the largest exponent the
; fold accepts for a 32-bit result, so the multiply must remain.
; CHECK-LABEL: test13
; CHECK: fmul.2s v0, v0, v1
; CHECK: fcvtzs.2s v0, v0
; CHECK: ret
define <2 x i32> @test13(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 0x4200000000000000, float 0x4200000000000000>
  %conv = fptosi <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
||||
|
||||
; Boundary test: the multiplier is 2^32, the largest exponent the fold accepts
; for a 32-bit result, so it is expected to fold with a #32 immediate.
; CHECK-LABEL: test14
; CHECK: fcvtzs.2s v0, v0, #32
; CHECK: ret
define <2 x i32> @test14(<2 x float> %f) {
  %scaled = fmul <2 x float> %f, <float 0x41F0000000000000, float 0x41F0000000000000>
  %conv = fptosi <2 x float> %scaled to <2 x i32>
  ret <2 x i32> %conv
}
|
Loading…
Reference in New Issue