From 3f8acfc3c40e8dad68a3deb13a9fbb31f1b82b99 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Mon, 23 Apr 2012 21:53:37 +0000 Subject: [PATCH] Optimize the vector UINT_TO_FP, SINT_TO_FP and FP_TO_SINT operations where the integer type is i8 (commonly used in graphics). llvm-svn: 155397 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 56 +++++++++++++++++++++++++ llvm/test/CodeGen/X86/vec_cast2.ll | 49 ++++++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 llvm/test/CodeGen/X86/vec_cast2.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b9dba7bdf31b..87c480516839 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1221,7 +1221,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); + setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); + setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); if (Subtarget->hasBMI()) @@ -14985,9 +14987,43 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, + const X86TargetLowering *XTLI) { + SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + if (!InVT.isSimple()) + return SDValue(); + + // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) + MVT SrcVT = InVT.getSimpleVT(); + if (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = (SrcVT.getVectorNumElements() == 4 ? MVT::v4i32 : MVT::v8i32); + SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); + // Notice that we use SINT_TO_FP because we know that the high bits + // are zero and SINT_TO_FP is better supported by the hardware. 
+ return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + + return SDValue(); +} + static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); + EVT InVT = Op0->getValueType(0); + if (!InVT.isSimple()) + return SDValue(); + + // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32)) + MVT SrcVT = InVT.getSimpleVT(); + if (SrcVT == MVT::v8i8 || SrcVT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = (SrcVT.getVectorNumElements() == 4 ? MVT::v4i32 : MVT::v8i32); + SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); + return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); + } + // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have // a 32-bit target where SSE doesn't support i64->FP operations. if (Op0.getOpcode() == ISD::LOAD) { @@ -15006,6 +15042,24 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG, + const X86TargetLowering *XTLI) { + EVT InVT = N->getValueType(0); + if (!InVT.isSimple()) + return SDValue(); + + // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE(v4i32 = FP_TO_SINT()) + MVT VT = InVT.getSimpleVT(); + if (VT == MVT::v8i8 || VT == MVT::v4i8) { + DebugLoc dl = N->getDebugLoc(); + MVT DstVT = (VT.getVectorNumElements() == 4 ? 
MVT::v4i32 : MVT::v8i32); + SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, I); + } + + return SDValue(); +} + // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -15142,7 +15196,9 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); + case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG, this); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); + case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG, this); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: diff --git a/llvm/test/CodeGen/X86/vec_cast2.ll b/llvm/test/CodeGen/X86/vec_cast2.ll new file mode 100644 index 000000000000..08eb16f6313b --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_cast2.ll @@ -0,0 +1,49 @@ +; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +;CHECK: foo1_8 +;CHECK: vcvtdq2ps +;CHECK: ret +define <8 x float> @foo1_8(<8 x i8> %src) { + %res = sitofp <8 x i8> %src to <8 x float> + ret <8 x float> %res +} + +;CHECK: foo1_4 +;CHECK: vcvtdq2ps +;CHECK: ret +define <4 x float> @foo1_4(<4 x i8> %src) { + %res = sitofp <4 x i8> %src to <4 x float> + ret <4 x float> %res +} + +;CHECK: foo2_8 +;CHECK: vcvtdq2ps +;CHECK: ret +define <8 x float> @foo2_8(<8 x i8> %src) { + %res = uitofp <8 x i8> %src to <8 x float> + ret <8 x float> %res +} + +;CHECK: foo2_4 +;CHECK: vcvtdq2ps +;CHECK: ret +define <4 x float> @foo2_4(<4 x i8> %src) { + %res = uitofp <4 x i8> %src to <4 x float> + ret <4 x float> %res +} + +;CHECK: foo3_8 +;CHECK: vcvttps2dq +;CHECK: ret +define <8 x i8> 
@foo3_8(<8 x float> %src) { + %res = fptosi <8 x float> %src to <8 x i8> + ret <8 x i8> %res +} +;CHECK: foo3_4 +;CHECK: vcvttps2dq +;CHECK: ret +define <4 x i8> @foo3_4(<4 x float> %src) { + %res = fptosi <4 x float> %src to <4 x i8> + ret <4 x i8> %res +} +