From 977e0be4a0f7b1f66667ac0c905f2f752a5cd4b5 Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Wed, 9 Jan 2013 05:14:33 +0000 Subject: [PATCH] Efficient lowering of vector sdiv when the divisor is a splatted power of two constant. PR 14848. The lowered sequence is based on the existing sequence the target-independent DAG Combiner creates for the scalar case. Patch by Zvi Rackover. llvm-svn: 171953 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +++++++++++++++ llvm/lib/Target/X86/X86ISelLowering.h | 1 + llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll | 72 ++++++++++++++++++++++ 3 files changed, 123 insertions(+) create mode 100644 llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4b00b46e7362..f42884dd2e80 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1047,6 +1047,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v4i32, Custom); } + setOperationAction(ISD::SDIV, MVT::v8i16, Custom); + setOperationAction(ISD::SDIV, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { @@ -1111,6 +1113,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); + setOperationAction(ISD::SDIV, MVT::v16i16, Custom); + setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); @@ -1166,6 +1170,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SHL, MVT::v8i32, Legal); setOperationAction(ISD::SRA, MVT::v8i32, Legal); + + setOperationAction(ISD::SDIV, MVT::v8i32, Custom); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); @@ -11377,6 +11383,49 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); } +SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + EVT EltTy = VT.getVectorElementType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue N0 = Op.getOperand(0); + DebugLoc dl = Op.getDebugLoc(); + + // Lower sdiv X, pow2-const. + BuildVectorSDNode *C = dyn_cast(Op.getOperand(1)); + if (!C) + return SDValue(); + + APInt SplatValue, SplatUndef; + unsigned MinSplatBits; + bool HasAnyUndefs; + if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs)) + return SDValue(); + + if ((SplatValue != 0) && + (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) { + unsigned lg2 = SplatValue.countTrailingZeros(); + // Splat the sign bit. + SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32); + SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG); + // Add (N0 < 0) ? abs2 - 1 : 0; + SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32); + SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG); + SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL); + SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32); + SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG); + + // If we're dividing by a positive value, we're done. Otherwise, we must + // negate the result. + if (SplatValue.isNonNegative()) + return SRA; + + SmallVector V(NumElts, DAG.getConstant(0, EltTy)); + SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts); + return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA); + } + return SDValue(); +} + SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); @@ -12033,6 +12082,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + case ISD::SDIV: return LowerSDIV(Op, DAG); } } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 16ce364cd524..35b5abd791ee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -841,6 +841,7 @@ namespace llvm { SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll b/llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll new file mode 100644 index 000000000000..693463b83740 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec_sdiv_to_shift.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mcpu=penryn -mattr=+avx2 | FileCheck %s + + +define <8 x i16> @sdiv_vec8x16(<8 x i16> %var) { +entry: +; CHECK: sdiv_vec8x16 +; CHECK: psraw $15 +; CHECK: vpsrlw $11 +; CHECK: vpaddw +; CHECK: vpsraw $5 +; CHECK: ret + %0 = sdiv <8 x i16> %var, + ret <8 x i16> %0 +} + +define <4 x i32> @sdiv_zero(<4 x i32> %var) { +entry: +; CHECK: sdiv_zero +; CHECK-NOT sra +; CHECK: ret + %0 = sdiv <4 x i32> %var, + ret <4 x i32> %0 +} + +define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) { +entry: +; CHECK: sdiv_vec4x32 +; CHECK: vpsrad $31 +; CHECK: vpsrld $28 +; CHECK: vpaddd +; CHECK: vpsrad $4 +; CHECK: ret +%0 = sdiv <4 x i32> %var, +ret <4 x i32> %0 +} + +define <4 x i32> @sdiv_negative(<4 x i32> %var) { +entry: +; CHECK: sdiv_negative +; CHECK: vpsrad $31 +; CHECK: vpsrld $28 +; CHECK: vpaddd +; CHECK: vpsrad $4 +; CHECK: vpsubd +; CHECK: ret +%0 = sdiv <4 x i32> %var, +ret <4 x i32> %0 +} + +define <8 x i32> @sdiv8x32(<8 x i32> %var) { +entry: +; CHECK: sdiv8x32 +; CHECK: vpsrad $31 +; CHECK: vpsrld $26 +; CHECK: vpaddd +; CHECK: vpsrad $6 +; CHECK: ret +%0 = sdiv <8 x i32> %var, +ret <8 x i32> %0 +} + +define <16 x i16> @sdiv16x16(<16 x i16> %var) { +entry: +; CHECK: sdiv16x16 +; CHECK: vpsraw $15 +; CHECK: vpsrlw $14 +; CHECK: vpaddw +; CHECK: vpsraw $2 +; CHECK: ret + %a0 = sdiv <16 x i16> %var, + ret <16 x i16> %a0 +}