forked from OSchip/llvm-project
X86: If SSE4.1 is missing, lower SMUL_LOHI of v4i32 to pmuludq and fix up the high parts.
This is more expensive than pmuldq but still cheaper than scalarizing the whole thing. llvm-svn: 207370
This commit is contained in:
parent
d34db65c84
commit
3693e77cb4
|
@ -941,6 +941,7 @@ void X86TargetLowering::resetOperationActions() {
|
|||
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
|
||||
setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
|
||||
setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
|
||||
setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
|
||||
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
|
||||
|
@ -1062,7 +1063,6 @@ void X86TargetLowering::resetOperationActions() {
|
|||
|
||||
// FIXME: Do we need to handle scalar-to-vector here?
|
||||
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
|
||||
setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
|
||||
|
||||
setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
|
||||
setOperationAction(ISD::VSELECT, MVT::v2i64, Legal);
|
||||
|
@ -13166,8 +13166,9 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
|
|||
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
|
||||
// ints.
|
||||
MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
|
||||
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
|
||||
unsigned Opcode =
|
||||
Op->getOpcode() == ISD::UMUL_LOHI ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
|
||||
(!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
|
||||
SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
|
||||
DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
|
||||
SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
|
||||
|
@ -13179,6 +13180,20 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
|
|||
const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
|
||||
SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
|
||||
|
||||
// If we have a signed multiply but no PMULDQ, fix up the high parts of an
|
||||
// unsigned multiply.
|
||||
if (IsSigned && !Subtarget->hasSSE41()) {
|
||||
SDValue ShAmt =
|
||||
DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
|
||||
SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
|
||||
SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
|
||||
DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
|
||||
|
||||
SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
|
||||
Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
|
||||
}
|
||||
|
||||
return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s -check-prefix=SSE
|
||||
; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
|
||||
; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
|
||||
; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
|
||||
|
||||
define <4 x i32> @test1(<4 x i32> %a) {
|
||||
%div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
|
||||
ret <4 x i32> %div
|
||||
|
||||
; SSE-LABEL: test1:
|
||||
; SSE: pmuludq
|
||||
; SSE: pshufd $57
|
||||
; SSE: pmuludq
|
||||
; SSE: shufps $-35
|
||||
; SSE: psubd
|
||||
; SSE: psrld $1
|
||||
; SSE: padd
|
||||
; SSE: psrld $2
|
||||
; SSE41-LABEL: test1:
|
||||
; SSE41: pmuludq
|
||||
; SSE41: pshufd $57
|
||||
; SSE41: pmuludq
|
||||
; SSE41: shufps $-35
|
||||
; SSE41: psubd
|
||||
; SSE41: psrld $1
|
||||
; SSE41: padd
|
||||
; SSE41: psrld $2
|
||||
|
||||
; AVX-LABEL: test1:
|
||||
; AVX: vpmuludq
|
||||
|
@ -46,12 +47,12 @@ define <8 x i16> @test3(<8 x i16> %a) {
|
|||
%div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
||||
ret <8 x i16> %div
|
||||
|
||||
; SSE-LABEL: test3:
|
||||
; SSE: pmulhuw
|
||||
; SSE: psubw
|
||||
; SSE: psrlw $1
|
||||
; SSE: paddw
|
||||
; SSE: psrlw $2
|
||||
; SSE41-LABEL: test3:
|
||||
; SSE41: pmulhuw
|
||||
; SSE41: psubw
|
||||
; SSE41: psrlw $1
|
||||
; SSE41: paddw
|
||||
; SSE41: psrlw $2
|
||||
|
||||
; AVX-LABEL: test3:
|
||||
; AVX: vpmulhuw
|
||||
|
@ -78,11 +79,11 @@ define <8 x i16> @test5(<8 x i16> %a) {
|
|||
%div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
|
||||
ret <8 x i16> %div
|
||||
|
||||
; SSE-LABEL: test5:
|
||||
; SSE: pmulhw
|
||||
; SSE: psrlw $15
|
||||
; SSE: psraw $1
|
||||
; SSE: paddw
|
||||
; SSE41-LABEL: test5:
|
||||
; SSE41: pmulhw
|
||||
; SSE41: psrlw $15
|
||||
; SSE41: psraw $1
|
||||
; SSE41: paddw
|
||||
|
||||
; AVX-LABEL: test5:
|
||||
; AVX: vpmulhw
|
||||
|
@ -112,13 +113,29 @@ define <4 x i32> @test8(<4 x i32> %a) {
|
|||
%div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
|
||||
ret <4 x i32> %div
|
||||
|
||||
; SSE41-LABEL: test8:
|
||||
; SSE41: pmuldq
|
||||
; SSE41: pshufd $57
|
||||
; SSE41-NOT: pshufd $57
|
||||
; SSE41: pmuldq
|
||||
; SSE41: shufps $-35
|
||||
; SSE41: pshufd $-40
|
||||
; SSE41: padd
|
||||
; SSE41: psrld $31
|
||||
; SSE41: psrad $2
|
||||
; SSE41: padd
|
||||
|
||||
; SSE-LABEL: test8:
|
||||
; SSE: pmuldq
|
||||
; SSE: psrad $31
|
||||
; SSE: pand
|
||||
; SSE: paddd
|
||||
; SSE: pmuludq
|
||||
; SSE: pshufd $57
|
||||
; SSE-NOT: pshufd $57
|
||||
; SSE: pmuldq
|
||||
; SSE: pmuludq
|
||||
; SSE: shufps $-35
|
||||
; SSE: pshufd $-40
|
||||
; SSE: psubd
|
||||
; SSE: padd
|
||||
; SSE: psrld $31
|
||||
; SSE: psrad $2
|
||||
|
|
Loading…
Reference in New Issue