[InstCombine][X86] Add MULDQ/MULUDQ undef handling

llvm-svn: 292627
This commit is contained in:
Simon Pilgrim 2017-01-20 18:20:30 +00:00
parent f5677329a6
commit a50a93fcd0
3 changed files with 81 additions and 12 deletions

View File

@ -510,6 +510,18 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
return Builder.CreateAShr(Vec, ShiftVec);
}
/// Try to fold a call to one of the X86 PMULDQ/PMULUDQ intrinsics.
///
/// \param II the intrinsic call; its first two argument operands are the
///           multiplicands.
/// \returns an undef value of the call's result type when *both*
///          multiplicands are undef, or nullptr when no fold applies.
static Value *simplifyX86muldq(const IntrinsicInst &II) {
  Value *LHS = II.getArgOperand(0);
  Value *RHS = II.getArgOperand(1);

  // A single undef operand is deliberately not enough: the fold is only
  // performed when neither multiplicand carries a defined value.
  const bool BothUndef = isa<UndefValue>(LHS) && isa<UndefValue>(RHS);
  if (!BothUndef)
    return nullptr;

  // muldq/muludq(undef, undef) -> undef
  return UndefValue::get(II.getType());
}
static Value *simplifyX86movmsk(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
Value *Arg = II.getArgOperand(0);
@ -2142,6 +2154,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_avx2_pmulu_dq:
case Intrinsic::x86_avx512_pmul_dq_512:
case Intrinsic::x86_avx512_pmulu_dq_512: {
if (Value *V = simplifyX86muldq(*II))
return replaceInstUsesWith(*II, V);
unsigned VWidth = II->getType()->getVectorNumElements();
APInt UndefElts(VWidth, 0);
APInt DemandedElts = APInt::getAllOnesValue(VWidth);

View File

@ -1469,6 +1469,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Depth + 1);
if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
// Output elements are undefined if both are undefined. Consider things
// like undef*0. The result is known zero, not undef.
for (unsigned i = 0; i != VWidth; ++i)
if (UndefElts2[i * 2] && UndefElts3[i * 2])
UndefElts.setBit(i);
break;
}

View File

@ -7,8 +7,7 @@
define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_128(
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
; CHECK-NEXT: ret <2 x i64> undef
;
%1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
ret <2 x i64> %1
@ -16,8 +15,7 @@ define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_256(
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
; CHECK-NEXT: ret <4 x i64> undef
;
%1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
ret <4 x i64> %1
@ -25,8 +23,7 @@ define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_512(
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
; CHECK-NEXT: ret <8 x i64> [[TMP1]]
; CHECK-NEXT: ret <8 x i64> undef
;
%1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
ret <8 x i64> %1
@ -34,8 +31,7 @@ define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_128(
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
; CHECK-NEXT: ret <2 x i64> undef
;
%1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
ret <2 x i64> %1
@ -43,8 +39,7 @@ define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_256(
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
; CHECK-NEXT: ret <4 x i64> undef
;
%1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
ret <4 x i64> %1
@ -52,13 +47,66 @@ define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_512(
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
; CHECK-NEXT: ret <8 x i64> [[TMP1]]
; CHECK-NEXT: ret <8 x i64> undef
;
%1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
ret <8 x i64> %1
}
; PMULUDQ (SSE2, 128-bit) with an undef first operand and an all-zero second
; operand. The expected output keeps the call (it is not folded to undef);
; only the odd i32 elements of the zero vector are expected to become undef.
define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_128(
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>)
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
;
%1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
ret <2 x i64> %1
}
; PMULUDQ (AVX2, 256-bit) with an all-zero first operand and an undef second
; operand. The expected output keeps the call (it is not folded to undef);
; only the odd i32 elements of the zero vector are expected to become undef.
define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_256(
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <8 x i32> undef)
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
;
%1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
ret <4 x i64> %1
}
; PMULUDQ (AVX-512, 512-bit) with an undef first operand and an all-zero
; second operand. The expected output keeps the call (it is not folded to
; undef); only the odd i32 elements of the zero vector are expected to become
; undef.
define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_512(
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
; CHECK-NEXT: ret <8 x i64> [[TMP1]]
;
%1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
ret <8 x i64> %1
}
; PMULDQ (SSE4.1, 128-bit) with an all-zero first operand and an undef second
; operand. The expected output keeps the call (it is not folded to undef);
; only the odd i32 elements of the zero vector are expected to become undef.
define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_128(
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 0, i32 undef, i32 0, i32 undef>, <4 x i32> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
;
%1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
ret <2 x i64> %1
}
; PMULDQ (AVX2, 256-bit) with an undef first operand and an all-zero second
; operand. The expected output keeps the call (it is not folded to undef);
; only the odd i32 elements of the zero vector are expected to become undef.
define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_256(
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>)
; CHECK-NEXT: ret <4 x i64> [[TMP1]]
;
%1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
ret <4 x i64> %1
}
; PMULDQ (AVX-512, 512-bit) with an all-zero first operand and an undef second
; operand. The expected output keeps the call (it is not folded to undef);
; only the odd i32 elements of the zero vector are expected to become undef.
define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_512(
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <16 x i32> undef)
; CHECK-NEXT: ret <8 x i64> [[TMP1]]
;
%1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
ret <8 x i64> %1
}
;
; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
;