[InstCombine][X86] Add DemandedElts support for PMULDQ/PMULUDQ instructions

The PMULDQ/PMULUDQ vXi64 instructions only read the even-numbered elements of their v2Xi32 inputs, so SimplifyDemandedVectorElts should take advantage of this and avoid demanding the odd-numbered input elements.
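
As a minimal standalone illustration of that lane behaviour (plain C++, not part of the patch; the helper name pmuludq128 is made up for this sketch), only the even 32-bit lanes of each operand feed the 64-bit result lanes, which is what lets the demanded-elements analysis drop demand for the odd lanes. PMULDQ behaves the same way, but with sign extension instead of zero extension.

// Reference sketch of the PMULUDQ lane semantics (unsigned variant).
// Result lane i reads only the even-numbered 32-bit input lanes 2*i;
// the odd-numbered lanes never influence the result.
#include <cstdint>
#include <cstdio>

static void pmuludq128(const uint32_t A[4], const uint32_t B[4], uint64_t Res[2]) {
  for (unsigned i = 0; i != 2; ++i)
    Res[i] = (uint64_t)A[2 * i] * (uint64_t)B[2 * i]; // lanes 1 and 3 are ignored
}

int main() {
  uint32_t A[4] = {3, 0xDEAD, 5, 0xBEEF}; // odd lanes are arbitrary junk
  uint32_t B[4] = {7, 0xDEAD, 11, 0xBEEF};
  uint64_t R[2];
  pmuludq128(A, B, R);
  printf("%llu %llu\n", (unsigned long long)R[0], (unsigned long long)R[1]); // prints: 21 55
  return 0;
}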

Differential Revision: https://reviews.llvm.org/D28119

llvm-svn: 290554
Simon Pilgrim 2016-12-26 23:28:17 +00:00
parent d9eaa54ef4
commit c9cf7fc7a4
3 changed files with 56 additions and 18 deletions


@@ -1996,6 +1996,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return replaceInstUsesWith(*II, V);
     break;
+  case Intrinsic::x86_sse2_pmulu_dq:
+  case Intrinsic::x86_sse41_pmuldq:
+  case Intrinsic::x86_avx2_pmul_dq:
+  case Intrinsic::x86_avx2_pmulu_dq: {
+    unsigned VWidth = II->getType()->getVectorNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt DemandedElts = APInt::getAllOnesValue(VWidth);
+    if (Value *V = SimplifyDemandedVectorElts(II, DemandedElts, UndefElts)) {
+      if (V != II)
+        return replaceInstUsesWith(*II, V);
+      return II;
+    }
+    break;
+  }
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);


@@ -1431,6 +1431,33 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
       break;
+    case Intrinsic::x86_sse2_pmulu_dq:
+    case Intrinsic::x86_sse41_pmuldq:
+    case Intrinsic::x86_avx2_pmul_dq:
+    case Intrinsic::x86_avx2_pmulu_dq: {
+      Value *Op0 = II->getArgOperand(0);
+      Value *Op1 = II->getArgOperand(1);
+      unsigned InnerVWidth = Op0->getType()->getVectorNumElements();
+      assert((VWidth * 2) == InnerVWidth && "Unexpected input size");
+      APInt InnerDemandedElts(InnerVWidth, 0);
+      for (unsigned i = 0; i != VWidth; ++i)
+        if (DemandedElts[i])
+          InnerDemandedElts.setBit(i * 2);
+      UndefElts2 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op0, InnerDemandedElts, UndefElts2,
+                                        Depth + 1);
+      if (TmpV) { II->setArgOperand(0, TmpV); MadeChange = true; }
+      UndefElts3 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op1, InnerDemandedElts, UndefElts3,
+                                        Depth + 1);
+      if (TmpV) { II->setArgOperand(1, TmpV); MadeChange = true; }
+      break;
+    }
     // SSE4A instructions leave the upper 64-bits of the 128-bit result
     // in an undefined state.
     case Intrinsic::x86_sse4a_extrq:


@@ -7,11 +7,10 @@
 define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT: ret <2 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: ret <2 x i64> [[TMP3]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -22,10 +21,9 @@ define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
 define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT: ret <4 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 3, i32 undef, i32 5, i32 undef, i32 7, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT: ret <4 x i64> [[TMP2]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
@@ -35,10 +33,9 @@ define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
 define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: ret <2 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> [[TMP1]])
+; CHECK-NEXT: ret <2 x i64> [[TMP2]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@@ -48,11 +45,10 @@ define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
 define <4 x i64> @test_demanded_elts_pmuluq_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @test_demanded_elts_pmuluq_256(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
-; CHECK-NEXT: ret <4 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef>
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP2]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
+; CHECK-NEXT: ret <4 x i64> [[TMP3]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>