From c0711af7f95ef78022362a9e7c7d562112ca9a05 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 26 Jun 2019 11:21:09 +0000
Subject: [PATCH] [X86][AVX] combineExtractSubvector - 'little to big'
 extract_subvector(bitcast()) support

Ideally this needs to be a generic combine in
DAGCombiner::visitEXTRACT_SUBVECTOR, but there are some nasty regressions
on AArch64 because NEON shuffles do not handle bitcasts at all.

llvm-svn: 364407
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 28 ++++++++
 llvm/test/CodeGen/X86/avg.ll                  |  4 +-
 .../CodeGen/X86/horizontal-reduce-smax.ll     |  4 +-
 .../CodeGen/X86/horizontal-reduce-smin.ll     |  4 +-
 .../CodeGen/X86/horizontal-reduce-umax.ll     |  4 +-
 .../CodeGen/X86/horizontal-reduce-umin.ll     |  4 +-
 .../CodeGen/X86/vector-reduce-mul-widen.ll    | 64 +++++++++----------
 llvm/test/CodeGen/X86/vector-reduce-mul.ll    | 64 +++++++++----------
 llvm/test/CodeGen/X86/vselect.ll              | 24 ++-----
 9 files changed, 109 insertions(+), 91 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0639940bf025..1bf7daf3b3b4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43630,6 +43630,34 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
         VT, SDLoc(N),
         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
 
+  // Try to move vector bitcast after extract_subv by scaling extraction index:
+  // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
+  // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
+  if (InVec.getOpcode() == ISD::BITCAST &&
+      InVec.getOperand(0).getValueType().isVector()) {
+    SDValue SrcOp = InVec.getOperand(0);
+    EVT SrcVT = SrcOp.getValueType();
+    unsigned SrcNumElts = SrcVT.getVectorNumElements();
+    unsigned DestNumElts = InVec.getValueType().getVectorNumElements();
+    if ((DestNumElts % SrcNumElts) == 0) {
+      unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+      if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
+        unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
+        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+                                        SrcVT.getScalarType(), NewExtNumElts);
+        if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+            TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+          unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+          SDLoc DL(N);
+          SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+                                           SrcOp, NewIndex);
+          return DAG.getBitcast(VT, NewExtract);
+        }
+      }
+    }
+  }
+
   // If we're extracting from a broadcast then we're better off just
   // broadcasting to the smaller type directly, assuming this is the only use.
   // As its a broadcast we don't care about the extraction index.
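To illustrate the transform above, here is a minimal standalone sketch (plain C++, not the LLVM API; the VecType struct and scaleExtract helper are hypothetical) of the element-count and index scaling the new combine performs. It models the case where a v4i32 subvector is extracted at index 4 from a v4i64 value that was bitcast to v8i32: the combine instead extracts v2i64 at index 2 from the pre-bitcast value and bitcasts the result. The legality check (TLI.isOperationLegalOrCustom) from the real code is omitted in this sketch.

// Hedged illustration: models only the arithmetic of the combine, not LLVM's EVT/SelectionDAG API.
#include <cassert>
#include <cstdio>

struct VecType {          // hypothetical stand-in for a simple vector EVT
  unsigned NumElts;       // number of vector elements
  unsigned EltBits;       // bits per element
};

// Returns true if the "little to big" rewrite applies, filling in the
// pre-bitcast extract type and the scaled extraction index.
static bool scaleExtract(VecType SrcVT, VecType DestVT, VecType ExtVT,
                         unsigned ExtIdx, VecType &NewExtVT,
                         unsigned &NewExtIdx) {
  if (DestVT.NumElts % SrcVT.NumElts != 0)
    return false;                                   // bitcast must widen the element count
  unsigned DestSrcRatio = DestVT.NumElts / SrcVT.NumElts;
  if (ExtVT.NumElts % DestSrcRatio != 0 || ExtIdx % DestSrcRatio != 0)
    return false;                                   // extract size/index must scale evenly
  NewExtVT = {ExtVT.NumElts / DestSrcRatio, SrcVT.EltBits};
  NewExtIdx = ExtIdx / DestSrcRatio;
  return true;
}

int main() {
  VecType Src = {4, 64};  // v4i64 (pre-bitcast value X)
  VecType Dest = {8, 32}; // v8i32 (bitcast of X being extracted from)
  VecType Ext = {4, 32};  // v4i32 subvector requested
  VecType NewExt;
  unsigned NewIdx;
  bool OK = scaleExtract(Src, Dest, Ext, /*ExtIdx=*/4, NewExt, NewIdx);
  assert(OK && NewExt.NumElts == 2 && NewExt.EltBits == 64 && NewIdx == 2);
  std::printf("extract v%ui%u at %u -> extract v%ui%u at %u, then bitcast\n",
              Ext.NumElts, Ext.EltBits, 4u, NewExt.NumElts, NewExt.EltBits,
              NewIdx);
  return 0;
}

The test updates below fall out of exactly this rewrite: once the extract is performed on the narrower pre-bitcast value, whole 256-bit (ymm) operations on values whose upper half is never used collapse to 128-bit (xmm) operations.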
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 22a6daa999d7..86d8628c9ae7 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -382,7 +382,7 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind {
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastq 24(%rdi), %ymm3
+; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm3
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1]
@@ -395,7 +395,7 @@
 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq 24(%rsi), %ymm2
+; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm2
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
 ; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
index ec831ac9b9f6..9ce634bfaea3 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll
@@ -472,7 +472,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
@@ -1139,7 +1139,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
index 1adb3368767d..b4bf606ff28b 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll
@@ -475,7 +475,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
@@ -1143,7 +1143,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; X86-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
index b676b2acb775..3e760ef104a7 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll
@@ -565,7 +565,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
 ; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
@@ -1290,7 +1290,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
 ; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
index 8c524aa92692..9d8853a4ddd2 100644
--- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
+++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll
@@ -503,7 +503,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
@@ -1192,7 +1192,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) {
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm0, %ymm3
 ; X86-AVX2-NEXT: vxorpd %ymm2, %ymm1, %ymm2
 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx
 ; X86-AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
index 573f9c836c21..a31aebcab24e 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul-widen.ll
@@ -144,14 +144,14 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-LABEL: test_v4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -168,14 +168,14 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BW-LABEL: test_v4i64:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -336,14 +336,14 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -639,14 +639,14 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
index 2187c9def2d1..e89857b25ff8 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll
@@ -144,14 +144,14 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX2-LABEL: test_v4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -168,14 +168,14 @@ define i64 @test_v4i64(<4 x i64> %a0) {
 ; AVX512BW-LABEL: test_v4i64:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -336,14 +336,14 @@ define i64 @test_v8i64(<8 x i64> %a0) {
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
@@ -639,14 +639,14 @@ define i64 @test_v16i64(<16 x i64> %a0) {
 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
 ; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/vselect.ll b/llvm/test/CodeGen/X86/vselect.ll
index a8afad6f9bb7..28f944434523 100644
--- a/llvm/test/CodeGen/X86/vselect.ll
+++ b/llvm/test/CodeGen/X86/vselect.ll
@@ -671,23 +671,13 @@ define i64 @vselect_any_extend_vector_inreg_crash(<8 x i8>* %x) {
 ; SSE41-NEXT: movq %xmm2, %rax
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: vselect_any_extend_vector_inreg_crash:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: vselect_any_extend_vector_inreg_crash:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: andl $32768, %eax # imm = 0x8000
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: vselect_any_extend_vector_inreg_crash:
+; AVX: # %bb.0:
+; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX-NEXT: vpcmpeqw {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: andl $32768, %eax # imm = 0x8000
+; AVX-NEXT: retq
 0:
   %1 = load <8 x i8>, <8 x i8>* %x
   %2 = icmp eq <8 x i8> %1,