forked from OSchip/llvm-project
[X86][TLI] SimplifyDemandedVectorEltsForTargetNode(): don't break apart broadcasts from which not just the 0'th elt is demanded
Apparently this has no test coverage before D108382, but D108382 itself shows a few regressions that this fixes. It doesn't seem worthwhile breaking apart broadcasts: we presumably want the broadcasted value to be present in several elements, not just the 0'th one. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D108411
This commit is contained in:
parent
07f1d8f0ca
commit
5f2fe48d06
|
@ -39978,6 +39978,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
|
|||
}
|
||||
}
|
||||
|
||||
// For broadcasts, unless we *only* demand the 0'th element,
|
||||
// stop attempts at simplification here, we aren't going to improve things,
|
||||
// this is better than any potential shuffle.
|
||||
if (isTargetShuffleSplat(Op) && !DemandedElts.isOneValue())
|
||||
return false;
|
||||
|
||||
// Get target/faux shuffle mask.
|
||||
APInt OpUndef, OpZero;
|
||||
SmallVector<int, 64> OpMask;
|
||||
|
|
|
@ -284,7 +284,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_unary(<4
|
|||
define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary(<4 x i64> %x, <4 x i64> %y) nounwind {
|
||||
; CHECK-LABEL: vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1]
|
||||
; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
|
||||
; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
|
||||
; CHECK-NEXT: retq
|
||||
%r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 0, i32 4, i32 2, i32 4>
|
||||
|
|
|
@ -752,9 +752,9 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
|
|||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
|
||||
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
|
||||
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm4
|
||||
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm5
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,1,1]
|
||||
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm5
|
||||
; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm4, %xmm4
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
|
||||
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
|
||||
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3]
|
||||
|
|
|
@ -645,8 +645,7 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
|
|||
; X86-AVX1-LABEL: pinsrd_from_shufflevector_i32:
|
||||
; X86-AVX1: ## %bb.0: ## %entry
|
||||
; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
|
||||
; X86-AVX1-NEXT: vpermilps $0, (%eax), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x08,0x00]
|
||||
; X86-AVX1-NEXT: ## xmm1 = mem[0,0,0,0]
|
||||
; X86-AVX1-NEXT: vbroadcastss (%eax), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x08]
|
||||
; X86-AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
|
||||
; X86-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; X86-AVX1-NEXT: retl ## encoding: [0xc3]
|
||||
|
@ -669,8 +668,7 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
|
|||
;
|
||||
; X64-AVX1-LABEL: pinsrd_from_shufflevector_i32:
|
||||
; X64-AVX1: ## %bb.0: ## %entry
|
||||
; X64-AVX1-NEXT: vpermilps $0, (%rdi), %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x00]
|
||||
; X64-AVX1-NEXT: ## xmm1 = mem[0,0,0,0]
|
||||
; X64-AVX1-NEXT: vbroadcastss (%rdi), %xmm1 ## encoding: [0xc4,0xe2,0x79,0x18,0x0f]
|
||||
; X64-AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
|
||||
; X64-AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; X64-AVX1-NEXT: retq ## encoding: [0xc3]
|
||||
|
|
Loading…
Reference in New Issue