[X86][SSE] combineExtractWithShuffle - extract(bitcast(broadcast(x))) --> x

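Removes some unnecessary GPR<-->vector register traffic: when the extracted element is the low bits of a broadcast scalar integer, use the scalar directly instead of broadcasting it into a vector register and extracting it back out.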
2020-01-22 17:23:56 +00:00 · 2020-01-22 17:23:56 +00:00 · 5340434c94
parent 58991ba773
commit 5340434c94
2 changed files with 15 additions and 12 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37102,11 +37102,24 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,

  SDValue SrcBC = peekThroughBitcasts(Src);

-  // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
+  // Handle extract(bitcast(broadcast(scalar_value))).
  if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
    SDValue SrcOp = SrcBC.getOperand(0);
    if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, SrcOp);
+
+    EVT SrcOpVT = SrcOp.getValueType();
+    if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
+        (SrcOpVT.getSizeInBits() % SrcSVT.getSizeInBits()) == 0) {
+      unsigned Scale = SrcOpVT.getSizeInBits() / SrcSVT.getSizeInBits();
+      unsigned Offset = IdxC.urem(Scale) * SrcSVT.getSizeInBits();
+      // TODO: Support non-zero offsets (element is not the low bits of SrcOp).
+      if (Offset == 0) {
+        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
+        SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
+        return SrcOp;
+      }
+    }
  }

  // If we're extracting a single element from a broadcast load and there are
@@ -37126,7 +37139,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
    }
  }

-  // Handle extract(scalar_to_vector(scalar_value)) for integers.
+  // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
  // TODO: Move to DAGCombine?
  if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
      SrcBC.getValueType().isInteger() &&
--- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll
@@ -77,9 +77,7 @@ define i2 @bitcast_v4i32_to_v2i2(<4 x i32> %a0) nounwind {
 ; AVX512-NEXT:    movzbl %cl, %eax
 ; AVX512-NEXT:    shrl $2, %eax
 ; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    vpbroadcastq %rax, %xmm0
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
@@ -124,9 +122,7 @@ define i4 @bitcast_v8i16_to_v2i4(<8 x i16> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %ecx
 ; AVX512-NEXT:    movzbl %cl, %eax
 ; AVX512-NEXT:    shrl $4, %eax
-; AVX512-NEXT:    vpbroadcastq %rax, %xmm0
 ; AVX512-NEXT:    andl $15, %ecx
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    retq
@@ -214,9 +210,7 @@ define i2 @bitcast_v4i64_to_v2i2(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    movzbl %cl, %eax
 ; AVX512-NEXT:    shrl $2, %eax
 ; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    vpbroadcastq %rax, %xmm0
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -264,9 +258,7 @@ define i4 @bitcast_v8i32_to_v2i4(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %ecx
 ; AVX512-NEXT:    movzbl %cl, %eax
 ; AVX512-NEXT:    shrl $4, %eax
-; AVX512-NEXT:    vpbroadcastq %rax, %xmm0
 ; AVX512-NEXT:    andl $15, %ecx
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper
@@ -451,9 +443,7 @@ define i4 @bitcast_v8i64_to_v2i4(<8 x i64> %a0) nounwind {
 ; AVX512-NEXT:    kmovd %k0, %ecx
 ; AVX512-NEXT:    movzbl %cl, %eax
 ; AVX512-NEXT:    shrl $4, %eax
-; AVX512-NEXT:    vpbroadcastq %rax, %xmm0
 ; AVX512-NEXT:    andl $15, %ecx
-; AVX512-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512-NEXT:    addb %cl, %al
 ; AVX512-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX512-NEXT:    vzeroupper