[X86, AVX] use blends instead of insert128 with index 0

Another case of x86-specific shuffle strength reduction: avoid generating insert*128 instructions with index 0 because they are slower than their non-lane-changing blend equivalents. Shuffle lowering already catches most of these cases, but the zero vector case and some other paths such as in the modified test in vector-shuffle-256-v32.ll were getting through. Differential Revision: http://reviews.llvm.org/D8366 llvm-svn: 232773
2015-03-19 22:29:40 +00:00 · 2015-03-19 22:29:40 +00:00 · d5c2d287f9
parent ab58a568ee
commit d5c2d287f9
4 changed files with 114 additions and 22 deletions
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@ -160,8 +160,51 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
 /// lowering INSERT_VECTOR_ELT operations easier.
 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
-                                  SelectionDAG &DAG,SDLoc dl) {
+                                  SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+
+  // For insertion into the zero index (low half) of a 256-bit vector, it is
+  // more efficient to generate a blend with immediate instead of an insert*128.
+  // We are still creating an INSERT_SUBVECTOR below with an undef node to
+  // extend the subvector to the size of the result vector. Make sure that
+  // we are not recursing on that node by checking for undef here.
+  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
+      Result.getOpcode() != ISD::UNDEF) {
+    EVT ResultVT = Result.getValueType();
+    SDValue ZeroIndex = DAG.getIntPtrConstant(0);
+    SDValue Undef = DAG.getUNDEF(ResultVT);
+    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
+                               Vec, ZeroIndex);
+
+    // The blend instruction, and therefore its mask, depend on the data type.
+    MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
+    if (ScalarType.isFloatingPoint()) {
+      // Choose either vblendps (float) or vblendpd (double).
+      unsigned ScalarSize = ScalarType.getSizeInBits();
+      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
+      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
+      SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
+      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
+    }
+    
+    const X86Subtarget &Subtarget =
+      static_cast<const X86Subtarget &>(DAG.getSubtarget());
+
+    // AVX2 is needed for 256-bit integer blend support.
+    // Integers must be cast to 32-bit because there is only vpblendd;
+    // vpblendw can't be used for this because it has a handicapped mask.
+
+    // If we don't have AVX2, then cast to float. Using a wrong domain blend
+    // is still more efficient than using the wrong domain vinsertf128 that
+    // will be created by InsertSubVector().
+    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
+
+    SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
+    Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
+    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
+    return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
+  }
+
  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }

--- a/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/llvm/test/CodeGen/X86/2012-04-26-sdglue.ll
@ -5,10 +5,10 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK-LABEL: func:
-;CHECK: vpxor
-;CHECK: vinserti128
+;CHECK: vxorps
 ;CHECK: vpshufd
 ;CHECK: vpbroadcastd
+;CHECK: vinserti128
 ;CHECK: vmulps
 ;CHECK: vmulps
 ;CHECK: ret
--- a/llvm/test/CodeGen/X86/avx-cast.ll
+++ b/llvm/test/CodeGen/X86/avx-cast.ll
@ -1,51 +1,100 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx  | FileCheck %s --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+
+; Prefer a blend instruction to a vinsert128 instruction because blends
+; are simpler (no lane changes) and therefore will have equal or better
+; performance.

-; CHECK-LABEL: castA:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castA:
+; AVX1:         vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castA:
+; AVX2:         vxorps %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+
 entry:
  %shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  ret <8 x float> %shuffle.i
 }

-; CHECK-LABEL: castB:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
 define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castB:
+; AVX1:         vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castB:
+; AVX2:         vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX2-NEXT:    retq
+
 entry:
  %shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x double> %shuffle.i
 }

-; CHECK-LABEL: castC:
-; CHECK: vxorps
-; CHECK-NEXT: vinsertf128 $0
+; AVX2 is needed for integer types.
+
 define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castC:
+; AVX1:         vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: castC:
+; AVX2:         vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT:    retq
+
 entry:
  %shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x i64> %shuffle.i
 }

-; CHECK-LABEL: castD:
-; CHECK-NOT: vextractf128 $0
+; The next three tests don't need any shuffling. There may or may not be a
+; vzeroupper before the return, so just check for the absence of shuffles.
+
 define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castD:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castD:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
  %shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle.i
 }

-; CHECK-LABEL: castE:
-; CHECK-NOT: vextractf128 $0
 define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castE:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castE:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
  %shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %shuffle.i
 }

-; CHECK-LABEL: castF:
-; CHECK-NOT: vextractf128 $0
 define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
+; AVX1-LABEL: castF:
+; AVX1-NOT:    extract
+; AVX1-NOT:    blend
+;
+; AVX2-LABEL: castF:
+; AVX2-NOT:    extract
+; AVX2-NOT:    blend
+
 entry:
  %shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuffle.i
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@ -652,12 +652,12 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
 ;
 ; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX2:       # BB#0:
+; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
 ; AVX2-NEXT:    movl $15, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpxor %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vinserti128 $0, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX2-NEXT:    vpblendd $15, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>