[X86, AVX] use blends instead of insert128 with index 0

Another case of x86-specific shuffle strength reduction:
avoid generating insert*128 instructions with index 0 because
they are slower than their non-lane-changing blend equivalents.

Shuffle lowering already catches most of these cases, but
the zero vector case and some other paths such as in the
modified test in vector-shuffle-256-v32.ll were getting
through.

Differential Revision: http://reviews.llvm.org/D8366

llvm-svn: 232773
This commit is contained in:
Sanjay Patel 2015-03-19 22:29:40 +00:00
parent ab58a568ee
commit d5c2d287f9
4 changed files with 114 additions and 22 deletions

View File

@ -160,8 +160,51 @@ static SDValue InsertSubVector(SDValue Result, SDValue Vec,
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
SelectionDAG &DAG,SDLoc dl) {
SelectionDAG &DAG, SDLoc dl) {
assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
// We are still creating an INSERT_SUBVECTOR below with an undef node to
// extend the subvector to the size of the result vector. Make sure that
// we are not recursing on that node by checking for undef here.
if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
Result.getOpcode() != ISD::UNDEF) {
EVT ResultVT = Result.getValueType();
SDValue ZeroIndex = DAG.getIntPtrConstant(0);
SDValue Undef = DAG.getUNDEF(ResultVT);
SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
Vec, ZeroIndex);
// The blend instruction, and therefore its mask, depend on the data type.
MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
if (ScalarType.isFloatingPoint()) {
// Choose either vblendps (float) or vblendpd (double).
unsigned ScalarSize = ScalarType.getSizeInBits();
assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
}
const X86Subtarget &Subtarget =
static_cast<const X86Subtarget &>(DAG.getSubtarget());
// AVX2 is needed for 256-bit integer blend support.
// Integers must be cast to 32-bit because there is only vpblendd;
// vpblendw can't be used for this because it has a handicapped mask.
// If we don't have AVX2, then cast to float. Using a wrong domain blend
// is still more efficient than using the wrong domain vinsertf128 that
// will be created by InsertSubVector().
MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
}
return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

View File

@ -5,10 +5,10 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK-LABEL: func:
;CHECK: vpxor
;CHECK: vinserti128
;CHECK: vxorps
;CHECK: vpshufd
;CHECK: vpbroadcastd
;CHECK: vinserti128
;CHECK: vmulps
;CHECK: vmulps
;CHECK: ret

View File

@ -1,51 +1,100 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; Prefer a blend instruction to a vinsert128 instruction because blends
; are simpler (no lane changes) and therefore will have equal or better
; performance.
; CHECK-LABEL: castA:
; CHECK: vxorps
; CHECK-NEXT: vinsertf128 $0
define <8 x float> @castA(<4 x float> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castA:
; AVX1: vxorps %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castA:
; AVX2: vxorps %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
entry:
%shuffle.i = shufflevector <4 x float> %m, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
ret <8 x float> %shuffle.i
}
; CHECK-LABEL: castB:
; CHECK: vxorps
; CHECK-NEXT: vinsertf128 $0
define <4 x double> @castB(<2 x double> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castB:
; AVX1: vxorpd %ymm1, %ymm1, %ymm1
; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castB:
; AVX2: vxorpd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX2-NEXT: retq
entry:
%shuffle.i = shufflevector <2 x double> %m, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x double> %shuffle.i
}
; CHECK-LABEL: castC:
; CHECK: vxorps
; CHECK-NEXT: vinsertf128 $0
; AVX2 is needed for integer types.
define <4 x i64> @castC(<2 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castC:
; AVX1: vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: castC:
; AVX2: vpxor %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: retq
entry:
%shuffle.i = shufflevector <2 x i64> %m, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x i64> %shuffle.i
}
; CHECK-LABEL: castD:
; CHECK-NOT: vextractf128 $0
; The next three tests don't need any shuffling. There may or may not be a
; vzeroupper before the return, so just check for the absence of shuffles.
define <4 x float> @castD(<8 x float> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castD:
; AVX1-NOT: extract
; AVX1-NOT: blend
;
; AVX2-LABEL: castD:
; AVX2-NOT: extract
; AVX2-NOT: blend
entry:
%shuffle.i = shufflevector <8 x float> %m, <8 x float> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %shuffle.i
}
; CHECK-LABEL: castE:
; CHECK-NOT: vextractf128 $0
define <2 x i64> @castE(<4 x i64> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castE:
; AVX1-NOT: extract
; AVX1-NOT: blend
;
; AVX2-LABEL: castE:
; AVX2-NOT: extract
; AVX2-NOT: blend
entry:
%shuffle.i = shufflevector <4 x i64> %m, <4 x i64> %m, <2 x i32> <i32 0, i32 1>
ret <2 x i64> %shuffle.i
}
; CHECK-LABEL: castF:
; CHECK-NOT: vextractf128 $0
define <2 x double> @castF(<4 x double> %m) nounwind uwtable readnone ssp {
; AVX1-LABEL: castF:
; AVX1-NOT: extract
; AVX1-NOT: blend
;
; AVX2-LABEL: castF:
; AVX2-NOT: extract
; AVX2-NOT: blend
entry:
%shuffle.i = shufflevector <4 x double> %m, <4 x double> %m, <2 x i32> <i32 0, i32 1>
ret <2 x double> %shuffle.i

View File

@ -652,12 +652,12 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX2-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX2: # BB#0:
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX2-NEXT: movl $15, %eax
; AVX2-NEXT: vmovd %eax, %xmm1
; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
; AVX2-NEXT: vpblendd $15, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>