[X86] Move turning 256-bit INSERT_SUBVECTORS into BLENDI from legalize to DAG combine.
On one test this seems to have given DAG combine more opportunity to perform other INSERT_SUBVECTOR/EXTRACT_SUBVECTOR combines before the BLENDI was created. It looks like we can still improve further by teaching DAG combine to optimize INSERT_SUBVECTOR/EXTRACT_SUBVECTOR with BLENDI.

llvm-svn: 293944
parent c35139ec0d
commit c45657375b
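For context on where the moved code now runs: the blend creation fires from X86's target DAG-combine hook instead of during operation legalization, so generic INSERT_SUBVECTOR/EXTRACT_SUBVECTOR folds get a chance to rewrite the node first. Below is a condensed sketch of that dispatch path; it paraphrases the X86ISelLowering.cpp wiring rather than quoting it, and elides all the other cases.

// Sketch only: condensed dispatch from the target combine hook to the
// handler added in this patch; not a verbatim excerpt.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  // ... other opcodes elided ...
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  }
  return SDValue();
}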
@@ -4917,50 +4917,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");

  // For insertion into the zero index (low half) of a 256-bit vector, it is
  // more efficient to generate a blend with immediate instead of an insert*128.
  // We are still creating an INSERT_SUBVECTOR below with an undef node to
  // extend the subvector to the size of the result vector. Make sure that
  // we are not recursing on that node by checking for undef here.
  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
      !Result.isUndef()) {
    EVT ResultVT = Result.getValueType();
    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
    SDValue Undef = DAG.getUNDEF(ResultVT);
    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
                                 Vec, ZeroIndex);

    // The blend instruction, and therefore its mask, depend on the data type.
    MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
    if (ScalarType.isFloatingPoint()) {
      // Choose either vblendps (float) or vblendpd (double).
      unsigned ScalarSize = ScalarType.getSizeInBits();
      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
    }

    const X86Subtarget &Subtarget =
        static_cast<const X86Subtarget &>(DAG.getSubtarget());

    // AVX2 is needed for 256-bit integer blend support.
    // Integers must be cast to 32-bit because there is only vpblendd;
    // vpblendw can't be used for this because it has a handicapped mask.

    // If we don't have AVX2, then cast to float. Using a wrong domain blend
    // is still more efficient than using the wrong domain vinsertf128 that
    // will be created by InsertSubVector().
    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;

    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
    Result = DAG.getBitcast(CastVT, Result);
    Vec256 = DAG.getBitcast(CastVT, Vec256);
    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
    return DAG.getBitcast(ResultVT, Vec256);
  }

  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}
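To make the mask constants above concrete: bit i of a BLENDI immediate selects element i from the second source operand, so covering the low 128 bits of a 256-bit vector takes 0x03 at 64-bit granularity (vblendpd) and 0x0f at 32-bit granularity (vblendps/vpblendd). A small standalone C++ program (illustrative only, not LLVM code; all names here are hypothetical) that emulates this:

// Emulate the blend-with-immediate semantics the transform relies on:
// result[i] = (mask >> i) & 1 ? B[i] : A[i].
#include <cassert>
#include <cstdio>

template <typename T, unsigned N>
void blendi(const T (&A)[N], const T (&B)[N], unsigned Mask, T (&Out)[N]) {
  for (unsigned i = 0; i != N; ++i)
    Out[i] = ((Mask >> i) & 1) ? B[i] : A[i];
}

// The low-half insert mask for a 256-bit vector: one set bit per
// element that lives in the low 128 bits.
unsigned lowHalfMask(unsigned NumElts, unsigned EltBits) {
  assert(NumElts * EltBits == 256 && "expected a 256-bit vector");
  unsigned LowElts = 128 / EltBits; // elements in the low half
  return (1u << LowElts) - 1;       // 0x03 for f64, 0x0f for f32/i32
}

int main() {
  assert(lowHalfMask(4, 64) == 0x03); // vblendpd immediate
  assert(lowHalfMask(8, 32) == 0x0f); // vblendps / vpblendd immediate
  double A[4] = {0, 1, 2, 3}, B[4] = {8, 9, 10, 11}, R[4];
  blendi(A, B, 0x03, R);              // R = {8, 9, 2, 3}
  printf("%g %g %g %g\n", R[0], R[1], R[2], R[3]);
}

Note the operand order matches the calls above: the mask selects from the second operand (Vec256, the widened subvector), so the low half comes from the inserted vector and the high half is kept from the destination.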
@@ -34165,6 +34121,45 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
  MVT OpVT = N->getSimpleValueType(0);
  MVT SubVecVT = SubVec.getSimpleValueType();

  // For insertion into the zero index (low half) of a 256-bit vector, it is
  // more efficient to generate a blend with immediate instead of an insert*128.
  // We are still creating an INSERT_SUBVECTOR below with an undef node to
  // extend the subvector to the size of the result vector. Make sure that
  // we are not recursing on that node by checking for undef here.
  if (IdxVal == 0 && OpVT.is256BitVector() && SubVecVT.is128BitVector() &&
      !Vec.isUndef()) {
    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
    SDValue Undef = DAG.getUNDEF(OpVT);
    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                                 SubVec, ZeroIndex);

    // The blend instruction, and therefore its mask, depend on the data type.
    MVT ScalarType = OpVT.getVectorElementType();
    if (ScalarType.isFloatingPoint()) {
      // Choose either vblendps (float) or vblendpd (double).
      unsigned ScalarSize = ScalarType.getSizeInBits();
      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
      return DAG.getNode(X86ISD::BLENDI, dl, OpVT, Vec, Vec256, Mask);
    }

    // AVX2 is needed for 256-bit integer blend support.
    // Integers must be cast to 32-bit because there is only vpblendd;
    // vpblendw can't be used for this because it has a handicapped mask.

    // If we don't have AVX2, then cast to float. Using a wrong domain blend
    // is still more efficient than using the wrong domain vinsertf128 that
    // will be created by InsertSubVector().
    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;

    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
    Vec = DAG.getBitcast(CastVT, Vec);
    Vec256 = DAG.getBitcast(CastVT, Vec256);
    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Vec, Vec256, Mask);
    return DAG.getBitcast(OpVT, Vec256);
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
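The pre-AVX2 fallback above blends integer data in the floating-point domain. A minimal standalone demonstration (illustrative only, not LLVM code) of why that is bit-safe: a blend only routes whole 32-bit lanes without doing arithmetic on them, so bitcast, blend in the float domain, bitcast back reproduces exactly the lanes vpblendd would have produced.

// Emulate bitcast -> float-domain blend -> bitcast on integer data and
// check that the integer bit patterns survive unchanged.
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Small values chosen so the reinterpreted floats are ordinary
  // (denormal) patterns, never NaNs.
  uint32_t A[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint32_t B[8] = {9, 10, 11, 12, 13, 14, 15, 16};

  // "Bitcast" to the float domain: reinterpret the same 256 bits.
  float FA[8], FB[8], FR[8];
  std::memcpy(FA, A, sizeof(A));
  std::memcpy(FB, B, sizeof(B));

  // Blend with immediate 0x0f: lanes 0..3 from B, lanes 4..7 from A.
  for (int i = 0; i != 8; ++i)
    FR[i] = ((0x0f >> i) & 1) ? FB[i] : FA[i];

  // "Bitcast" back to integers: the original lane values come through.
  uint32_t R[8];
  std::memcpy(R, FR, sizeof(FR));
  for (int i = 0; i != 4; ++i) {
    assert(R[i] == B[i]);         // low half taken from B
    assert(R[i + 4] == A[i + 4]); // high half kept from A
  }
}

This is the same reasoning the comment in the code gives: a wrong-domain vblendps is bit-exact, and cheaper than the wrong-domain vinsertf128 that InsertSubVector() would otherwise create.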
@@ -409,9 +409,8 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
; AVX1:       # BB#0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -421,8 +420,7 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
; AVX2:       # BB#0:
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm1[6],xmm2[7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
@@ -500,9 +498,8 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; AVX1:       # BB#0:
; AVX1-NEXT:    xorl %eax, %eax
; AVX1-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
@@ -513,8 +510,7 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
; AVX2:       # BB#0:
; AVX2-NEXT:    xorl %eax, %eax
; AVX2-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2