[X86][SSE] Improved blend+zero target shuffle combining to use combined shuffle mask directly
We currently only combine to blend+zero if the target value type has 8 or fewer elements, which misses many cases where the combined mask has been widened. This change instead uses the combined mask to determine the blend value type, allowing us to catch more of the widened cases.

llvm-svn: 272003
commit ca1da1bf07
parent 53298a1808
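The gist of the patch: the blend+zero combine was gated on the element count of the root value type, so shuffles whose combined mask had been widened to fewer, larger elements were rejected even when a legal blend existed. Below is a minimal standalone sketch of that gating change, assuming a hypothetical v16i8 root whose combined mask widens to 4 x 32-bit elements; it is plain C++ with made-up constants, not LLVM's actual MVT API.

// Minimal sketch of the gating change. The element counts model a
// hypothetical v16i8 root whose combined mask widened to 4 elements.
#include <cstdio>

int main() {
  const unsigned RootNumElts = 16; // VT.getVectorNumElements() for a v16i8 root
  const unsigned NumMaskElts = 4;  // combined shuffle mask, widened to 4 elements

  // Pre-patch gate: tests the root type, so 16 > 8 rejects the combine.
  bool OldGate = RootNumElts <= 8;
  // Post-patch gate: tests the combined mask, so 4 <= 8 accepts it.
  bool NewGate = NumMaskElts <= 8;

  std::printf("old gate: %s, new gate: %s\n", OldGate ? "blend" : "skip",
              NewGate ? "blend" : "skip");
  return 0;
}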
@@ -24602,23 +24602,27 @@ static bool combineX86ShuffleChain(SDValue Input, SDValue Root,
     }
   }
 
   // Attempt to blend with zero.
-  if (VT.getVectorNumElements() <= 8 &&
+  if (NumMaskElts <= 8 &&
       ((Subtarget.hasSSE41() && VT.is128BitVector()) ||
        (Subtarget.hasAVX() && VT.is256BitVector()))) {
     // Convert VT to a type compatible with X86ISD::BLENDI.
     // TODO - add 16i16 support (requires lane duplication).
-    MVT ShuffleVT = VT;
+    bool FloatDomain = VT.isFloatingPoint();
+    MVT ShuffleVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
+                                : MVT::getIntegerVT(MaskEltSizeInBits);
+    ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts);
    if (Subtarget.hasAVX2()) {
-      if (VT == MVT::v4i64)
+      if (ShuffleVT == MVT::v4i64)
         ShuffleVT = MVT::v8i32;
-      else if (VT == MVT::v2i64)
+      else if (ShuffleVT == MVT::v2i64)
         ShuffleVT = MVT::v4i32;
     } else {
-      if (VT == MVT::v2i64 || VT == MVT::v4i32)
+      if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
         ShuffleVT = MVT::v8i16;
-      else if (VT == MVT::v4i64)
+      else if (ShuffleVT == MVT::v4i64)
         ShuffleVT = MVT::v4f64;
-      else if (VT == MVT::v8i32)
+      else if (ShuffleVT == MVT::v8i32)
         ShuffleVT = MVT::v8f32;
     }
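As a rough standalone model of the new ShuffleVT derivation above: the blend type is now built from the combined mask's element size and count, then remapped to a BLENDI-compatible type as in the hunk. Here blendTypeName is a hypothetical helper for illustration, not an LLVM API.

// Standalone model of the new blend-type derivation; blendTypeName is a
// hypothetical helper, not part of LLVM.
#include <cstdio>
#include <string>

static std::string blendTypeName(bool FloatDomain, unsigned MaskEltSizeInBits,
                                 unsigned NumMaskElts) {
  // Mirrors: ShuffleVT = MVT::getVectorVT(FloatDomain ? f<N> : i<N>, NumMaskElts)
  std::string Scalar =
      (FloatDomain ? "f" : "i") + std::to_string(MaskEltSizeInBits);
  return "v" + std::to_string(NumMaskElts) + Scalar;
}

int main() {
  // A mask widened to 2 x 64-bit integer elements yields a v2i64 blend type,
  // which the patch then remaps (e.g. to v4i32 on AVX2).
  std::printf("%s\n", blendTypeName(false, 64, 2).c_str()); // v2i64
  // A float-domain mask of 4 x 32-bit elements yields v4f32 directly.
  std::printf("%s\n", blendTypeName(true, 32, 4).c_str());  // v4f32
  return 0;
}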
@@ -347,16 +347,11 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 ; SSE-NEXT: pinsrw $7, %eax, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: _clearupper4xi32b:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: _clearupper4xi32b:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[4,5],zero,zero,xmm0[8,9],zero,zero,xmm0[12,13],zero,zero
-; AVX2-NEXT: retq
+; AVX-LABEL: _clearupper4xi32b:
+; AVX: # BB#0:
+; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX-NEXT: retq
   %x16 = bitcast <4 x i32> %0 to <8 x i16>
   %r0 = insertelement <8 x i16> %x16, i16 zeroinitializer, i32 1
   %r1 = insertelement <8 x i16> %r0, i16 zeroinitializer, i32 3
@@ -107,7 +107,8 @@ define <16 x i8> @combine_vpperm_identity_bitcast(<16 x i8> %a0, <16 x i8> %a1)
 define <16 x i8> @combine_vpperm_as_blend_with_zero(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: combine_vpperm_as_blend_with_zero:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4,5,6,7]
 ; CHECK-NEXT: retq
   %res0 = call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> <i8 0, i8 1, i8 128, i8 129, i8 4, i8 5, i8 6, i8 7, i8 130, i8 131, i8 132, i8 133, i8 134, i8 135, i8 136, i8 137>)
   ret <16 x i8> %res0
@@ -1182,26 +1182,47 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
-; AVX: # BB#0:
-; AVX-NEXT: movswq %di, %r10
-; AVX-NEXT: movswq %si, %r11
-; AVX-NEXT: movswq %dx, %rdx
-; AVX-NEXT: movswq %cx, %rcx
-; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movswq %r8w, %rdi
-; AVX-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
-; AVX-NEXT: movswq %r9w, %rax
-; AVX-NEXT: movzwl -40(%rsp,%r10,2), %esi
-; AVX-NEXT: vmovd %esi, %xmm0
-; AVX-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
-; AVX-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
-; AVX-NEXT: retq
+; AVX1-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX1: # BB#0:
+; AVX1-NEXT: movswq %di, %r10
+; AVX1-NEXT: movswq %si, %r11
+; AVX1-NEXT: movswq %dx, %rdx
+; AVX1-NEXT: movswq %cx, %rcx
+; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r8w, %rdi
+; AVX1-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT: movswq %r9w, %rax
+; AVX1-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX1-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
+; AVX2: # BB#0:
+; AVX2-NEXT: movswq %di, %r10
+; AVX2-NEXT: movswq %si, %r11
+; AVX2-NEXT: movswq %dx, %rdx
+; AVX2-NEXT: movswq %cx, %rcx
+; AVX2-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r8w, %rdi
+; AVX2-NEXT: vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movswq %r9w, %rax
+; AVX2-NEXT: movzwl -40(%rsp,%r10,2), %esi
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpinsrw $1, -24(%rsp,%r11,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $4, -40(%rsp,%rdi,2), %xmm0, %xmm0
+; AVX2-NEXT: vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX2-NEXT: retq
   %x0 = extractelement <8 x i16> %x, i16 %i0
   %y1 = extractelement <8 x i16> %y, i16 %i1
   %x2 = extractelement <8 x i16> %x, i16 %i2