forked from OSchip/llvm-project
[AVX-512] Improve lowering of inserting a single element into lowest element of a 512-bit vector of zeroes by using vmovq/vmovd/vmovss/vmovsd.
llvm-svn: 277965
This commit is contained in:
parent
da178822c2
commit
f44423120f
|
@ -6691,12 +6691,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
||||||
|
|
||||||
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
|
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
|
||||||
(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
|
(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
|
||||||
if (VT.is512BitVector()) {
|
assert((VT.is128BitVector() || VT.is256BitVector() ||
|
||||||
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
|
VT.is512BitVector()) &&
|
||||||
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
|
|
||||||
Item, DAG.getIntPtrConstant(0, dl));
|
|
||||||
}
|
|
||||||
assert((VT.is128BitVector() || VT.is256BitVector()) &&
|
|
||||||
"Expected an SSE value type!");
|
"Expected an SSE value type!");
|
||||||
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
|
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
|
||||||
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
|
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
|
||||||
|
@ -12091,6 +12087,16 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
|
||||||
assert(Subtarget.hasAVX512() &&
|
assert(Subtarget.hasAVX512() &&
|
||||||
"Cannot lower 512-bit vectors w/ basic ISA!");
|
"Cannot lower 512-bit vectors w/ basic ISA!");
|
||||||
|
|
||||||
|
// If we have a single input to the zero element, insert that into V1 if we
|
||||||
|
// can do so cheaply.
|
||||||
|
int NumElts = Mask.size();
|
||||||
|
int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
|
||||||
|
|
||||||
|
if (NumV2Elements == 1 && Mask[0] >= NumElts)
|
||||||
|
if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
|
||||||
|
DL, VT, V1, V2, Mask, Subtarget, DAG))
|
||||||
|
return Insertion;
|
||||||
|
|
||||||
// Check for being able to broadcast a single element.
|
// Check for being able to broadcast a single element.
|
||||||
if (SDValue Broadcast =
|
if (SDValue Broadcast =
|
||||||
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
|
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
|
||||||
|
|
|
@ -3305,6 +3305,10 @@ let Predicates = [HasAVX512] in {
|
||||||
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
|
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
|
||||||
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
|
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
|
||||||
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
|
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
|
||||||
|
|
||||||
|
def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
|
||||||
|
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
|
||||||
|
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
|
||||||
}
|
}
|
||||||
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
|
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
|
||||||
let AddedComplexity = 20 in {
|
let AddedComplexity = 20 in {
|
||||||
|
@ -3330,6 +3334,10 @@ let Predicates = [HasAVX512] in {
|
||||||
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
|
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
|
||||||
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
|
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
|
||||||
|
|
||||||
|
def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
|
||||||
|
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
|
||||||
|
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
|
||||||
|
|
||||||
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
|
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
|
||||||
def : Pat<(v8i64 (X86vzload addr:$src)),
|
def : Pat<(v8i64 (X86vzload addr:$src)),
|
||||||
(SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
|
(SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
|
||||||
|
|
|
@ -297,3 +297,14 @@ define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c
|
||||||
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
|
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
|
||||||
ret <16 x float> %shuffle
|
ret <16 x float> %shuffle
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
|
||||||
|
; ALL-LABEL: insert_mem_and_zero_v16i32:
|
||||||
|
; ALL: # BB#0:
|
||||||
|
; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||||
|
; ALL-NEXT: retq
|
||||||
|
%a = load i32, i32* %ptr
|
||||||
|
%v = insertelement <16 x i32> undef, i32 %a, i32 0
|
||||||
|
%shuffle = shufflevector <16 x i32> %v, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
|
||||||
|
ret <16 x i32> %shuffle
|
||||||
|
}
|
||||||
|
|
|
@ -147,20 +147,15 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
|
||||||
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
|
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
|
||||||
; AVX512F-LABEL: shuffle_v8f64_70000000:
|
; AVX512F-LABEL: shuffle_v8f64_70000000:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
|
|
||||||
; AVX512F-NEXT: movl $7, %eax
|
; AVX512F-NEXT: movl $7, %eax
|
||||||
; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
|
; AVX512F-NEXT: vmovq %rax, %xmm1
|
||||||
; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
|
|
||||||
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
|
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-32-LABEL: shuffle_v8f64_70000000:
|
; AVX512F-32-LABEL: shuffle_v8f64_70000000:
|
||||||
; AVX512F-32: # BB#0:
|
; AVX512F-32: # BB#0:
|
||||||
; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
|
||||||
; AVX512F-32-NEXT: movl $7, %eax
|
; AVX512F-32-NEXT: movl $7, %eax
|
||||||
; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
|
; AVX512F-32-NEXT: vmovd %eax, %xmm1
|
||||||
; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2
|
|
||||||
; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
|
|
||||||
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
|
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
|
||||||
; AVX512F-32-NEXT: retl
|
; AVX512F-32-NEXT: retl
|
||||||
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
||||||
|
@ -1116,20 +1111,15 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: shuffle_v8i64_70000000:
|
; AVX512F-LABEL: shuffle_v8i64_70000000:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
|
|
||||||
; AVX512F-NEXT: movl $7, %eax
|
; AVX512F-NEXT: movl $7, %eax
|
||||||
; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
|
; AVX512F-NEXT: vmovq %rax, %xmm1
|
||||||
; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
|
|
||||||
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
|
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-32-LABEL: shuffle_v8i64_70000000:
|
; AVX512F-32-LABEL: shuffle_v8i64_70000000:
|
||||||
; AVX512F-32: # BB#0:
|
; AVX512F-32: # BB#0:
|
||||||
; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
|
||||||
; AVX512F-32-NEXT: movl $7, %eax
|
; AVX512F-32-NEXT: movl $7, %eax
|
||||||
; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
|
; AVX512F-32-NEXT: vmovd %eax, %xmm1
|
||||||
; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2
|
|
||||||
; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
|
|
||||||
; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
|
; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
|
||||||
; AVX512F-32-NEXT: retl
|
; AVX512F-32-NEXT: retl
|
||||||
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
||||||
|
|
Loading…
Reference in New Issue