[SelectionDAG][X86] Don't use SEXTLOAD for promoting masked loads in the type legalizer

Summary:
I'm not sure why we were using SEXTLOAD. EXTLOAD seems more appropriate since we don't care about the upper bits.
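As an illustration only (not LLVM code), here is a minimal scalar model of the three load-extension kinds, using a made-up i8-to-i32 widening: a consumer that only reads the low bits sees the same value no matter how the upper bits were filled, which is why the cheaper any-extending EXTLOAD is enough when promoting the masked load.

  // Scalar model of SEXTLOAD / ZEXTLOAD / EXTLOAD semantics (hypothetical example).
  #include <cstdint>
  #include <cstdio>

  int main() {
    int8_t mem = -5;                        // the narrow value as it sits in memory (0xFB)

    uint32_t sext = (uint32_t)(int32_t)mem; // SEXTLOAD: upper bits are copies of the sign bit
    uint32_t zext = (uint32_t)(uint8_t)mem; // ZEXTLOAD: upper bits are zero
    uint32_t aext = zext;                   // EXTLOAD (any-extend): upper bits are unspecified,
                                            // so either of the above is a valid materialization

    // Only the low 8 bits are observed, so all three agree: prints "-5 -5 -5".
    printf("%d %d %d\n", (int8_t)sext, (int8_t)zext, (int8_t)aext);
    return 0;
  }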

This patch changes this and then modifies the X86 post-legalization combine to emit an extending shuffle instead of a sign_extend_vector_inreg. We could maybe use an any_extend_vector_inreg, but I just did what we already do in LowerLoad. I think we can actually get rid of this code entirely if we switch to -x86-experimental-vector-widening-legalization.
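For reference, here is a scalar, little-endian model of what the extending shuffle produces (again not LLVM code; the element counts are chosen to match the <2 x i64> result with a <2 x i32> memory type exercised by the test below, so NumElems = 2 and SizeRatio = 2).

  // Scalar model of the "extending shuffle": move 2 x i32 into the low halves
  // of 2 x i64 lanes, leaving the high halves as don't-care (zero here).
  #include <cstdint>
  #include <cstdio>
  #include <cstring>

  int main() {
    // Result of the wide, non-extending masked load, viewed as 4 x i32.
    // Only the first two lanes hold loaded data; the rest are don't-care.
    uint32_t loaded[4] = {7, 9, 0, 0};

    // Shuffle mask [0, -1, 1, -1]: element i moves to lane i * SizeRatio,
    // the remaining lanes are undef (the X86 lowering picks zero here,
    // which is why the tests now show vpmovzxdq).
    uint32_t shuffled[4] = {loaded[0], 0, loaded[1], 0};

    // Bitcast back to 2 x i64: on little-endian, each i64 lane now holds the
    // loaded i32 in its low half.
    uint64_t result[2];
    memcpy(result, shuffled, sizeof(result));
    printf("%llu %llu\n", (unsigned long long)result[0],
           (unsigned long long)result[1]);   // prints "7 9"
    return 0;
  }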

On AVX512 targets I think we might be able to use a masked vpmovzx and not have to expand this at all.

Reviewers: RKSimon, spatel

Reviewed By: RKSimon

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D57186

llvm-svn: 352255
Commit b1d3457c03 (parent 6bb3a1aa75)
Author: Craig Topper
Date: 2019-01-26 00:26:37 +00:00
3 changed files with 20 additions and 8 deletions

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

@@ -554,7 +554,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
   SDLoc dl(N);
   SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
                                   N->getMask(), ExtPassThru, N->getMemoryVT(),
-                                  N->getMemOperand(), ISD::SEXTLOAD);
+                                  N->getMemOperand(), ISD::EXTLOAD);
   // Legalize the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), Res.getValue(1));

llvm/lib/Target/X86/X86ISelLowering.cpp

@@ -37592,7 +37592,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
       return Blend;
   }

-  if (Mld->getExtensionType() != ISD::SEXTLOAD)
+  if (Mld->getExtensionType() != ISD::EXTLOAD)
     return SDValue();

   // Resolve extending loads.
@@ -37662,8 +37662,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                      Mld->getBasePtr(), NewMask, WidePassThru,
                                      Mld->getMemoryVT(), Mld->getMemOperand(),
                                      ISD::NON_EXTLOAD);
-  SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG);
-  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
+
+  SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd);
+  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i != NumElems; ++i)
+    ShuffleVec[i * SizeRatio] = i;
+
+  // Can't shuffle using an illegal type.
+  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
+         "WideVecVT should be legal");
+  SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+                                   DAG.getUNDEF(WideVecVT), ShuffleVec);
+  SlicedVec = DAG.getBitcast(VT, SlicedVec);
+
+  return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true);
 }

 /// If exactly one element of the mask is set for a non-truncating masked store,

llvm/test/CodeGen/X86/masked_load.ll

@@ -1388,7 +1388,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
 ; AVX1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2
 ; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: load_v2i32_v2i32:
@@ -1400,7 +1400,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
 ; AVX2-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2
 ; AVX2-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm2, %xmm1, %xmm0
-; AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_v2i32_v2i32:
@@ -1412,7 +1412,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
 ; AVX512F-NEXT:    kshiftlw $14, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $14, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1}
-; AVX512F-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1423,7 +1423,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
 ; AVX512VLBW-NEXT:    vptestnmq %xmm0, %xmm0, %k1
 ; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
 ; AVX512VLBW-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
-; AVX512VLBW-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX512VLBW-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; AVX512VLBW-NEXT:    retq
   %mask = icmp eq <2 x i32> %trigger, zeroinitializer
   %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)