From b1d3457c0321a1b04bdd192c6de203f30f682be2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 26 Jan 2019 00:26:37 +0000 Subject: [PATCH] [SelectionDAG][X86] Don't use SEXTLOAD for promoting masked loads in the type legalizer Summary: I'm not sure why we were using SEXTLOAD. EXTLOAD seems more appropriate since we don't care about the upper bits. This patch changes this and then modifies the X86 post legalization combine to emit a extending shuffle instead of a sign_extend_vector_inreg. Could maybe use an any_extend_vector_inreg, but I just did what we already do in LowerLoad. I think we can actually get rid of this code entirely if we switch to -x86-experimental-vector-widening-legalization. On AVX512 targets I think we might be able to use a masked vpmovzx and not have to expand this at all. Reviewers: RKSimon, spatel Reviewed By: RKSimon Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D57186 llvm-svn: 352255 --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +++++++++++++++--- llvm/test/CodeGen/X86/masked_load.ll | 8 ++++---- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 62fadc1e6232..e94ccc08d992 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -554,7 +554,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), N->getMask(), ExtPassThru, N->getMemoryVT(), - N->getMemOperand(), ISD::SEXTLOAD); + N->getMemOperand(), ISD::EXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d1870b390ea7..13d56cb0d9f0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37592,7 +37592,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::SEXTLOAD) + if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. @@ -37662,8 +37662,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); - return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 5a456bd2d718..8d28f45d9886 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -1388,7 +1388,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v2i32_v2i32: @@ -1400,7 +1400,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v2i32_v2i32: @@ -1412,7 +1412,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1423,7 +1423,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)