diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 62fadc1e6232..e94ccc08d992 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -554,7 +554,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), N->getMask(), ExtPassThru, N->getMemoryVT(), - N->getMemOperand(), ISD::SEXTLOAD); + N->getMemOperand(), ISD::EXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d1870b390ea7..13d56cb0d9f0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -37592,7 +37592,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, return Blend; } - if (Mld->getExtensionType() != ISD::SEXTLOAD) + if (Mld->getExtensionType() != ISD::EXTLOAD) return SDValue(); // Resolve extending loads. @@ -37662,8 +37662,20 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, Mld->getBasePtr(), NewMask, WidePassThru, Mld->getMemoryVT(), Mld->getMemOperand(), ISD::NON_EXTLOAD); - SDValue NewVec = getExtendInVec(/*Signed*/true, dl, VT, WideLd, DAG); - return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true); + + SDValue SlicedVec = DAG.getBitcast(WideVecVT, WideLd); + SmallVector ShuffleVec(NumElems * SizeRatio, -1); + for (unsigned i = 0; i != NumElems; ++i) + ShuffleVec[i * SizeRatio] = i; + + // Can't shuffle using an illegal type. + assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) && + "WideVecVT should be legal"); + SlicedVec = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec, + DAG.getUNDEF(WideVecVT), ShuffleVec); + SlicedVec = DAG.getBitcast(VT, SlicedVec); + + return DCI.CombineTo(N, SlicedVec, WideLd.getValue(1), true); } /// If exactly one element of the mask is set for a non-truncating masked store, diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll index 5a456bd2d718..8d28f45d9886 100644 --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -1388,7 +1388,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_v2i32_v2i32: @@ -1400,7 +1400,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX2-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 ; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,2,3] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_v2i32_v2i32: @@ -1412,7 +1412,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} -; AVX512F-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -1423,7 +1423,7 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3 ; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1 ; AVX512VLBW-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1} -; AVX512VLBW-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX512VLBW-NEXT: retq %mask = icmp eq <2 x i32> %trigger, zeroinitializer %res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1>%mask, <2 x i32>%dst)