[X86] Add a DAG combine to replace vector loads feeding a v4i32->v2f64 CVTSI2FP/CVTUI2FP node with a vzload.
But only when the load isn't volatile. This improves load folding during isel, where we only have vzload and scalar_to_vector+load patterns; we can't use full vector load isel patterns because of the same volatile-load issue. Also add some missing masked cvtsi2fp/cvtui2fp patterns with vzload.

llvm-svn: 364728
commit 4ca81a9b99 (parent fc233c9108)
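For context, here is a minimal sketch of the kind of IR this combine targets, modeled on the sitofp_load_4i32_to_2f64_2 test in the diff below. The trailing shufflevector/ret lines are assumed here, since the diff context cuts off after the sitofp: only the low two lanes of the loaded v4i32 feed the v2f64 conversion, so the full 128-bit load can be narrowed to a 64-bit vzload and folded into the conversion.

define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
  %a = load <4 x i32>, <4 x i32>* %x
  %b = sitofp <4 x i32> %a to <4 x double>
  ; assumed: extract the low half (the scraped diff truncates before these lines)
  %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %c
}

With the combine in place, the SSE output in the test diff becomes a single cvtdq2pd (%rdi), %xmm0 instead of a movaps followed by a register-to-register cvtdq2pd.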
@@ -41101,6 +41101,34 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
                              KnownZero, DCI))
     return SDValue(N, 0);
 
+  // Convert a full vector load into vzload when not all bits are needed.
+  SDValue In = N->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
+      ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
+    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
+    LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+    // Unless the load is volatile.
+    if (!LN->isVolatile()) {
+      SDLoc dl(N);
+      unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+      MVT MemVT = MVT::getIntegerVT(NumBits);
+      MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+      SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+      SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+      SDValue VZLoad =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
+                                  LN->getPointerInfo(),
+                                  LN->getAlignment(),
+                                  LN->getMemOperand()->getFlags());
+      SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
+                                    DAG.getBitcast(InVT, VZLoad));
+      DCI.CombineTo(N, Convert);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+      return SDValue(N, 0);
+    }
+  }
+
   return SDValue();
 }
 
@@ -8429,9 +8429,25 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
 let Predicates = [HasVLX] in {
   def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
             (VCVTDQ2PDZ128rm addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            v2f64x_info.ImmAllZerosV)),
+            (VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
 
   def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
             (VCVTUDQ2PDZ128rm addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            VR128X:$src0)),
+            (VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+  def : Pat<(v2f64 (vselect VK2WM:$mask,
+                            (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
+                            v2f64x_info.ImmAllZerosV)),
+            (VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
 }
 
 let Predicates = [HasDQI, HasVLX] in {
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
 define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = sitofp <4 x i32> %a to <4 x double>
@@ -3597,7 +3595,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -3605,13 +3603,12 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3619,8 +3616,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = uitofp <4 x i32> %a to <4 x double>
@@ -3122,14 +3122,12 @@ define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
 define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movaps (%rdi), %xmm0
-; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
+; SSE-NEXT:    cvtdq2pd (%rdi), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vmovaps (%rdi), %xmm0
-; AVX-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; AVX-NEXT:    vcvtdq2pd (%rdi), %xmm0
 ; AVX-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = sitofp <4 x i32> %a to <4 x double>
@@ -3595,7 +3593,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512F-NEXT:    vzeroupper
@@ -3603,13 +3601,12 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
+; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; AVX512DQ-NEXT:    vzeroupper
@@ -3617,8 +3614,7 @@ define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
 ; AVX512VLDQ:       # %bb.0:
-; AVX512VLDQ-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %xmm0
+; AVX512VLDQ-NEXT:    vcvtudq2pd (%rdi), %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %a = load <4 x i32>, <4 x i32>* %x
   %b = uitofp <4 x i32> %a to <4 x double>