forked from OSchip/llvm-project
[X86] Convert f32/f64 FANDN/FAND/FOR/FXOR to vector logic ops and scalar_to_vector/extract_vector_elts to reduce isel patterns.
Previously we did the equivalent operation in isel patterns with COPY_TO_REGCLASS operations to transition. By inserting scalar_to_vectors and extract_vector_elts before isel we can allow each piece to be selected individually and accomplish the same final result. Ideally we'd use vector operations earlier in lowering/combine, but that looks to be more difficult. The scalar-fp-to-i64.ll changes are because we have a pattern for using movlpd for store+extract_vector_elt, while an f64 store uses movsd. The encoding sizes are the same. llvm-svn: 362914
This commit is contained in:
parent
80fee25776
commit
f7ba8b808a
|
@ -841,6 +841,49 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
|
|||
CurDAG->DeleteNode(N);
|
||||
continue;
|
||||
}
|
||||
case X86ISD::FANDN:
|
||||
case X86ISD::FAND:
|
||||
case X86ISD::FOR:
|
||||
case X86ISD::FXOR: {
|
||||
// Widen scalar fp logic ops to vector to reduce isel patterns.
|
||||
// FIXME: Can we do this during lowering/combine.
|
||||
MVT VT = N->getSimpleValueType(0);
|
||||
if (VT.isVector() || VT == MVT::f128)
|
||||
break;
|
||||
|
||||
MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32;
|
||||
SDLoc dl(N);
|
||||
SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
|
||||
N->getOperand(0));
|
||||
SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
|
||||
N->getOperand(1));
|
||||
|
||||
SDValue Res;
|
||||
if (Subtarget->hasSSE2()) {
|
||||
EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
|
||||
Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
|
||||
Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
|
||||
unsigned Opc;
|
||||
switch (N->getOpcode()) {
|
||||
default: llvm_unreachable("Unexpected opcode!");
|
||||
case X86ISD::FANDN: Opc = X86ISD::ANDNP; break;
|
||||
case X86ISD::FAND: Opc = ISD::AND; break;
|
||||
case X86ISD::FOR: Opc = ISD::OR; break;
|
||||
case X86ISD::FXOR: Opc = ISD::XOR; break;
|
||||
}
|
||||
Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
|
||||
Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
|
||||
} else {
|
||||
Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
|
||||
}
|
||||
Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
|
||||
CurDAG->getIntPtrConstant(0, dl));
|
||||
--I;
|
||||
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
|
||||
++I;
|
||||
CurDAG->DeleteNode(N);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (OptLevel != CodeGenOpt::None &&
|
||||
|
|
|
@ -5657,51 +5657,6 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
|
|||
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
|
||||
SchedWriteFLogicSizes, 1>;
|
||||
|
||||
let Predicates = [HasVLX,HasDQI] in {
|
||||
// Use packed logical operations for scalar ops.
|
||||
def : Pat<(f64 (X86fand FR64X:$src1, FR64X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VANDPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
|
||||
FR64X)>;
|
||||
def : Pat<(f64 (X86for FR64X:$src1, FR64X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
|
||||
FR64X)>;
|
||||
def : Pat<(f64 (X86fxor FR64X:$src1, FR64X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VXORPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
|
||||
FR64X)>;
|
||||
def : Pat<(f64 (X86fandn FR64X:$src1, FR64X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VANDNPDZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64X:$src2, VR128X)))),
|
||||
FR64X)>;
|
||||
|
||||
def : Pat<(f32 (X86fand FR32X:$src1, FR32X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VANDPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
|
||||
FR32X)>;
|
||||
def : Pat<(f32 (X86for FR32X:$src1, FR32X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
|
||||
FR32X)>;
|
||||
def : Pat<(f32 (X86fxor FR32X:$src1, FR32X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VXORPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
|
||||
FR32X)>;
|
||||
def : Pat<(f32 (X86fandn FR32X:$src1, FR32X:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VANDNPSZ128rr (v4f32 (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)))),
|
||||
FR32X)>;
|
||||
}
|
||||
|
||||
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
|
||||
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
|
||||
let ExeDomain = _.ExeDomain in {
|
||||
|
|
|
@ -2417,99 +2417,6 @@ let Predicates = [HasAVX1Only] in {
|
|||
(VANDNPSYrm VR256:$src1, addr:$src2)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
|
||||
// Use packed logical operations for scalar ops.
|
||||
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VXORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (VANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
|
||||
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VXORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (VANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE1] in {
|
||||
// Use packed logical operations for scalar ops.
|
||||
def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (ANDPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (ORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (XORPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v4f32 (ANDNPSrr (v4f32 (COPY_TO_REGCLASS FR32:$src1, VR128)),
|
||||
(v4f32 (COPY_TO_REGCLASS FR32:$src2, VR128)))),
|
||||
FR32)>;
|
||||
}
|
||||
|
||||
let Predicates = [UseSSE2] in {
|
||||
// Use packed logical operations for scalar ops.
|
||||
def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (ANDPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (ORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (XORPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
|
||||
(COPY_TO_REGCLASS
|
||||
(v2f64 (ANDNPDrr (v2f64 (COPY_TO_REGCLASS FR64:$src1, VR128)),
|
||||
(v2f64 (COPY_TO_REGCLASS FR64:$src2, VR128)))),
|
||||
FR64)>;
|
||||
}
|
||||
|
||||
let Predicates = [HasAVX, NoVLX] in {
|
||||
def : Pat<(v16i8 (and VR128:$src1, VR128:$src2)),
|
||||
(VPANDrr VR128:$src1, VR128:$src2)>;
|
||||
|
|
|
@ -631,7 +631,7 @@ define i64 @d_to_u64(double %a) nounwind {
|
|||
; SSE3_32_WIN-NEXT: subsd %xmm1, %xmm0
|
||||
; SSE3_32_WIN-NEXT: andnpd %xmm0, %xmm3
|
||||
; SSE3_32_WIN-NEXT: orpd %xmm3, %xmm2
|
||||
; SSE3_32_WIN-NEXT: movsd %xmm2, (%esp)
|
||||
; SSE3_32_WIN-NEXT: movlpd %xmm2, (%esp)
|
||||
; SSE3_32_WIN-NEXT: fldl (%esp)
|
||||
; SSE3_32_WIN-NEXT: fisttpll (%esp)
|
||||
; SSE3_32_WIN-NEXT: setbe %dl
|
||||
|
@ -656,7 +656,7 @@ define i64 @d_to_u64(double %a) nounwind {
|
|||
; SSE3_32_LIN-NEXT: subsd %xmm1, %xmm0
|
||||
; SSE3_32_LIN-NEXT: andnpd %xmm0, %xmm3
|
||||
; SSE3_32_LIN-NEXT: orpd %xmm3, %xmm2
|
||||
; SSE3_32_LIN-NEXT: movsd %xmm2, (%esp)
|
||||
; SSE3_32_LIN-NEXT: movlpd %xmm2, (%esp)
|
||||
; SSE3_32_LIN-NEXT: fldl (%esp)
|
||||
; SSE3_32_LIN-NEXT: fisttpll (%esp)
|
||||
; SSE3_32_LIN-NEXT: setbe %dl
|
||||
|
@ -695,7 +695,7 @@ define i64 @d_to_u64(double %a) nounwind {
|
|||
; SSE2_32_WIN-NEXT: andnpd %xmm2, %xmm4
|
||||
; SSE2_32_WIN-NEXT: andpd %xmm0, %xmm3
|
||||
; SSE2_32_WIN-NEXT: orpd %xmm4, %xmm3
|
||||
; SSE2_32_WIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
|
||||
; SSE2_32_WIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
|
||||
; SSE2_32_WIN-NEXT: fldl {{[0-9]+}}(%esp)
|
||||
; SSE2_32_WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
|
||||
; SSE2_32_WIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
|
||||
|
@ -727,7 +727,7 @@ define i64 @d_to_u64(double %a) nounwind {
|
|||
; SSE2_32_LIN-NEXT: andnpd %xmm2, %xmm4
|
||||
; SSE2_32_LIN-NEXT: andpd %xmm0, %xmm3
|
||||
; SSE2_32_LIN-NEXT: orpd %xmm4, %xmm3
|
||||
; SSE2_32_LIN-NEXT: movsd %xmm3, {{[0-9]+}}(%esp)
|
||||
; SSE2_32_LIN-NEXT: movlpd %xmm3, {{[0-9]+}}(%esp)
|
||||
; SSE2_32_LIN-NEXT: fldl {{[0-9]+}}(%esp)
|
||||
; SSE2_32_LIN-NEXT: fnstcw {{[0-9]+}}(%esp)
|
||||
; SSE2_32_LIN-NEXT: movzwl {{[0-9]+}}(%esp), %eax
|
||||
|
|
|
@ -19,7 +19,7 @@ define float @foo(float %f) #0 {
|
|||
; CHECK: %12:fr32 = VMULSSrr killed %11, killed %10
|
||||
; CHECK: %14:fr32 = FsFLD0SS
|
||||
; CHECK: %15:fr32 = VCMPSSrr %0, killed %14, 0
|
||||
; CHECK: %17:vr128 = VANDNPSrr killed %16, killed %13
|
||||
; CHECK: %17:vr128 = VPANDNrr killed %16, killed %13
|
||||
; CHECK: $xmm0 = COPY %18
|
||||
; CHECK: RET 0, $xmm0
|
||||
%call = tail call float @llvm.sqrt.f32(float %f) #1
|
||||
|
|
Loading…
Reference in New Issue