forked from OSchip/llvm-project
Begin to support some vector operations for AVX 256-bit instructions. The long
term goal here is to be able to match enough of vector_shuffle and build_vector so that all AVX intrinsics which aren't mapped to their own built-ins but to shufflevector calls can be codegen'd. This is the first (baby) step: support building zeroed vectors. llvm-svn: 110897
This commit is contained in:
parent
d1e5b439c9
commit
7306c86886
|
@ -883,7 +883,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
|
|||
setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
|
||||
setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
|
||||
setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
|
||||
//setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
|
||||
setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
|
||||
//setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
|
||||
//setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
|
||||
//setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
|
||||
|
@ -3412,18 +3412,27 @@ static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
|
|||
DebugLoc dl) {
|
||||
assert(VT.isVector() && "Expected a vector type");
|
||||
|
||||
// Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
|
||||
// type. This ensures they get CSE'd.
|
||||
// Always build zero vectors as <4 x i32> or <2 x i32> bitcasted
|
||||
// to their dest type. This ensures they get CSE'd.
|
||||
SDValue Vec;
|
||||
if (VT.getSizeInBits() == 64) { // MMX
|
||||
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
|
||||
} else if (HasSSE2) { // SSE2
|
||||
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
|
||||
} else { // SSE1
|
||||
} else if (VT.getSizeInBits() == 128) {
|
||||
if (HasSSE2) { // SSE2
|
||||
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
|
||||
} else { // SSE1
|
||||
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
|
||||
}
|
||||
} else if (VT.getSizeInBits() == 256) { // AVX
|
||||
// 256-bit logic and arithmetic instructions in AVX are
|
||||
// all floating-point, no support for integer ops. Default
|
||||
// to emitting fp zeroed vectors then.
|
||||
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
|
||||
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
|
||||
}
|
||||
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
|
||||
}
|
||||
|
@ -3437,9 +3446,9 @@ static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
|
|||
// type. This ensures they get CSE'd.
|
||||
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
|
||||
SDValue Vec;
|
||||
if (VT.getSizeInBits() == 64) // MMX
|
||||
if (VT.getSizeInBits() == 64) // MMX
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
|
||||
else // SSE
|
||||
else // SSE
|
||||
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
|
||||
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
|
||||
}
|
||||
|
@ -3844,9 +3853,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
|
|||
SDValue
|
||||
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
||||
DebugLoc dl = Op.getDebugLoc();
|
||||
// All zero's are handled with pxor, all one's are handled with pcmpeqd.
|
||||
if (ISD::isBuildVectorAllZeros(Op.getNode())
|
||||
|| ISD::isBuildVectorAllOnes(Op.getNode())) {
|
||||
// All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and
|
||||
// all one's are handled with pcmpeqd. In AVX, zero's are handled with
|
||||
// vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
|
||||
// is present, so AllOnes is ignored.
|
||||
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
|
||||
(Op.getValueType().getSizeInBits() != 256 &&
|
||||
ISD::isBuildVectorAllOnes(Op.getNode()))) {
|
||||
// Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
|
||||
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
|
||||
// eliminated on x86-32 hosts.
|
||||
|
|
|
@ -2186,6 +2186,14 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
|
|||
[(set VR128:$dst, (v4i32 immAllZerosV))]>;
|
||||
}
|
||||
|
||||
// Codegen-only AVX definitions that match an all-zeros 256-bit vector
// (v8f32 for V_SET0PSY, v4f64 for V_SET0PDY) into a VR256 register. They take
// no inputs and carry an empty asm string; the X86MCInstLower hunk of this
// change rewrites them to VXORPSYrr / VXORPDYrr respectively.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
|
||||
isCodeGenOnly = 1, Predicates = [HasAVX] in {
|
||||
def V_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
|
||||
[(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
|
||||
def V_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
|
||||
[(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
|
||||
}
|
||||
|
||||
def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
|
||||
def : Pat<(v8i16 immAllZerosV), (V_SET0PI)>;
|
||||
def : Pat<(v16i8 immAllZerosV), (V_SET0PI)>;
|
||||
|
|
|
@ -374,12 +374,14 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
|
|||
case X86::MMX_V_SET0: LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break;
|
||||
case X86::MMX_V_SETALLONES:
|
||||
LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break;
|
||||
case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
|
||||
case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
|
||||
case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
|
||||
case X86::FsFLD0SS: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::FsFLD0SD: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::V_SET0PS: LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
|
||||
case X86::V_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
|
||||
case X86::V_SET0PD: LowerUnaryToTwoAddr(OutMI, X86::XORPDrr); break;
|
||||
case X86::V_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
|
||||
case X86::V_SET0PI: LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
|
||||
case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
|
||||
|
||||
case X86::MOV16r0:
|
||||
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7 -mattr=avx | FileCheck %s
|
||||
|
||||
@x = common global <8 x float> zeroinitializer, align 32
|
||||
@y = common global <4 x double> zeroinitializer, align 32
|
||||
|
||||
; Stores an all-zeros <8 x float> to @x and an all-zeros <4 x double> to @y,
; both with 32-byte alignment. The FileCheck directives in the body expect one
; vxorps followed by two vmovaps in the generated assembly.
define void @zero() nounwind ssp {
|
||||
entry:
|
||||
; CHECK: vxorps
|
||||
; CHECK: vmovaps
|
||||
; CHECK: vmovaps
|
||||
store <8 x float> zeroinitializer, <8 x float>* @x, align 32
|
||||
store <4 x double> zeroinitializer, <4 x double>* @y, align 32
|
||||
ret void
|
||||
}
|
||||
|
Loading…
Reference in New Issue