From 8b08f5232bc027a8e9f669b6ea9fab0270357211 Mon Sep 17 00:00:00 2001
From: Nate Begeman
Date: Fri, 10 Dec 2010 00:26:57 +0000
Subject: [PATCH] Formalize the notion that AVX and SSE are non-overlapping
 extensions from the compiler's point of view. Per email discussion, we either
 want to always use VEX-prefixed instructions or never use them, and are
 taking "HasAVX" to mean "Always use VEX". Passing -mattr=-avx,+sse42 should
 serve to restore legacy SSE support when desirable.

llvm-svn: 121439
---
 llvm/lib/Target/X86/X86.td              | 10 +++----
 llvm/lib/Target/X86/X86CallingConv.td   | 16 +++++------
 llvm/lib/Target/X86/X86ISelLowering.cpp | 38 ++++++++++++-------------
 llvm/lib/Target/X86/X86InstrInfo.td     | 27 +++++++++---------
 llvm/lib/Target/X86/X86InstrSSE.td      | 10 ++++++-
 llvm/lib/Target/X86/X86Subtarget.cpp    |  6 ++--
 llvm/lib/Target/X86/X86Subtarget.h      |  2 ++
 llvm/test/CodeGen/X86/avx-128.ll        |  2 +-
 8 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index fa76619c2818..77d3d56fba2e 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -116,11 +116,11 @@ def : Proc<"nehalem", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
                         FeatureFastUAMem]>;
 // Westmere is a similar machine to nehalem with some additional features.
 // Westmere is the corei3/i5/i7 path from nehalem to sandybridge
-def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
-                        FeatureFastUAMem, FeatureAES]>;
-// Sandy Bridge does not have FMA
-// FIXME: Wikipedia says it does... it should have AES as well.
-def : Proc<"sandybridge", [FeatureSSE42, FeatureAVX, Feature64Bit]>;
+def : Proc<"westmere", [FeatureSSE42, Feature64Bit, FeatureSlowBTMem,
+                        FeatureFastUAMem, FeatureAES, FeatureCLMUL]>;
+// SSE is not listed here since llvm treats AVX as a reimplementation of SSE,
+// rather than a superset.
+def : Proc<"sandybridge", [FeatureAVX, FeatureAES, FeatureCLMUL, Feature64Bit]>;
 def : Proc<"k6", [FeatureMMX]>;
 def : Proc<"k6-2", [FeatureMMX, Feature3DNow]>;
 def : Proc<"k6-3", [FeatureMMX, Feature3DNow]>;
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 56863d9492e2..a44fb694e725 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -61,7 +61,7 @@ def RetCC_X86_32_C : CallingConv<[
   // weirdly; this is really the sse-regparm calling convention) in which
   // case they use XMM0, otherwise it is the same as the common X86 calling
   // conv.
-  CCIfInReg<CCIfSubtarget<"hasSSE2()",
+  CCIfInReg<CCIfSubtarget<"hasXMMInt()",
     CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
   CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
   CCDelegateTo<RetCC_X86Common>
@@ -73,8 +73,8 @@ def RetCC_X86_32_Fast : CallingConv<[
   // SSE2.
   // This can happen when a float, 2 x float, or 3 x float vector is split by
   // target lowering, and is returned in 1-3 sse regs.
-  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
-  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f32], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasXMMInt()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
 
   // For integers, ECX can be used as an extra return register
   CCIfType<[i8], CCAssignToReg<[AL, DL, CL]>>,
@@ -163,12 +163,12 @@ def CC_X86_64_C : CallingConv<[
   // registers on Darwin.
   CCIfType<[x86mmx],
             CCIfSubtarget<"isTargetDarwin()",
-           CCIfSubtarget<"hasSSE2()",
+           CCIfSubtarget<"hasXMMInt()",
            CCPromoteToType<v2i64>>>>,
 
   // The first 8 FP/Vector arguments are passed in XMM registers.
   CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCIfSubtarget<"hasSSE1()",
+           CCIfSubtarget<"hasXMM()",
            CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
 
   // The first 8 256-bit vector arguments are passed in YMM registers.
@@ -245,7 +245,7 @@ def CC_X86_64_GHC : CallingConv<[
 
   // Pass in STG registers: F1, F2, F3, F4, D1, D2
   CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCIfSubtarget<"hasSSE1()",
+           CCIfSubtarget<"hasXMM()",
            CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>
 ]>;
 
@@ -263,7 +263,7 @@ def CC_X86_32_Common : CallingConv<[
   // The first 3 float or double arguments, if marked 'inreg' and if the call
   // is not a vararg call and if SSE2 is available, are passed in SSE registers.
   CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
-                CCIfSubtarget<"hasSSE2()",
+                CCIfSubtarget<"hasXMMInt()",
                 CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
 
   // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
@@ -362,7 +362,7 @@ def CC_X86_32_FastCC : CallingConv<[
   // The first 3 float or double arguments, if the call is not a vararg
   // call and if SSE2 is available, are passed in SSE registers.
   CCIfNotVarArg<CCIfType<[f32,f64],
-                CCIfSubtarget<"hasSSE2()",
+                CCIfSubtarget<"hasXMMInt()",
                 CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
 
   // Doubles get 8-byte slots that are 8-byte aligned.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f38cad7d37c4..045bb933cbb4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -81,8 +81,8 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
 X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   : TargetLowering(TM, createTLOF(TM)) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
-  X86ScalarSSEf64 = Subtarget->hasSSE2();
-  X86ScalarSSEf32 = Subtarget->hasSSE1();
+  X86ScalarSSEf64 = Subtarget->hasXMMInt();
+  X86ScalarSSEf32 = Subtarget->hasXMM();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
 
   RegInfo = TM.getRegisterInfo();
@@ -356,7 +356,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
   }
 
-  if (Subtarget->hasSSE1())
+  if (Subtarget->hasXMM())
     setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
 
   // We may not have a libcall for MEMBARRIER so we should lower this.
@@ -664,7 +664,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   setOperationAction(ISD::BITCAST, MVT::v2i32, Expand);
   setOperationAction(ISD::BITCAST, MVT::v1i64, Expand);
 
-  if (!UseSoftFloat && Subtarget->hasSSE1()) {
+  if (!UseSoftFloat && Subtarget->hasXMM()) {
     addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
 
     setOperationAction(ISD::FADD, MVT::v4f32, Legal);
@@ -681,7 +681,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
   }
 
-  if (!UseSoftFloat && Subtarget->hasSSE2()) {
+  if (!UseSoftFloat && Subtarget->hasXMMInt()) {
     addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
 
     // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
@@ -1043,7 +1043,7 @@ unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
   }
 
   unsigned Align = 4;
-  if (Subtarget->hasSSE1())
+  if (Subtarget->hasXMM())
     getMaxByValAlign(Ty, Align);
   return Align;
 }
@@ -1084,7 +1084,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
     } else if (!MemcpyStrSrc && Size >= 8 &&
                !Subtarget->is64Bit() &&
                Subtarget->getStackAlignment() >= 8 &&
-               Subtarget->hasSSE2()) {
+               Subtarget->hasXMMInt()) {
       // Do not use f64 to lower memcpy if source is string constant. It's
       // better to use i32 to avoid the loads.
       return MVT::f64;
@@ -1272,14 +1272,14 @@ X86TargetLowering::LowerReturn(SDValue Chain,
     // or SSE or MMX vectors.
     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
-        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
+        (Subtarget->is64Bit() && !Subtarget->hasXMM())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
     // Likewise we can't return F64 values with SSE1 only. gcc does so, but
     // llvm-gcc has never done it right and no one has noticed, so this
     // should be OK for now.
     if (ValVT == MVT::f64 &&
-        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
+        (Subtarget->is64Bit() && !Subtarget->hasXMMInt()))
       report_fatal_error("SSE2 register return with SSE2 disabled");
 
     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
@@ -1391,7 +1391,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
-        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasXMM())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
 
@@ -1700,11 +1700,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                                        TotalNumIntRegs);
 
       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
-      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+      assert(!(NumXMMRegs && !Subtarget->hasXMM()) &&
              "SSE register cannot be used when SSE is disabled!");
       assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
              "SSE register cannot be used when SSE is disabled!");
-      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasXMM())
         // Kernel mode asks for SSE to be disabled, so don't push them
         // on the stack.
         TotalNumXMMRegs = 0;
@@ -2055,7 +2055,7 @@ X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
       };
       unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
-      assert((Subtarget->hasSSE1() || !NumXMMRegs)
+      assert((Subtarget->hasXMM() || !NumXMMRegs)
              && "SSE registers cannot be used when SSE is disabled");
 
       Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
@@ -7635,7 +7635,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
     assert(!UseSoftFloat &&
            !(DAG.getMachineFunction()
                .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
-           Subtarget->hasSSE1());
+           Subtarget->hasXMM());
   }
 
   // Insert VAARG_64 node into the DAG
@@ -11689,7 +11689,7 @@ TargetLowering::ConstraintWeight
       break;
     case 'x':
     case 'Y':
-      if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1())
+      if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasXMM())
         weight = CW_Register;
       break;
     case 'I':
@@ -11759,9 +11759,9 @@ LowerXConstraint(EVT ConstraintVT) const {
   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
   // 'f' like normal targets.
   if (ConstraintVT.isFloatingPoint()) {
-    if (Subtarget->hasSSE2())
+    if (Subtarget->hasXMMInt())
       return "Y";
-    if (Subtarget->hasSSE1())
+    if (Subtarget->hasXMM())
       return "x";
   }
 
@@ -11991,10 +11991,10 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       if (!Subtarget->hasMMX()) break;
       return std::make_pair(0U, X86::VR64RegisterClass);
     case 'Y':   // SSE_REGS if SSE2 allowed
-      if (!Subtarget->hasSSE2()) break;
+      if (!Subtarget->hasXMMInt()) break;
       // FALL THROUGH.
     case 'x':   // SSE_REGS if SSE1 allowed
-      if (!Subtarget->hasSSE1()) break;
+      if (!Subtarget->hasXMM()) break;
 
       switch (VT.getSimpleVT().SimpleTy) {
       default: break;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 8b5a8cf44e14..967fb92ea56a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -400,26 +400,26 @@ def tls64addr : ComplexPattern<i64, 5, "SelectTLSADDRAddr",
 def HasCMov      : Predicate<"Subtarget->hasCMov()">;
 def NoCMov       : Predicate<"!Subtarget->hasCMov()">;
 
-// FIXME: temporary hack to let codegen assert or generate poor code in case
-// no AVX version of the desired intructions is present, this is better for
-// incremental dev (without fallbacks it's easier to spot what's missing)
-def HasMMX       : Predicate<"Subtarget->hasMMX() && !Subtarget->hasAVX()">;
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
 def Has3DNow     : Predicate<"Subtarget->has3DNow()">;
 def Has3DNowA    : Predicate<"Subtarget->has3DNowA()">;
-def HasSSE1      : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
-def HasSSE2      : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
-def HasSSE3      : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
-def HasSSSE3     : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
-def HasSSE41     : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
-def HasSSE42     : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
-def HasSSE4A     : Predicate<"Subtarget->hasSSE4A() && !Subtarget->hasAVX()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
+def HasXMMInt    : Predicate<"Subtarget->hasXMMInt()">;
+
+def HasAES       : Predicate<"Subtarget->hasAES()">;
 def HasCLMUL     : Predicate<"Subtarget->hasCLMUL()">;
 def HasFMA3      : Predicate<"Subtarget->hasFMA3()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
-def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
-def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def FPStackf32   : Predicate<"!Subtarget->hasXMM()">;
+def FPStackf64   : Predicate<"!Subtarget->hasXMMInt()">;
 def In32BitMode  : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate;
 def In64BitMode  : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate;
 def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
 def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
@@ -436,7 +436,6 @@ def OptForSize   : Predicate<"OptForSize">;
 def OptForSpeed  : Predicate<"!OptForSize">;
 def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
 def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
-def HasAES       : Predicate<"Subtarget->hasAES()">;
 
 //===----------------------------------------------------------------------===//
 // X86 Instruction Format Definitions.
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index b9542993bf9f..f471947be79f 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -712,6 +712,8 @@ def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
                     "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     []>, XD, Requires<[HasAVX, OptForSize]>, VEX_4V;
 }
+def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>;
+
 def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
                      [(set FR32:$dst, (fround FR64:$src))]>;
@@ -739,6 +741,8 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     []>, XS, VEX_4V, Requires<[HasAVX, OptForSize]>;
 }
+def : Pat<(f64 (fextend FR32:$src)), (VCVTSS2SDrr FR32:$src, FR32:$src)>;
+
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
                    [(set FR64:$dst, (fextend FR32:$src))]>, XS,
@@ -3680,7 +3684,7 @@ let Predicates = [HasSSE2] in
             (CVTSS2SDrm addr:$src)>;
 
 // bit_convert
-let Predicates = [HasSSE2] in {
+let Predicates = [HasXMMInt] in {
   def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
   def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
   def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
@@ -3713,6 +3717,10 @@ let Predicates = [HasSSE2] in {
   def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
 }
 
+let Predicates = [HasAVX] in {
+  def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+}
+
 // Move scalar to XMM zero-extended
 // movd to XMM register zero-extends
 let AddedComplexity = 15 in {
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 5607c133bc7f..c251984dcc08 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -256,13 +256,13 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
   if ((ECX >> 9)  & 1) X86SSELevel = SSSE3;
   if ((ECX >> 19) & 1) X86SSELevel = SSE41;
   if ((ECX >> 20) & 1) X86SSELevel = SSE42;
+  if ((ECX >> 28) & 1) { HasAVX = true; X86SSELevel = NoMMXSSE; }
 
   bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
   bool IsAMD   = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
 
   HasCLMUL = IsIntel && ((ECX >> 1) & 0x1);
   HasFMA3  = IsIntel && ((ECX >> 12) & 0x1);
-  HasAVX   = ((ECX >> 28) & 0x1);
   HasAES   = IsIntel && ((ECX >> 25) & 0x1);
 
   if (IsIntel || IsAMD) {
@@ -316,11 +316,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS,
     ParseSubtargetFeatures(FS, CPU);
     // All X86-64 CPUs also have SSE2, however user might request no SSE via
     // -mattr, so don't force SSELevel here.
+    if (HasAVX)
+      X86SSELevel = NoMMXSSE;
   } else {
     // Otherwise, use CPUID to auto-detect feature set.
     AutoDetectSubtargetFeatures();
     // Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
-    if (Is64Bit && X86SSELevel < SSE2)
+    if (Is64Bit && !HasAVX && X86SSELevel < SSE2)
       X86SSELevel = SSE2;
   }
 
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 58cf3e0c6d28..95c438d678a5 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -155,6 +155,8 @@ public:
   bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
   bool hasPOPCNT() const { return HasPOPCNT; }
   bool hasAVX() const { return HasAVX; }
+  bool hasXMM() const { return hasSSE1() || hasAVX(); }
+  bool hasXMMInt() const { return hasSSE2() || hasAVX(); }
   bool hasAES() const { return HasAES; }
   bool hasCLMUL() const { return HasCLMUL; }
   bool hasFMA3() const { return HasFMA3; }
diff --git a/llvm/test/CodeGen/X86/avx-128.ll b/llvm/test/CodeGen/X86/avx-128.ll
index a72160be719a..2bd3b5dfedd6 100644
--- a/llvm/test/CodeGen/X86/avx-128.ll
+++ b/llvm/test/CodeGen/X86/avx-128.ll
@@ -4,7 +4,7 @@
 
 define void @zero() nounwind ssp {
 entry:
-  ; CHECK: vpxor
+  ; CHECK: vxorps
   ; CHECK: vmovaps
   store <4 x float> zeroinitializer, <4 x float>* @z, align 16
   ret void
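--

The heart of the change is the pair of accessors added to X86Subtarget.h together with the rule that detecting or requesting AVX clears the legacy SSE level. The following standalone C++ sketch mirrors those semantics; the simplified Subtarget struct and SSELevel enum below are illustrative stand-ins, not the real LLVM classes:

// predicate_sketch.cpp -- minimal, hypothetical model of the subtarget
// predicates introduced by this patch; the real definitions live in
// llvm/lib/Target/X86/X86Subtarget.h and X86Subtarget.cpp.
#include <cassert>

enum SSELevel { NoMMXSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42 };

struct Subtarget {
  SSELevel X86SSELevel;
  bool HasAVX;

  Subtarget() : X86SSELevel(NoMMXSSE), HasAVX(false) {}

  // Legacy-SSE predicates: deliberately false on an AVX subtarget, so the
  // non-VEX instruction patterns never match there.
  bool hasSSE1() const { return X86SSELevel >= SSE1; }
  bool hasSSE2() const { return X86SSELevel >= SSE2; }
  bool hasAVX()  const { return HasAVX; }

  // New in this patch: "are XMM registers usable at all?", whichever
  // extension provides them. ABI code (calling conventions, return
  // lowering) asks this question instead of hasSSE1()/hasSSE2().
  bool hasXMM()    const { return hasSSE1() || hasAVX(); }
  bool hasXMMInt() const { return hasSSE2() || hasAVX(); }
};

int main() {
  Subtarget AVXTarget;            // e.g. sandybridge after this patch:
  AVXTarget.HasAVX = true;        // HasAVX set, X86SSELevel left at NoMMXSSE
  assert(!AVXTarget.hasSSE2());   // legacy SSE patterns stay disabled...
  assert(AVXTarget.hasXMMInt());  // ...while XMM-based ABI rules still fire

  Subtarget LegacyTarget;         // e.g. built with -mattr=-avx,+sse42
  LegacyTarget.X86SSELevel = SSE42;
  assert(LegacyTarget.hasXMM());  // XMM usable, but only non-VEX encodings
  assert(!LegacyTarget.hasAVX());
  return 0;
}

Under this model a plain sandybridge build selects only VEX-encoded instructions, while -mattr=-avx,+sse42 puts the target back on the legacy-SSE path, matching the intent stated in the commit message.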