From e36c400255fcb3ef3def139a8a8d01f1ce875fb6 Mon Sep 17 00:00:00 2001
From: Dale Johannesen <dalej@apple.com>
Date: Sun, 23 Sep 2007 14:52:20 +0000
Subject: [PATCH] Fix PR 1681.  When X86 target uses +sse -sse2, keep f32 in
 SSE registers and f64 in x87.  This is effectively a new codegen mode.
 Change addLegalFPImmediate to permit float and double variants to do
 different things.  Adjust callers.

llvm-svn: 42246
---
 llvm/include/llvm/Target/TargetLowering.h   |  10 --
 llvm/lib/Target/Alpha/AlphaISelLowering.cpp |   2 +
 llvm/lib/Target/IA64/IA64ISelLowering.cpp   |   2 +
 llvm/lib/Target/X86/X86ISelLowering.cpp     |  99 +++++++++++---
 llvm/lib/Target/X86/X86ISelLowering.h       |   8 +-
 llvm/lib/Target/X86/X86InstrFPStack.td      | 135 ++++++++++----------
 llvm/lib/Target/X86/X86InstrInfo.td         |   3 +-
 llvm/lib/Target/X86/X86InstrSSE.td          |   2 +-
 8 files changed, 164 insertions(+), 97 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 88b382327e6a..8586d7f09162 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -783,17 +783,7 @@ protected:
   /// addLegalFPImmediate - Indicate that this target can instruction select
   /// the specified FP immediate natively.
   void addLegalFPImmediate(const APFloat& Imm) {
-    // Incoming constants are expected to be double.  We also add
-    // the float version.  It is expected that all constants are exactly
-    // representable as floats.
-    assert(&Imm.getSemantics() == &APFloat::IEEEdouble);
-    APFloat Immf = APFloat(Imm);
-    // Rounding mode is not supposed to matter here...
-    if (Immf.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven) !=
-        APFloat::opOK)
-      assert(0);
     LegalFPImmediates.push_back(Imm);
-    LegalFPImmediates.push_back(Immf);
   }
 
   /// setTargetDAGCombine - Targets should invoke this method for each target
diff --git a/llvm/lib/Target/Alpha/AlphaISelLowering.cpp b/llvm/lib/Target/Alpha/AlphaISelLowering.cpp
index a3654af114f6..45a271d0ef8a 100644
--- a/llvm/lib/Target/Alpha/AlphaISelLowering.cpp
+++ b/llvm/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -140,7 +140,9 @@ AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM) : TargetLowering(TM)
   setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
   setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
   addLegalFPImmediate(APFloat(+0.0)); //F31
+  addLegalFPImmediate(APFloat(+0.0f)); //F31
   addLegalFPImmediate(APFloat(-0.0)); //-F31
+  addLegalFPImmediate(APFloat(-0.0f)); //-F31
 
   setJumpBufSize(272);
   setJumpBufAlignment(16);
diff --git a/llvm/lib/Target/IA64/IA64ISelLowering.cpp b/llvm/lib/Target/IA64/IA64ISelLowering.cpp
index f1bd5ba86f42..0bbda9add6ba 100644
--- a/llvm/lib/Target/IA64/IA64ISelLowering.cpp
+++ b/llvm/lib/Target/IA64/IA64ISelLowering.cpp
@@ -120,7 +120,9 @@ IA64TargetLowering::IA64TargetLowering(TargetMachine &TM)
   setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
 
   addLegalFPImmediate(APFloat(+0.0));
+  addLegalFPImmediate(APFloat(+0.0f));
   addLegalFPImmediate(APFloat(+1.0));
+  addLegalFPImmediate(APFloat(+1.0f));
 }
 
 const char *IA64TargetLowering::getTargetNodeName(unsigned Opcode) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e79ee90eae89..e52b6531faf4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40,7 +40,8 @@ using namespace llvm;
 X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   : TargetLowering(TM) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
-  X86ScalarSSE = Subtarget->hasSSE2();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
 
   RegInfo = TM.getRegisterInfo();
@@ -87,7 +88,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
   } else {
-    if (X86ScalarSSE)
+    if (X86ScalarSSEf64)
       // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
     else
@@ -99,7 +100,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
   // SSE has no i16 to fp conversion, only i32
-  if (X86ScalarSSE) {
+  if (X86ScalarSSEf32) {
     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
     // f32 and f64 cases are Legal, f80 case is not
     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
@@ -118,7 +119,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
 
   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
-  if (X86ScalarSSE) {
+  if (X86ScalarSSEf32) {
     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
     // f32 and f64 cases are Legal, f80 case is not
     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
@@ -137,7 +138,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
   } else {
-    if (X86ScalarSSE && !Subtarget->hasSSE3())
+    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
       // Expand FP_TO_UINT into a select.
       // FIXME: We would like to use a Custom expander here eventually to do
       // the optimal thing for SSE vs. the default expansion in the legalizer.
@@ -148,7 +149,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   }
 
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
-  if (!X86ScalarSSE) {
+  if (!X86ScalarSSEf64) {
     setOperationAction(ISD::BIT_CONVERT    , MVT::f32  , Expand);
     setOperationAction(ISD::BIT_CONVERT    , MVT::i32  , Expand);
   }
@@ -271,7 +272,8 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
-  if (X86ScalarSSE) {
+  if (X86ScalarSSEf64) {
+    // f32 and f64 use SSE.
     // Set up the FP register classes.
     addRegisterClass(MVT::f32, X86::FR32RegisterClass);
     addRegisterClass(MVT::f64, X86::FR64RegisterClass);
@@ -300,7 +302,8 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     // cases we handle.
     setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
     setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
-    addLegalFPImmediate(APFloat(+0.0)); // xorps / xorpd
+    addLegalFPImmediate(APFloat(+0.0)); // xorpd
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
 
     // Conversions to long double (in X87) go through memory.
     setConvertAction(MVT::f32, MVT::f80, Expand);
@@ -309,7 +312,55 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     // Conversions from long double (in X87) go through memory.
     setConvertAction(MVT::f80, MVT::f32, Expand);
     setConvertAction(MVT::f80, MVT::f64, Expand);
+  } else if (X86ScalarSSEf32) {
+    // Use SSE for f32, x87 for f64.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+
+    // Use ANDPS to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
+
+    // Use ANDPS and ORPS to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+    setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+    // SSE->x87 conversions go through memory.
+    setConvertAction(MVT::f32, MVT::f64, Expand);
+    setConvertAction(MVT::f32, MVT::f80, Expand);
+
+    // x87->SSE truncations need to go through memory.
+    setConvertAction(MVT::f80, MVT::f32, Expand);
+    setConvertAction(MVT::f64, MVT::f32, Expand);
+    // And x87->x87 truncations also.
+    setConvertAction(MVT::f80, MVT::f64, Expand);
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+    }
+
   } else {
+    // f32 and f64 in x87.
     // Set up the FP register classes.
     addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
     addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
@@ -335,6 +386,10 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
     addLegalFPImmediate(APFloat(+1.0)); // FLD1
     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
   }
 
   // Long double always uses X87.
@@ -583,7 +638,8 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
 
   // If this is an FP return with ScalarSSE, we need to move the value from
   // an XMM register onto the fp-stack.
-  if (X86ScalarSSE) {
+  if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
+      (X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
     SDOperand MemLoc;
 
     // If this is a load into a scalarsse value, don't store the loaded value
@@ -659,7 +715,8 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
 
   // If we are using ScalarSSE, store ST(0) to the stack and reload it into
   // an XMM register.
-  if (X86ScalarSSE) {
+  if ((X86ScalarSSEf32 && RVLocs[0].getValVT() == MVT::f32) ||
+      (X86ScalarSSEf64 && RVLocs[0].getValVT() == MVT::f64)) {
     // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
     // shouldn't be necessary except that RFP cannot be live across
     // multiple blocks. When stackifier is fixed, they can be uncoupled.
@@ -3334,7 +3391,9 @@ SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
                                  StackSlot, NULL, 0);
 
   // These are really Legal; caller falls through into that case.
-  if (SrcVT==MVT::i32 && Op.getValueType() != MVT::f80 && X86ScalarSSE)
+  if (SrcVT==MVT::i32 && Op.getValueType() == MVT::f32 && X86ScalarSSEf32)
+    return Result;
+  if (SrcVT==MVT::i32 && Op.getValueType() == MVT::f64 && X86ScalarSSEf64)
     return Result;
   if (SrcVT==MVT::i64 && Op.getValueType() != MVT::f80 && Subtarget->is64Bit())
     return Result;
@@ -3342,7 +3401,8 @@ SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
 
   // Build the FILD
   SDVTList Tys;
-  bool useSSE = X86ScalarSSE && Op.getValueType() != MVT::f80;
+  bool useSSE = (X86ScalarSSEf32 && Op.getValueType() == MVT::f32) ||
+                (X86ScalarSSEf64 && Op.getValueType() == MVT::f64);
   if (useSSE)
     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
   else
@@ -3390,8 +3450,11 @@ SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
   SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
 
   // These are really Legal.
-  if (Op.getValueType() == MVT::i32 && X86ScalarSSE &&
-      Op.getOperand(0).getValueType() != MVT::f80)
+  if (Op.getValueType() == MVT::i32 &&
+      X86ScalarSSEf32 && Op.getOperand(0).getValueType() == MVT::f32)
+    return Result;
+  if (Op.getValueType() == MVT::i32 &&
+      X86ScalarSSEf64 && Op.getOperand(0).getValueType() == MVT::f64)
     return Result;
   if (Subtarget->is64Bit() &&
       Op.getValueType() == MVT::i64 &&
@@ -3408,7 +3471,8 @@ SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
 
   SDOperand Chain = DAG.getEntryNode();
   SDOperand Value = Op.getOperand(0);
-  if (X86ScalarSSE && Op.getOperand(0).getValueType() != MVT::f80) {
+  if ((X86ScalarSSEf32 && Op.getOperand(0).getValueType() == MVT::f32) ||
+      (X86ScalarSSEf64 && Op.getOperand(0).getValueType() == MVT::f64)) {
     assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
     Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0);
     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
@@ -3620,8 +3684,9 @@ SDOperand X86TargetLowering::LowerSELECT(SDOperand Op, SelectionDAG &DAG) {
     // pressure reason)?
     SDOperand Cmp = Cond.getOperand(1);
     unsigned Opc = Cmp.getOpcode();
-    bool IllegalFPCMov = !X86ScalarSSE &&
-      MVT::isFloatingPoint(Op.getValueType()) &&
+    bool IllegalFPCMov =
+      ! ((X86ScalarSSEf32 && Op.getValueType()==MVT::f32) ||
+         (X86ScalarSSEf64 && Op.getValueType()==MVT::f64)) &&
       !hasFPCMov(cast<ConstantSDNode>(CC)->getSignExtended());
     if ((Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) &&
         !IllegalFPCMov) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 95f95e2c8375..658b34449d6a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -373,8 +373,12 @@ namespace llvm {
     /// X86StackPtr - X86 physical register used as stack ptr.
     unsigned X86StackPtr;
 
-    /// X86ScalarSSE - Select between SSE2 or x87 floating point ops.
-    bool X86ScalarSSE;
+    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
+    /// floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
 
     SDNode *LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode*TheCall,
                             unsigned CallingConv, SelectionDAG &DAG);
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index c4a4e4c8dad0..1e8cf6a5b4d0 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -152,30 +152,33 @@ def FpSETRESULT80 : FpI_<(outs), (ins RFP80:$src), SpecialFP,
                          [(X86fpset RFP80:$src)]>;// ST(0) = FPR
 }
 
-// FpI - Floating Point Psuedo Instruction template. Predicated on FPStack.
-// Note that f80-only instructions are used even in SSE mode and use FpI_
-// not this predicate.
-class FpI<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
-  FpI_<outs, ins, fp, pattern>, Requires<[FPStack]>;
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
 
 // Register copies.  Just copies, the shortening ones do not truncate.
-def MOV_Fp3232  : FpI<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
-def MOV_Fp3264  : FpI<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
-def MOV_Fp6432  : FpI<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
-def MOV_Fp6464  : FpI<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
-def MOV_Fp8032  : FpI<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
-def MOV_Fp3280  : FpI<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
-def MOV_Fp8064  : FpI<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
-def MOV_Fp6480  : FpI<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
+def MOV_Fp3232  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>;
+def MOV_Fp3264  : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>;
+def MOV_Fp6432  : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>;
+def MOV_Fp6464  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>;
+def MOV_Fp8032  : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>;
+def MOV_Fp3280  : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>;
+def MOV_Fp8064  : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>;
+def MOV_Fp6480  : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>;
 def MOV_Fp8080  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>;
 
 // Factoring for arithmetic.
 multiclass FPBinary_rr<SDNode OpNode> {
 // Register op register -> register
 // These are separated out because they have no reversed form.
-def _Fp32 : FpI<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
                 [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
-def _Fp64 : FpI<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
                 [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
 def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
                 [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
@@ -185,13 +188,13 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
 // These instructions cannot address 80-bit memory.
 multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
 // ST(0) = ST(0) + [mem]
-def _Fp32m : FpI<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+def _Fp32m : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
                 [(set RFP32:$dst, (OpNode RFP32:$src1,
                                    (loadf32 addr:$src2)))]>;
-def _Fp64m : FpI<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+def _Fp64m : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
                 [(set RFP64:$dst, (OpNode RFP64:$src1,
                                    (loadf64 addr:$src2)))]>;
-def _Fp64m32: FpI<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+def _Fp64m32: FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
                 [(set RFP64:$dst, (OpNode RFP64:$src1,
                                    (f64 (extloadf32 addr:$src2))))]>;
 def _Fp80m32: FpI_<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
@@ -205,16 +208,16 @@ def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src),
 def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src),
                  !strconcat("f", !strconcat(asmstring, "{l}\t$src"))>;
 // ST(0) = ST(0) + [memint]
-def _FpI16m32 : FpI<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW,
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), OneArgFPRW,
                 [(set RFP32:$dst, (OpNode RFP32:$src1,
                                    (X86fild addr:$src2, i16)))]>;
-def _FpI32m32 : FpI<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW,
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), OneArgFPRW,
                 [(set RFP32:$dst, (OpNode RFP32:$src1,
                                    (X86fild addr:$src2, i32)))]>;
-def _FpI16m64 : FpI<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW,
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), OneArgFPRW,
                 [(set RFP64:$dst, (OpNode RFP64:$src1,
                                    (X86fild addr:$src2, i16)))]>;
-def _FpI32m64 : FpI<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW,
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), OneArgFPRW,
                 [(set RFP64:$dst, (OpNode RFP64:$src1,
                                    (X86fild addr:$src2, i32)))]>;
 def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), OneArgFPRW,
@@ -271,9 +274,9 @@ def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
 
 // Unary operations.
 multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
-def _Fp32  : FpI<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
                  [(set RFP32:$dst, (OpNode RFP32:$src))]>;
-def _Fp64  : FpI<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
                  [(set RFP64:$dst, (OpNode RFP64:$src))]>;
 def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
                  [(set RFP80:$dst, (OpNode RFP80:$src))]>;
@@ -286,9 +289,9 @@ defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
 defm SIN : FPUnary<fsin, 0xFE, "fsin">;
 defm COS : FPUnary<fcos, 0xFF, "fcos">;
 
-def TST_Fp32  : FpI<(outs), (ins RFP32:$src), OneArgFP,
+def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP,
                 []>;
-def TST_Fp64  : FpI<(outs), (ins RFP64:$src), OneArgFP,
+def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP,
                 []>;
 def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP,
                 []>;
@@ -296,10 +299,10 @@ def TST_F : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
 
 // Floating point cmovs.
 multiclass FPCMov<CondCode cc> {
-  def _Fp32  : FpI<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), CondMovFP,
+  def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), CondMovFP,
                    [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, cc))]>;
-  def _Fp64  : FpI<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), CondMovFP,
+  def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), CondMovFP,
                    [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, cc))]>;
   def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), CondMovFP,
@@ -337,30 +340,30 @@ def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
 
 // Floating point loads & stores.
 let isLoad = 1 in {
-def LD_Fp32m   : FpI<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+def LD_Fp32m   : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
                   [(set RFP32:$dst, (loadf32 addr:$src))]>;
-def LD_Fp64m   : FpI<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+def LD_Fp64m   : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
                   [(set RFP64:$dst, (loadf64 addr:$src))]>;
 def LD_Fp80m   : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
                   [(set RFP80:$dst, (loadf80 addr:$src))]>;
 }
-def LD_Fp32m64 : FpI<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
                   [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
 def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
                   [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
 def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
                   [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
-def ILD_Fp16m32: FpI<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
                   [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
-def ILD_Fp32m32: FpI<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
                   [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
-def ILD_Fp64m32: FpI<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
                   [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
-def ILD_Fp16m64: FpI<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
                   [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
-def ILD_Fp32m64: FpI<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
                   [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
-def ILD_Fp64m64: FpI<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
                   [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
 def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
                   [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
@@ -369,11 +372,11 @@ def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
 def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
                   [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
 
-def ST_Fp32m   : FpI<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+def ST_Fp32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
                   [(store RFP32:$src, addr:$op)]>;
-def ST_Fp64m32 : FpI<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
                   [(truncstoref32 RFP64:$src, addr:$op)]>;
-def ST_Fp64m   : FpI<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+def ST_Fp64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
                   [(store RFP64:$src, addr:$op)]>;
 def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
                   [(truncstoref32 RFP80:$src, addr:$op)]>;
@@ -381,19 +384,19 @@ def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
                   [(truncstoref64 RFP80:$src, addr:$op)]>;
 // FST does not support 80-bit memory target; FSTP must be used.
-def ST_FpP32m   : FpI<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
-def ST_FpP64m32 : FpI<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
-def ST_FpP64m   : FpI<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
-def ST_FpP80m32 : FpI<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
-def ST_FpP80m64 : FpI<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
 def ST_FpP80m   : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
                   [(store RFP80:$src, addr:$op)]>;
-def IST_Fp16m32 : FpI<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp32m32 : FpI<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp64m32 : FpI<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp16m64 : FpI<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp32m64 : FpI<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp64m64 : FpI<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
 def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
 def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
 def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
@@ -456,13 +459,13 @@ def XCH_F : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
 
 // Floating point constant loads.
 let isReMaterializable = 1 in {
-def LD_Fp032 : FpI<(outs RFP32:$dst), (ins), ZeroArgFP,
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
                 [(set RFP32:$dst, fpimm0)]>;
-def LD_Fp132 : FpI<(outs RFP32:$dst), (ins), ZeroArgFP,
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
                 [(set RFP32:$dst, fpimm1)]>;
-def LD_Fp064 : FpI<(outs RFP64:$dst), (ins), ZeroArgFP,
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
                 [(set RFP64:$dst, fpimm0)]>;
-def LD_Fp164 : FpI<(outs RFP64:$dst), (ins), ZeroArgFP,
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
                 [(set RFP64:$dst, fpimm1)]>;
 def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
                 [(set RFP80:$dst, fpimm0)]>;
@@ -475,13 +478,13 @@ def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
 
 // Floating point compares.
-def UCOM_Fpr32 : FpI<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
                 []>;  // FPSW = cmp ST(0) with ST(i)
-def UCOM_FpIr32: FpI<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
                 [(X86cmp RFP32:$lhs, RFP32:$rhs)]>; // CC = ST(0) cmp ST(i)
-def UCOM_Fpr64 : FpI<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
                 []>;  // FPSW = cmp ST(0) with ST(i)
-def UCOM_FpIr64: FpI<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
                 [(X86cmp RFP64:$lhs, RFP64:$rhs)]>; // CC = ST(0) cmp ST(i)
 def UCOM_Fpr80 : FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
                 []>;  // FPSW = cmp ST(0) with ST(i)
@@ -535,16 +538,16 @@ def : Pat<(X86fst RFP80:$src, addr:$op, f64),
           (ST_Fp80m64 addr:$op, RFP80:$src)>;
 def : Pat<(X86fst RFP80:$src, addr:$op, f80),
           (ST_FpP80m addr:$op, RFP80:$src)>;
 
 // Floating point constant -0.0 and -1.0
-def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStack]>;
-def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStack]>;
-def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStack]>;
-def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStack]>;
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
 def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
 def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
 
 // Used to conv. i64 to f64 since there isn't a SSE version.
 def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
 
-def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStack]>;
-def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>, Requires<[FPStack]>;
-def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>, Requires<[FPStack]>;
+def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>, Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>, Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>, Requires<[FPStackf64]>;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index 4f6e372560ec..dde59fc04be4 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -160,7 +160,8 @@ def HasSSE1     : Predicate<"Subtarget->hasSSE1()">;
 def HasSSE2     : Predicate<"Subtarget->hasSSE2()">;
 def HasSSE3     : Predicate<"Subtarget->hasSSE3()">;
 def HasSSSE3    : Predicate<"Subtarget->hasSSSE3()">;
-def FPStack     : Predicate<"!Subtarget->hasSSE2()">;
+def FPStackf32  : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64  : Predicate<"!Subtarget->hasSSE2()">;
 def In32BitMode : Predicate<"!Subtarget->is64Bit()">;
 def In64BitMode : Predicate<"Subtarget->is64Bit()">;
 def HasLow4G    : Predicate<"Subtarget->hasLow4GUserSpaceAddress()">;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7fb7d2efb25c..19ec6adf17df 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -50,7 +50,7 @@ def IMPLICIT_DEF_VR128 : I<0, Pseudo, (outs VR128:$dst), (ins),
                            Requires<[HasSSE1]>;
 def IMPLICIT_DEF_FR32  : I<0, Pseudo, (outs FR32:$dst), (ins),
                            "#IMPLICIT_DEF $dst",
-                           [(set FR32:$dst, (undef))]>, Requires<[HasSSE2]>;
+                           [(set FR32:$dst, (undef))]>, Requires<[HasSSE1]>;
 def IMPLICIT_DEF_FR64  : I<0, Pseudo, (outs FR64:$dst), (ins),
                            "#IMPLICIT_DEF $dst",
                            [(set FR64:$dst, (undef))]>, Requires<[HasSSE2]>;
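Taken together, the X86ISelLowering.cpp changes reduce the old single X86ScalarSSE flag to two per-type flags, which yields three codegen modes. The following standalone C++ sketch models that selection; it is illustrative only, not code from this patch, and the enum and function names are invented:

    // Standalone model of the register-file selection introduced here.
    #include <cstdio>

    enum class FPClass { SSE, X87 };

    struct SubtargetFeatures {
      bool HasSSE1;
      bool HasSSE2; // on real hardware SSE2 implies SSE1
    };

    // Mirrors X86ScalarSSEf32/X86ScalarSSEf64: f32 can live in XMM
    // registers with SSE1, f64 needs SSE2; otherwise the value stays on
    // the x87 stack (f80 always does, regardless of these flags).
    FPClass classifyF32(const SubtargetFeatures &F) {
      return F.HasSSE1 ? FPClass::SSE : FPClass::X87;
    }
    FPClass classifyF64(const SubtargetFeatures &F) {
      return F.HasSSE2 ? FPClass::SSE : FPClass::X87;
    }

    int main() {
      const SubtargetFeatures Modes[] = {
        {false, false}, // -sse -sse2: everything on x87
        {true,  false}, // +sse -sse2: the new mixed mode from PR 1681
        {true,  true},  // +sse +sse2: f32 and f64 both in XMM registers
      };
      for (const auto &M : Modes)
        std::printf("sse1=%d sse2=%d -> f32 in %s, f64 in %s\n",
                    M.HasSSE1, M.HasSSE2,
                    classifyF32(M) == FPClass::SSE ? "XMM" : "x87",
                    classifyF64(M) == FPClass::SSE ? "XMM" : "x87");
    }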
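The TargetLowering.h hunk drops the assumption that every legal immediate arrives as an IEEE double from which a float twin can be derived by rounding; each width is now registered explicitly, which is what lets the mixed mode legalize +0.0f (xorps) without also claiming the f64 constant is legal. A toy model of the new contract, with invented types rather than the real APFloat API:

    // Toy model of the addLegalFPImmediate change: one call records
    // exactly one immediate, so a target wanting both widths calls twice.
    #include <cstdio>
    #include <vector>

    struct FPImm {
      bool IsSingle;   // stand-in for APFloat's float/double semantics
      double Value;
    };

    static std::vector<FPImm> LegalFPImmediates;

    void addLegalFPImmediate(FPImm Imm) {
      LegalFPImmediates.push_back(Imm);
    }

    int main() {
      // Mirrors the pure-x87 block in X86ISelLowering.cpp: +/-0.0 and
      // +/-1.0 are registered in both f64 and f32 forms.
      for (double V : {+0.0, +1.0, -0.0, -1.0}) {
        addLegalFPImmediate({false, V}); // f64 flavor
        addLegalFPImmediate({true, V});  // f32 flavor
      }
      std::printf("%zu legal immediates registered\n",
                  LegalFPImmediates.size());
    }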
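On the TableGen side, the same split appears as the FPStackf32/FPStackf64 predicates: an x87 pseudo for a type is available exactly when SSE cannot handle that type. A C++ rendering of that availability rule follows; this is a simplification for reading the .td hunks, since the real mechanism is TableGen's Requires<[...]> machinery, not runtime checks:

    // Sketch of the predicate split from X86InstrInfo.td; the function
    // names mirror the .td predicates but nothing here is TableGen output.
    #include <cstdio>

    struct Subtarget { bool SSE1, SSE2; };

    // FPStackf32 = !hasSSE1(): x87 f32 pseudos exist only without SSE1.
    // FPStackf64 = !hasSSE2(): x87 f64 pseudos exist only without SSE2.
    bool FPStackf32(const Subtarget &S) { return !S.SSE1; }
    bool FPStackf64(const Subtarget &S) { return !S.SSE2; }

    int main() {
      Subtarget S = {true, false}; // the +sse -sse2 configuration
      std::printf("LD_Fp32m (x87 f32 load) selectable: %s\n",
                  FPStackf32(S) ? "yes" : "no"); // no: f32 is in XMM regs
      std::printf("LD_Fp64m (x87 f64 load) selectable: %s\n",
                  FPStackf64(S) ? "yes" : "no"); // yes: f64 stays on x87
    }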