From 75ff0534973ae5768ef955b56434a007dfc58abf Mon Sep 17 00:00:00 2001
From: Chris Lattner <sabre@nondot.org>
Date: Wed, 25 Aug 2010 22:49:25 +0000
Subject: [PATCH] Change handling of illegal vector types to widen when
 possible instead of expanding: e.g. <2 x float> -> <4 x float> instead of ->
 2 floats.  This affects two places in the code: handling cross block values
 and handling function return and arguments.  Since vectors are already
 widened by legalizetypes, this gives us much better code and unblocks x86-64
 abi and SPU abi work.

For example, this (which is a silly example of a cross-block value):
define <4 x float> @test2(<4 x float> %A) nounwind {
 %B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
 %C = fadd <2 x float> %B, %B
  br label %BB
BB:
 %D = fadd <2 x float> %C, %C
 %E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ret <4 x float> %E
}

Now compiles into:

_test2:                                 ## @test2
## BB#0:
 addps %xmm0, %xmm0
 addps %xmm0, %xmm0
 ret

previously it compiled into:

_test2:                                 ## @test2
## BB#0:
 addps %xmm0, %xmm0
 pshufd $1, %xmm0, %xmm1
                                        ## kill: XMM0<def> XMM0<kill> XMM0<def>
 insertps $0, %xmm0, %xmm0
 insertps $16, %xmm1, %xmm0
 addps %xmm0, %xmm0
 ret

This implements rdar://8230384

llvm-svn: 112101
---
 llvm/include/llvm/Target/TargetLowering.h     | 59 ++++++++++---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 61 +++++++++++---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 82 +++++++++++--------
 llvm/test/CodeGen/X86/v2f32.ll                | 57 ++++++++++---
 llvm/test/CodeGen/X86/widen_shuffle-1.ll      |  8 +-
 5 files changed, 193 insertions(+), 74 deletions(-)
diff --git a/llvm/include/llvm/Target/TargetLowering.h b/llvm/include/llvm/Target/TargetLowering.h
index 197b8d1eb47e..0dc9d4e0a8b7 100644
--- a/llvm/include/llvm/Target/TargetLowering.h
+++ b/llvm/include/llvm/Target/TargetLowering.h
@@ -214,24 +214,59 @@ public:
     /// ValueTypeActions - For each value type, keep a LegalizeAction enum
     /// that indicates how instruction selection should deal with the type.
     uint8_t ValueTypeActions[MVT::LAST_VALUETYPE];
+    
+    LegalizeAction getExtendedTypeAction(EVT VT) const {
+      // Handle non-vector integers.
+      if (!VT.isVector()) {
+        assert(VT.isInteger() && "Unsupported extended type!");
+        unsigned BitSize = VT.getSizeInBits();
+        // First promote to a power-of-two size, then expand if necessary.
+        if (BitSize < 8 || !isPowerOf2_32(BitSize))
+          return Promote;
+        return Expand;
+      }
+      
+      // If this is a type smaller than a legal vector type, promote to that
+      // type, e.g. <2 x float> -> <4 x float>.
+      if (VT.getVectorElementType().isSimple() &&
+          VT.getVectorNumElements() != 1) {
+        MVT EltType = VT.getVectorElementType().getSimpleVT();
+        unsigned NumElts = VT.getVectorNumElements();
+        while (1) {
+          // Round up to the nearest power of 2.
+          NumElts = (unsigned)NextPowerOf2(NumElts);
+          
+          MVT LargerVector = MVT::getVectorVT(EltType, NumElts);
+          if (LargerVector == MVT()) break;
+          
+          // If this the larger type is legal, promote to it.
+          if (getTypeAction(LargerVector) == Legal) return Promote;
+        }
+      }
+      
+      return VT.isPow2VectorType() ? Expand : Promote;
+    }      
   public:
     ValueTypeActionImpl() {
       std::fill(ValueTypeActions, array_endof(ValueTypeActions), 0);
     }
+    
+    /// FIXME: This Context argument is now dead, zap it.
     LegalizeAction getTypeAction(LLVMContext &Context, EVT VT) const {
-      if (VT.isExtended()) {
-        if (VT.isVector()) {
-          return VT.isPow2VectorType() ? Expand : Promote;
-        }
-        if (VT.isInteger())
-          // First promote to a power-of-two size, then expand if necessary.
-          return VT == VT.getRoundIntegerType(Context) ? Expand : Promote;
-        assert(0 && "Unsupported extended type!");
-        return Legal;
-      }
-      unsigned I = VT.getSimpleVT().SimpleTy;
-      return (LegalizeAction)ValueTypeActions[I];
+      return getTypeAction(VT);
     }
+    
+    LegalizeAction getTypeAction(EVT VT) const {
+      if (!VT.isExtended())
+        return getTypeAction(VT.getSimpleVT());
+      return getExtendedTypeAction(VT);
+    }
+    
+    LegalizeAction getTypeAction(MVT VT) const {
+      return (LegalizeAction)ValueTypeActions[VT.SimpleTy];
+    }
+    
+    
     void setTypeAction(EVT VT, LegalizeAction Action) {
       unsigned I = VT.getSimpleVT().SimpleTy;
       ValueTypeActions[I] = Action;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index a8a8ecb4c131..35e752021372 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -252,8 +252,21 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
   if (PartVT == ValueVT)
     return Val;
   
-  if (PartVT.isVector())  // Vector/Vector bitcast.
+  if (PartVT.isVector()) {
+    // If the element type of the source/dest vectors are the same, but the
+    // parts vector has more elements than the value vector, then we have a
+    // vector widening case (e.g. <2 x float> -> <4 x float>).  Extract the
+    // elements we want.
+    if (PartVT.getVectorElementType() == ValueVT.getVectorElementType()) {
+      assert(PartVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
+             "Cannot narrow, it would be a lossy transformation");
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+                         DAG.getIntPtrConstant(0));
+    }                                      
+    
+    // Vector/Vector bitcast.
     return DAG.getNode(ISD::BIT_CONVERT, DL, ValueVT, Val);
+  }
   
   assert(ValueVT.getVectorElementType() == PartVT &&
          ValueVT.getVectorNumElements() == 1 &&
@@ -392,16 +405,39 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   
   if (NumParts == 1) {
-    if (PartVT != ValueVT) {
-      if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
-        Val = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Val);
-      } else {
-        assert(ValueVT.getVectorElementType() == PartVT &&
-               ValueVT.getVectorNumElements() == 1 &&
-               "Only trivial vector-to-scalar conversions should get here!");
-        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                          PartVT, Val, DAG.getIntPtrConstant(0));
-      }
+    if (PartVT == ValueVT) {
+      // Nothing to do.
+    } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
+      // Bitconvert vector->vector case.
+      Val = DAG.getNode(ISD::BIT_CONVERT, DL, PartVT, Val);
+    } else if (PartVT.isVector() &&
+               PartVT.getVectorElementType() == ValueVT.getVectorElementType()&&
+               PartVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
+      EVT ElementVT = PartVT.getVectorElementType();
+      // Vector widening case, e.g. <2 x float> -> <4 x float>.  Shuffle in
+      // undef elements.
+      SmallVector<SDValue, 16> Ops;
+      for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                                  ElementVT, Val, DAG.getIntPtrConstant(i)));
+      
+      for (unsigned i = ValueVT.getVectorNumElements(),
+           e = PartVT.getVectorNumElements(); i != e; ++i)
+        Ops.push_back(DAG.getUNDEF(ElementVT));
+
+      Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, &Ops[0], Ops.size());
+
+      // FIXME: Use CONCAT for 2x -> 4x.
+      
+      //SDValue UndefElts = DAG.getUNDEF(VectorTy);
+      //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
+    } else {
+      // Vector -> scalar conversion.
+      assert(ValueVT.getVectorElementType() == PartVT &&
+             ValueVT.getVectorNumElements() == 1 &&
+             "Only trivial vector-to-scalar conversions should get here!");
+      Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                        PartVT, Val, DAG.getIntPtrConstant(0));
     }
     
     Parts[0] = Val;
@@ -428,8 +464,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
                    DAG.getIntPtrConstant(i * (NumElements / NumIntermediates)));
     else
       Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                           IntermediateVT, Val,
-                           DAG.getIntPtrConstant(i));
+                           IntermediateVT, Val, DAG.getIntPtrConstant(i));
   }
   
   // Split the intermediate operands into legal parts.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9772d55449e9..077fd1dadd11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -697,6 +697,7 @@ TargetLowering::findRepresentativeClass(EVT VT) const {
   return std::make_pair(BestRC, 1);
 }
 
+
 /// computeRegisterProperties - Once all of the register classes are added,
 /// this allows us to compute derived properties we expose.
 void TargetLowering::computeRegisterProperties() {
@@ -782,6 +783,28 @@ void TargetLowering::computeRegisterProperties() {
     MVT VT = (MVT::SimpleValueType)i;
     if (isTypeLegal(VT)) continue;
     
+    // Determine if there is a legal wider type.  If so, we should promote to
+    // that wider vector type.
+    EVT EltVT = VT.getVectorElementType();
+    unsigned NElts = VT.getVectorNumElements();
+    if (NElts != 1) {
+      bool IsLegalWiderType = false;
+      for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+        EVT SVT = (MVT::SimpleValueType)nVT;
+        if (SVT.getVectorElementType() == EltVT &&
+            SVT.getVectorNumElements() > NElts && 
+            isTypeSynthesizable(SVT)) {
+          TransformToType[i] = SVT;
+          RegisterTypeForVT[i] = SVT;
+          NumRegistersForVT[i] = 1;
+          ValueTypeActions.setTypeAction(VT, Promote);
+          IsLegalWiderType = true;
+          break;
+        }
+      }
+      if (IsLegalWiderType) continue;
+    }
+    
     MVT IntermediateVT;
     EVT RegisterVT;
     unsigned NumIntermediates;
@@ -790,30 +813,14 @@ void TargetLowering::computeRegisterProperties() {
                                 RegisterVT, this);
     RegisterTypeForVT[i] = RegisterVT;
     
-    // Determine if there is a legal wider type.
-    bool IsLegalWiderType = false;
-    EVT EltVT = VT.getVectorElementType();
-    unsigned NElts = VT.getVectorNumElements();
-    for (unsigned nVT = i+1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
-      EVT SVT = (MVT::SimpleValueType)nVT;
-      if (isTypeSynthesizable(SVT) && SVT.getVectorElementType() == EltVT &&
-          SVT.getVectorNumElements() > NElts && NElts != 1) {
-        TransformToType[i] = SVT;
-        ValueTypeActions.setTypeAction(VT, Promote);
-        IsLegalWiderType = true;
-        break;
-      }
-    }
-    if (!IsLegalWiderType) {
-      EVT NVT = VT.getPow2VectorType();
-      if (NVT == VT) {
-        // Type is already a power of 2.  The default action is to split.
-        TransformToType[i] = MVT::Other;
-        ValueTypeActions.setTypeAction(VT, Expand);
-      } else {
-        TransformToType[i] = NVT;
-        ValueTypeActions.setTypeAction(VT, Promote);
-      }
+    EVT NVT = VT.getPow2VectorType();
+    if (NVT == VT) {
+      // Type is already a power of 2.  The default action is to split.
+      TransformToType[i] = MVT::Other;
+      ValueTypeActions.setTypeAction(VT, Expand);
+    } else {
+      TransformToType[i] = NVT;
+      ValueTypeActions.setTypeAction(VT, Promote);
     }
   }
 
@@ -857,8 +864,21 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
                                                 EVT &IntermediateVT,
                                                 unsigned &NumIntermediates,
                                                 EVT &RegisterVT) const {
-  // Figure out the right, legal destination reg to copy into.
   unsigned NumElts = VT.getVectorNumElements();
+  
+  // If there is a wider vector type with the same element type as this one,
+  // we should widen to that legal vector type.  This handles things like
+  // <2 x float> -> <4 x float>.
+  if (NumElts != 1 && getTypeAction(Context, VT) == Promote) {
+    RegisterVT = getTypeToTransformTo(Context, VT);
+    if (isTypeLegal(RegisterVT)) {
+      IntermediateVT = RegisterVT;
+      NumIntermediates = 1;
+      return 1;
+    }
+  }
+  
+  // Figure out the right, legal destination reg to copy into.
   EVT EltTy = VT.getVectorElementType();
   
   unsigned NumVectorRegs = 1;
@@ -887,16 +907,12 @@ unsigned TargetLowering::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
 
   EVT DestVT = getRegisterType(Context, NewVT);
   RegisterVT = DestVT;
-  if (DestVT.bitsLT(NewVT)) {
-    // Value is expanded, e.g. i64 -> i16.
+  if (DestVT.bitsLT(NewVT))   // Value is expanded, e.g. i64 -> i16.
     return NumVectorRegs*(NewVT.getSizeInBits()/DestVT.getSizeInBits());
-  } else {
-    // Otherwise, promotion or legal types use the same number of registers as
-    // the vector decimated to the appropriate level.
-    return NumVectorRegs;
-  }
   
-  return 1;
+  // Otherwise, promotion or legal types use the same number of registers as
+  // the vector decimated to the appropriate level.
+  return NumVectorRegs;
 }
 
 /// Get the EVTs and ArgFlags collections that represent the legalized return 
diff --git a/llvm/test/CodeGen/X86/v2f32.ll b/llvm/test/CodeGen/X86/v2f32.ll
index 9c4b773a6190..76c3fdfc060c 100644
--- a/llvm/test/CodeGen/X86/v2f32.ll
+++ b/llvm/test/CodeGen/X86/v2f32.ll
@@ -10,15 +10,16 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
   store float %c, float* %P2
   ret void
 ; X64: test1:
-; X64-NEXT: addss	%xmm1, %xmm0
-; X64-NEXT: movss	%xmm0, (%rdi)
+; X64-NEXT: pshufd	$1, %xmm0, %xmm1
+; X64-NEXT: addss	%xmm0, %xmm1
+; X64-NEXT: movss	%xmm1, (%rdi)
 ; X64-NEXT: ret
 
 ; X32: test1:
-; X32-NEXT: movss	4(%esp), %xmm0
-; X32-NEXT: addss	8(%esp), %xmm0
-; X32-NEXT: movl	12(%esp), %eax
-; X32-NEXT: movss	%xmm0, (%eax)
+; X32-NEXT: pshufd	$1, %xmm0, %xmm1
+; X32-NEXT: addss	%xmm0, %xmm1
+; X32-NEXT: movl	4(%esp), %eax
+; X32-NEXT: movss	%xmm1, (%eax)
 ; X32-NEXT: ret
 }
 
@@ -28,12 +29,42 @@ define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, <2 x float> *%P) nounw
   ret <2 x float> %Z
   
 ; X64: test2:
-; X64-NEXT: insertps $0
-; X64-NEXT: insertps $16
-; X64-NEXT: insertps $0
-; X64-NEXT: insertps $16
-; X64-NEXT: addps
-; X64-NEXT: movaps
-; X64-NEXT: pshufd
+; X64-NEXT: addps	%xmm1, %xmm0
 ; X64-NEXT: ret
 }
+
+
+define <2 x float> @test3(<4 x float> %A) nounwind {
+	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+	%C = fadd <2 x float> %B, %B
+	ret <2 x float> %C
+; CHECK: test3:
+; CHECK-NEXT: 	addps	%xmm0, %xmm0
+; CHECK-NEXT: 	ret
+}
+
+define <2 x float> @test4(<2 x float> %A) nounwind {
+	%C = fadd <2 x float> %A, %A
+	ret <2 x float> %C
+; CHECK: test4:
+; CHECK-NEXT: 	addps	%xmm0, %xmm0
+; CHECK-NEXT: 	ret
+}
+
+define <4 x float> @test5(<4 x float> %A) nounwind {
+	%B = shufflevector <4 x float> %A, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+	%C = fadd <2 x float> %B, %B
+        br label %BB
+        
+BB:
+        %D = fadd <2 x float> %C, %C
+	%E = shufflevector <2 x float> %D, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+	ret <4 x float> %E
+        
+; CHECK: _test5:
+; CHECK-NEXT: 	addps	%xmm0, %xmm0
+; CHECK-NEXT: 	addps	%xmm0, %xmm0
+; CHECK-NEXT: 	ret
+}
+
+
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
index 25dde57c767e..463f522a11df 100644
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -3,7 +3,8 @@
 ; widening shuffle v3float and then a add
 define void @shuf(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
 entry:
-; CHECK: insertps
+; CHECK: shuf:
+; CHECK: extractps
 ; CHECK: extractps
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 1, i32 2>
 	%val = fadd <3 x float> %x, %src2
@@ -15,7 +16,8 @@ entry:
 ; widening shuffle v3float with a different mask and then a add
 define void @shuf2(<3 x float>* %dst.addr, <3 x float> %src1,<3 x float> %src2) nounwind {
 entry:
-; CHECK: insertps
+; CHECK: shuf2:
+; CHECK: extractps
 ; CHECK: extractps
 	%x = shufflevector <3 x float> %src1, <3 x float> %src2, <3 x i32> < i32 0, i32 4, i32 2>
 	%val = fadd <3 x float> %x, %src2
@@ -26,7 +28,7 @@ entry:
 ; Example of when widening a v3float operation causes the DAG to replace a node
 ; with the operation that we are currently widening, i.e. when replacing
 ; opA with opB, the DAG will produce new operations with opA.
-define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) {
+define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK: pshufd
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>