diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f4329d3bf9d5..2d84860610c9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4977,6 +4977,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl &Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); diff --git a/llvm/test/CodeGen/X86/vec_shuffle-26.ll b/llvm/test/CodeGen/X86/vec_shuffle-26.ll index 086af6bb114b..93c553001f81 100644 --- a/llvm/test/CodeGen/X86/vec_shuffle-26.ll +++ b/llvm/test/CodeGen/X86/vec_shuffle-26.ll @@ -1,6 +1,4 @@ -; RUN: llc < %s -march=x86 -mattr=sse41 -o %t -; RUN: grep unpcklps %t | count 1 -; RUN: grep unpckhps %t | count 3 +; RUN: llc < %s -march=x86 -mattr=sse41 | FileCheck %s ; Transpose example using the more generic vector shuffle. Return float8 ; instead of float16 @@ -14,6 +12,11 @@ target triple = "i386-apple-cl.1.0" define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind { entry: +; CHECK: transpose2 +; CHECK: unpckhps +; CHECK: unpckhps +; CHECK: unpcklps +; CHECK: unpckhps %unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] %unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2] %unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2] @@ -27,3 +30,28 @@ entry: ; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; ret <8 x float> %r2 } + +define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind { +entry: +; movhps should happen before extractps to assure it gets the correct value. +; CHECK: lo_hi_shift +; CHECK: movhps ([[BASEREG:%[a-z]+]]), +; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) +; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]]) + %v.i = bitcast float* %y to <4 x float>* + %0 = load <4 x float>* %v.i, align 1 + %1 = bitcast float* %x to <1 x i64>* + %.val = load <1 x i64>* %1, align 1 + %2 = bitcast <1 x i64> %.val to <2 x float> + %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> + %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> + %cast.i = bitcast <4 x float> %0 to <2 x i64> + %extract.i = extractelement <2 x i64> %cast.i, i32 1 + %3 = bitcast float* %x to i64* + store i64 %extract.i, i64* %3, align 4 + %4 = bitcast <4 x float> %0 to <16 x i8> + %5 = bitcast <4 x float> %shuffle1.i to <16 x i8> + %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> + %6 = bitcast <16 x i8> %palignr to <2 x i64> + ret <2 x i64> %6 +}