From de4b225093f7e2747c2e7127c039d83dbc879e63 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Fri, 30 Apr 2010 22:19:17 +0000 Subject: [PATCH] EXTRACT_VECTOR_ELT of an INSERT_VECTOR_ELT may have the same index, but the indexes could be of a different value type. Or not even using the same SDNode for the constant (weird, I know). Compare the actual values instead of the pointers. llvm-svn: 102791 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 21 +++-- .../CodeGen/X86/2010-04-30-VectorUnrollBug.ll | 85 +++++++++++++++++++ 2 files changed, 98 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/X86/2010-04-30-VectorUnrollBug.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index c479f651ade7..e6df742bc336 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2792,14 +2792,19 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, // If the indices are the same, return the inserted element else // if the indices are known different, extract the element from // the original vector. 
- if (N1.getOperand(2) == N2) { - if (VT == N1.getOperand(1).getValueType()) - return N1.getOperand(1); - else - return getSExtOrTrunc(N1.getOperand(1), DL, VT); - } else if (isa<ConstantSDNode>(N1.getOperand(2)) && - isa<ConstantSDNode>(N2)) + SDValue N1Op2 = N1.getOperand(2); + ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2.getNode()); + + if (N1Op2C && N2C) { + if (N1Op2C->getZExtValue() == N2C->getZExtValue()) { + if (VT == N1.getOperand(1).getValueType()) + return N1.getOperand(1); + else + return getSExtOrTrunc(N1.getOperand(1), DL, VT); + } + return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2); + } } break; case ISD::EXTRACT_ELEMENT: @@ -6089,7 +6094,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) { unsigned i; for (i= 0; i != NE; ++i) { - for (unsigned j = 0; j != N->getNumOperands(); ++j) { + for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) { SDValue Operand = N->getOperand(j); EVT OperandVT = Operand.getValueType(); if (OperandVT.isVector()) { diff --git a/llvm/test/CodeGen/X86/2010-04-30-VectorUnrollBug.ll b/llvm/test/CodeGen/X86/2010-04-30-VectorUnrollBug.ll new file mode 100644 index 000000000000..d072dc37bccc --- /dev/null +++ b/llvm/test/CodeGen/X86/2010-04-30-VectorUnrollBug.ll @@ -0,0 +1,85 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-mmx | FileCheck %s +; + +define void @test1(i8** %a, i64* %b, i64 %c, i64 %d) nounwind { +entry: + %ptrtoarg37 = load i8** %a ; <i8*> [#uses=1] + %arglist1 = getelementptr i8** %a, i64 1 ; <i8**> [#uses=1] + %ptrtoarg238 = load i8** %arglist1 ; <i8*> [#uses=1] + %arglist4 = getelementptr i8** %a, i64 2 ; <i8**> [#uses=1] + %ptrtoarg539 = load i8** %arglist4 ; <i8*> [#uses=1] + %0 = load i64* %b ; <i64> [#uses=2] + br label %loop.cond + +loop.cond: ; preds = %loop, %entry + %iv = phi i64 [ %0, %entry ], [ %11, %loop ] ; <i64> [#uses=3] + %1 = icmp eq i64 %c, %iv ; <i1> [#uses=1] + br i1 %1, label %return, label %loop + +loop: ; preds = %loop.cond + %2 = bitcast i8* %ptrtoarg539 to i32 addrspace(1)* ; <i32 addrspace(1)*> [#uses=1] + %3 = bitcast i8* %ptrtoarg238 to 
i32 addrspace(1)* ; <i32 addrspace(1)*> [#uses=1] + %4 = bitcast i8* %ptrtoarg37 to i32 addrspace(1)* ; <i32 addrspace(1)*> [#uses=1] + %tmp1.i = load i64* addrspace(256)* inttoptr (i64 248 to i64* addrspace(256)*) ; <i64*> [#uses=1] + %tmp2.i = load i64* %tmp1.i ; <i64> [#uses=1] + %conv.i = trunc i64 %tmp2.i to i32 ; <i32> [#uses=1] + %conv1.i = sext i32 %conv.i to i64 ; <i64> [#uses=1] + %5 = bitcast i32 addrspace(1)* %4 to i32* ; <i32*> [#uses=2] + %i_times_3 = mul i64 %conv1.i, 3 ; <i64> [#uses=4] + %inptrA.i = getelementptr inbounds i32* %5, i64 %i_times_3 ; <i32*> [#uses=1] + %6 = bitcast i32* %inptrA.i to i64* ; <i64*> [#uses=1] + %i_times_3_plus_2 = add i64 %i_times_3, 2 ; <i64> [#uses=3] + %A.xy = load i64* %6 ; <i64> [#uses=1] + %inptrA.ip2 = getelementptr inbounds i32* %5, i64 %i_times_3_plus_2 ; <i32*> [#uses=1] + %A.xyuu = insertelement <2 x i64> undef, i64 %A.xy, i32 0 ; <<2 x i64>> [#uses=1] + %A.xy__ = insertelement <2 x i64> %A.xyuu, i64 0, i32 1 ; <<2 x i64>> [#uses=1] + %tmp13.i.i20 = bitcast <2 x i64> %A.xy__ to <4 x i32> ; <<4 x i32>> [#uses=1] + %A.z = load i32* %inptrA.ip2 ; <i32> [#uses=1] + +; The "movl" is the load of %A.z. The registers aren't important. 
Here is what they map to: +; +; %rbx -> %i_times_3 +; %r9 -> %5 +; %r9 + (%rbx + 2) * 4 -> %inptrA.ip2 +; 8(%r9,%rbx,4) -> *%inptrA.ip2 -> %A.z +; +; CHECK: sarl %cl, %r15d +; CHECK-NEXT: pinsrd $0, %r15d, %xmm3 +; CHECK-NEXT: pinsrd $1, %r14d, %xmm3 +; CHECK-NEXT: movl 8(%r9,%rbx,4), %r14d +; CHECK-NEXT: pextrd $2, %xmm2, %ecx + + %A.xyz_ = insertelement <4 x i32> %tmp13.i.i20, i32 %A.z, i32 2 ; <<4 x i32>> [#uses=1] + %A.xyz = shufflevector <4 x i32> %A.xyz_, <4 x i32> undef, <3 x i32> ; <<3 x i32>> [#uses=1] + %7 = bitcast i32 addrspace(1)* %3 to i32* ; <i32*> [#uses=2] + %add.ptr.i.i8 = getelementptr inbounds i32* %7, i64 %i_times_3 ; <i32*> [#uses=1] + %8 = bitcast i32* %add.ptr.i.i8 to i64* ; <i64*> [#uses=1] + %B.xy = load i64* %8 ; <i64> [#uses=1] + %arrayidx15.i.i = getelementptr inbounds i32* %7, i64 %i_times_3_plus_2 ; <i32*> [#uses=1] + %B.xyuu = insertelement <2 x i64> undef, i64 %B.xy, i32 0 ; <<2 x i64>> [#uses=1] + %B.z = load i32* %arrayidx15.i.i ; <i32> [#uses=1] + %vecinit2.i.i.i = insertelement <2 x i64> %B.xyuu, i64 0, i32 1 ; <<2 x i64>> [#uses=1] + %B.xy__ = bitcast <2 x i64> %vecinit2.i.i.i to <4 x i32> ; <<4 x i32>> [#uses=1] + %B.xyz_ = insertelement <4 x i32> %B.xy__, i32 %B.z, i32 2 ; <<4 x i32>> [#uses=1] + %B.xyz = shufflevector <4 x i32> %B.xyz_, <4 x i32> undef, <3 x i32> ; <<3 x i32>> [#uses=1] + %and.i = and <3 x i32> %B.xyz, ; <<3 x i32>> [#uses=1] + %shr.i = ashr <3 x i32> %A.xyz, %and.i ; <<3 x i32>> [#uses=2] + %9 = bitcast i32 addrspace(1)* %2 to i32* ; <i32*> [#uses=2] + %tmp3.i.i = shufflevector <3 x i32> %shr.i, <3 x i32> undef, <4 x i32> ; <<4 x i32>> [#uses=1] + %add.ptr.i.i = getelementptr inbounds i32* %9, i64 %i_times_3 ; <i32*> [#uses=1] + %10 = bitcast i32* %add.ptr.i.i to i8* ; <i8*> [#uses=1] + call void @llvm.x86.sse2.storel.dq(i8* %10, <4 x i32> %tmp3.i.i) nounwind + %vecext.i.i.i = extractelement <3 x i32> %shr.i, i32 2 ; <i32> [#uses=1] + %arrayidx.i.i = getelementptr inbounds i32* %9, i64 %i_times_3_plus_2 ; <i32*> [#uses=1] + store i32 %vecext.i.i.i, i32* %arrayidx.i.i + %11 = 
add i64 %iv, %d ; <i64> [#uses=1] + %tmp = add i64 %d, %iv ; <i64> [#uses=1] + store i64 %tmp, i64* %b + br label %loop.cond + +return: ; preds = %loop.cond + store i64 %0, i64* %b + ret void +} + +declare void @llvm.x86.sse2.storel.dq(i8*, <4 x i32>) nounwind