From 3c9b5f394b1cb4008e782c807c094b872a760c29 Mon Sep 17 00:00:00 2001
From: Dan Gohman
Date: Thu, 2 Sep 2010 21:18:42 +0000
Subject: [PATCH] Don't narrow the load and store in a load+twiddle+store
 sequence unless there are clearly no stores between the load and the store.
 This fixes the miscompile reported as PR7833.

This breaks the test/CodeGen/X86/narrow_op-2.ll optimization, which is
safe, but awkward to prove safe. Move it to X86's README.txt.

llvm-svn: 112861
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  3 +-
 llvm/lib/Target/X86/README.txt                | 45 +++++++++++++++++++
 llvm/test/CodeGen/X86/narrow_op-2.ll          | 25 -----------
 llvm/test/CodeGen/X86/store-narrow.ll         | 31 ++++++++++++-
 4 files changed, 76 insertions(+), 28 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/narrow_op-2.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7ffbf8d791af..c9c4d91e9736 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5798,7 +5798,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     return SDValue();
 
   SDValue N0 = Value.getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      Chain == SDValue(N0.getNode(), 1)) {
     LoadSDNode *LD = cast<LoadSDNode>(N0);
     if (LD->getBasePtr() != Ptr)
       return SDValue();
diff --git a/llvm/lib/Target/X86/README.txt b/llvm/lib/Target/X86/README.txt
index 69c6d33ae5e2..a305ae6ec550 100644
--- a/llvm/lib/Target/X86/README.txt
+++ b/llvm/lib/Target/X86/README.txt
@@ -1915,3 +1915,48 @@ And the following x86 code:
 It should be possible to eliminate the sign extensions.
 
 //===---------------------------------------------------------------------===//
+
+LLVM misses a load+store narrowing opportunity in this code:
+
+%struct.bf = type { i64, i16, i16, i32 }
+
+@bfi = external global %struct.bf*                ; <%struct.bf**> [#uses=2]
+
+define void @t1() nounwind ssp {
+entry:
+  %0 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
+  %2 = bitcast i16* %1 to i32*                    ; <i32*> [#uses=2]
+  %3 = load i32* %2, align 1                      ; <i32> [#uses=1]
+  %4 = and i32 %3, -65537                         ; <i32> [#uses=1]
+  store i32 %4, i32* %2, align 1
+  %5 = load %struct.bf** @bfi, align 8            ; <%struct.bf*> [#uses=1]
+  %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
+  %7 = bitcast i16* %6 to i32*                    ; <i32*> [#uses=2]
+  %8 = load i32* %7, align 1                      ; <i32> [#uses=1]
+  %9 = and i32 %8, -131073                        ; <i32> [#uses=1]
+  store i32 %9, i32* %7, align 1
+  ret void
+}
+
+LLVM currently emits this:
+
+	movq	bfi(%rip), %rax
+	andl	$-65537, 8(%rax)
+	movq	bfi(%rip), %rax
+	andl	$-131073, 8(%rax)
+	ret
+
+It could narrow the loads and stores to emit this:
+
+	movq	bfi(%rip), %rax
+	andb	$-2, 10(%rax)
+	movq	bfi(%rip), %rax
+	andb	$-3, 10(%rax)
+	ret
+
+The trouble is that there is a TokenFactor between the store and the
+load, making it non-trivial to determine if there's anything between
+the load and the store which would prohibit narrowing.
+
+//===---------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/narrow_op-2.ll b/llvm/test/CodeGen/X86/narrow_op-2.ll
deleted file mode 100644
index 796ef7a29e49..000000000000
--- a/llvm/test/CodeGen/X86/narrow_op-2.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
-
-	%struct.bf = type { i64, i16, i16, i32 }
-@bfi = external global %struct.bf*
-
-define void @t1() nounwind ssp {
-entry:
-
-; CHECK: andb	$-2, 10(
-; CHECK: andb	$-3, 10(
-
-	%0 = load %struct.bf** @bfi, align 8
-	%1 = getelementptr %struct.bf* %0, i64 0, i32 1
-	%2 = bitcast i16* %1 to i32*
-	%3 = load i32* %2, align 1
-	%4 = and i32 %3, -65537
-	store i32 %4, i32* %2, align 1
-	%5 = load %struct.bf** @bfi, align 8
-	%6 = getelementptr %struct.bf* %5, i64 0, i32 1
-	%7 = bitcast i16* %6 to i32*
-	%8 = load i32* %7, align 1
-	%9 = and i32 %8, -131073
-	store i32 %9, i32* %7, align 1
-	ret void
-}
diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll
index 5682e7caf8bd..abc5174c98de 100644
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@@ -1,6 +1,6 @@
 ; rdar://7860110
-; RUN: llc < %s | FileCheck %s -check-prefix=X64
-; RUN: llc -march=x86 < %s | FileCheck %s -check-prefix=X32
+; RUN: llc -asm-verbose=false < %s | FileCheck %s -check-prefix=X64
+; RUN: llc -march=x86 -asm-verbose=false < %s | FileCheck %s -check-prefix=X32
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.2"
 
@@ -125,3 +125,30 @@ entry:
 ; X32: movb %cl, 5(%{{.*}})
 }
 
+; PR7833
+
+@g_16 = internal global i32 -1
+
+; X64: test8:
+; X64-NEXT: movl _g_16(%rip), %eax
+; X64-NEXT: movl $0, _g_16(%rip)
+; X64-NEXT: orl $1, %eax
+; X64-NEXT: movl %eax, _g_16(%rip)
+; X64-NEXT: ret
+define void @test8() nounwind {
+  %tmp = load i32* @g_16
+  store i32 0, i32* @g_16
+  %or = or i32 %tmp, 1
+  store i32 %or, i32* @g_16
+  ret void
+}
+
+; X64: test9:
+; X64-NEXT: orb $1, _g_16(%rip)
+; X64-NEXT: ret
+define void @test9() nounwind {
+  %tmp = load i32* @g_16
+  %or = or i32 %tmp, 1
+  store i32 %or, i32* @g_16
+  ret void
+}
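
For reference, a minimal C analogue of the PR7833 pattern that @test8 above
pins down. The names g_16 and test8 mirror the test; the sketch itself is
illustrative, not taken from the original bug report.

/* C analogue of @test8: a load, an intervening store, then a
   read-modify-write store of the originally loaded value. */
static int g_16 = -1;

void test8(void) {
  int tmp = g_16;  /* loads -1                                     */
  g_16 = 0;        /* intervening store between the load above...  */
  g_16 = tmp | 1;  /* ...and this store; correct final value is -1 */
}

Before this fix, ReduceLoadOpStoreWidth could rewrite the final store of
test8 into the memory-operand form "orb $1, _g_16(%rip)", folding the load
into the store. That form re-reads memory after the store of 0 and leaves
g_16 equal to 1 rather than -1. The new Chain == SDValue(N0.getNode(), 1)
check permits the narrowing only when the store is chained directly to the
load, as in @test9, where nothing can intervene.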
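
The andl-to-andb narrowing wished for in the README.txt entry is justified
by the shape of the two masks. A standalone C sketch (hypothetical, not part
of the patch) that verifies the arithmetic:

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* The i32 access sits at offset 8; -65537 == 0xFFFEFFFF clears only
     bit 16 and -131073 == 0xFFFDFFFF clears only bit 17.  On
     little-endian x86, bits 16-23 of that i32 live in the byte at
     offset 8 + 2 == 10, so a one-byte andb at 10(%rax) suffices. */
  uint32_t m1 = (uint32_t)-65537;       /* 0xFFFEFFFF */
  uint32_t m2 = (uint32_t)-131073;      /* 0xFFFDFFFF */
  assert(((m1 >> 16) & 0xFF) == 0xFE);  /* andb $-2, 10(%rax) */
  assert(((m2 >> 16) & 0xFF) == 0xFD);  /* andb $-3, 10(%rax) */
  /* Every other byte of both masks is 0xFF, i.e. a no-op under AND. */
  assert((m1 | 0x00FF0000u) == 0xFFFFFFFFu);
  assert((m2 | 0x00FF0000u) == 0xFFFFFFFFu);
  return 0;
}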