llvm-project/llvm/test/CodeGen/ARM/fold-stack-adjust.ll

222 lines
6.2 KiB
LLVM
Raw Normal View History

; RUN: llc -mtriple=thumbv7-apple-none-macho < %s | FileCheck %s
; RUN: llc -mtriple=thumbv6m-apple-none-macho -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
; RUN: llc -mtriple=thumbv7--linux-gnueabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-LINUX
declare void @bar(i8*)
%bigVec = type [2 x double]
@var = global %bigVec zeroinitializer
define void @check_simple() minsize {
; CHECK-LABEL: check_simple:
; CHECK: push {r3, r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp, sp,
; ...
; CHECK-NOT: add sp, sp,
; CHECK: pop {r0, r1, r2, r3, r7, pc}
; CHECK-T1-LABEL: check_simple:
; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
; CHECK-T1: add r7, sp, #16
; CHECK-T1-NOT: sub sp, sp,
; ...
; CHECK-T1-NOT: add sp, sp,
; CHECK-T1: pop {r0, r1, r2, r3, r7, pc}
; iOS always has a frame pointer and messing with the push affects
; how it's set in the prologue. Make sure we get that right.
; CHECK-IOS-LABEL: check_simple:
; CHECK-IOS: push {r3, r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp,
; CHECK-IOS: add r7, sp, #16
; CHECK-NOT: sub sp,
; ...
; CHECK-NOT: add sp,
; CHEC: pop {r3, r4, r5, r6, r7, pc}
%var = alloca i8, i32 16
call void @bar(i8* %var)
ret void
}
define void @check_simple_too_big() minsize {
; CHECK-LABEL: check_simple_too_big:
; CHECK: push {r7, lr}
; CHECK: sub sp,
; ...
; CHECK: add sp,
; CHECK: pop {r7, pc}
%var = alloca i8, i32 64
call void @bar(i8* %var)
ret void
}
define void @check_vfp_fold() minsize {
; CHECK-LABEL: check_vfp_fold:
; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
; CHECK: vpush {d6, d7, d8, d9}
; CHECK-NOT: sub sp,
; ...
; CHECK: vldmia r[[GLOBREG]], {d8, d9}
; ...
; CHECK-NOT: add sp,
; CHECK: vpop {d6, d7, d8, d9}
; CHECKL pop {r[[GLOBREG]], pc}
; iOS uses aligned NEON stores here, which is convenient since we
; want to make sure that works too.
; CHECK-IOS-LABEL: check_vfp_fold:
; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
; CHECK-IOS: sub.w r4, sp, #16
; CHECK-IOS: bfc r4, #0, #4
; CHECK-IOS: mov sp, r4
; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
; ...
; CHECK-IOS: add r4, sp, #16
; CHECK-IOS: vld1.64 {d8, d9}, [r4:128]
; CHECK-IOS: mov sp, r4
; CHECK-IOS: pop {r4, r7, pc}
%var = alloca i8, i32 16
%tmp = load %bigVec, %bigVec* @var
call void @bar(i8* %var)
store %bigVec %tmp, %bigVec* @var
ret void
}
; This function should use just enough space that the "add sp, sp, ..." could be
; folded in except that doing so would clobber the value being returned.
define i64 @check_no_return_clobber() minsize {
; CHECK-LABEL: check_no_return_clobber:
; CHECK: push {r1, r2, r3, r4, r5, r6, r7, lr}
; CHECK-NOT: sub sp,
; ...
; CHECK: add sp, #24
; CHECK: pop {r7, pc}
; Just to keep iOS FileCheck within previous function:
; CHECK-IOS-LABEL: check_no_return_clobber:
%var = alloca i8, i32 20
call void @bar(i8* %var)
ret i64 0
}
define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize {
; CHECK-LABEL: check_vfp_no_return_clobber:
; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9}
; CHECK-NOT: sub sp,
; ...
; CHECK: add sp, #64
; CHECK: vpop {d8, d9}
; CHECK: pop {r[[GLOBREG]], pc}
%var = alloca i8, i32 64
%tmp = load %bigVec, %bigVec* @var
call void @bar(i8* %var)
store %bigVec %tmp, %bigVec* @var
ret double 1.0
}
@dbl = global double 0.0
; PR18136: there was a bug determining where the first eligible pop in a
; basic-block was when the entire block was epilogue code.
define void @test_fold_point(i1 %tst) minsize {
; CHECK-LABEL: test_fold_point:
; Important to check for beginning of basic block, because if it gets
; if-converted the test is probably no longer checking what it should.
; CHECK: {{LBB[0-9]+_2}}:
; CHECK-NEXT: vpop {d7, d8}
; CHECK-NEXT: pop {r4, pc}
; With a guaranteed frame-pointer, we want to make sure that its offset in the
; push block is correct, even if a few registers have been tacked onto a later
; vpush (PR18160).
; CHECK-IOS-LABEL: test_fold_point:
; CHECK-IOS: push {r4, r7, lr}
; CHECK-IOS-NEXT: add r7, sp, #4
; CHECK-IOS-NEXT: vpush {d7, d8}
; We want some memory so there's a stack adjustment to fold...
%var = alloca i8, i32 8
; We want a long-lived floating register so that a callee-saved dN is used and
; there's both a vpop and a pop.
%live_val = load double, double* @dbl
br i1 %tst, label %true, label %end
true:
call void @bar(i8* %var)
store double %live_val, double* @dbl
br label %end
end:
; We want the epilogue to be the only thing in a basic block so that we hit
; the correct edge-case (first inst in block is correct one to adjust).
ret void
}
define void @test_varsize(...) minsize {
; CHECK-T1-LABEL: test_varsize:
; CHECK-T1: sub sp, #16
[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate insertions. The old behavior could cause arbitrarily bad memory usage in the DAG combiner if there was heavy traffic of adding nodes already on the worklist to it. This commit switches the DAG combine worklist to work the same way as the instcombine worklist where we null-out removed entries and only add new entries to the worklist. My measurements of codegen time shows slight improvement. The memory utilization is unsurprisingly dominated by other factors (the IR and DAG itself I suspect). This change results in subtle, frustrating churn in the particular order in which DAG combines are applied which causes a number of minor regressions where we fail to match a pattern previously matched by accident. AFAICT, all of these should be using AddToWorklist to directly or should be written in a less brittle way. None of the changes seem drastically bad, and a few of the changes seem distinctly better. A major change required to make this work is to significantly harden the way in which the DAG combiner handle nodes which become dead (zero-uses). Previously, we relied on the ability to "priority-bump" them on the combine worklist to achieve recursive deletion of these nodes and ensure that the frontier of remaining live nodes all were added to the worklist. Instead, I've introduced a routine to just implement that precise logic with no indirection. It is a significantly simpler operation than that of the combiner worklist proper. I suspect this will also fix some other problems with the combiner. I think the x86 changes are really minor and uninteresting, but the avx512 change at least is hiding a "regression" (despite the test case being just noise, not testing some performance invariant) that might be looked into. Not sure if any of the others impact specific "important" code paths, but they didn't look terribly interesting to me, or the changes were really minor. The consensus in review is to fix any regressions that show up after the fact here. Thanks to the other reviewers for checking the output on other architectures. There is a specific regression on ARM that Tim already has a fix prepped to commit. Differential Revision: http://reviews.llvm.org/D4616 llvm-svn: 213727
2014-07-23 15:08:53 +08:00
; CHECK-T1: push {r5, r6, r7, lr}
; ...
[SDAG] Make the DAGCombine worklist not grow endlessly due to duplicate insertions. The old behavior could cause arbitrarily bad memory usage in the DAG combiner if there was heavy traffic of adding nodes already on the worklist to it. This commit switches the DAG combine worklist to work the same way as the instcombine worklist where we null-out removed entries and only add new entries to the worklist. My measurements of codegen time shows slight improvement. The memory utilization is unsurprisingly dominated by other factors (the IR and DAG itself I suspect). This change results in subtle, frustrating churn in the particular order in which DAG combines are applied which causes a number of minor regressions where we fail to match a pattern previously matched by accident. AFAICT, all of these should be using AddToWorklist to directly or should be written in a less brittle way. None of the changes seem drastically bad, and a few of the changes seem distinctly better. A major change required to make this work is to significantly harden the way in which the DAG combiner handle nodes which become dead (zero-uses). Previously, we relied on the ability to "priority-bump" them on the combine worklist to achieve recursive deletion of these nodes and ensure that the frontier of remaining live nodes all were added to the worklist. Instead, I've introduced a routine to just implement that precise logic with no indirection. It is a significantly simpler operation than that of the combiner worklist proper. I suspect this will also fix some other problems with the combiner. I think the x86 changes are really minor and uninteresting, but the avx512 change at least is hiding a "regression" (despite the test case being just noise, not testing some performance invariant) that might be looked into. Not sure if any of the others impact specific "important" code paths, but they didn't look terribly interesting to me, or the changes were really minor. The consensus in review is to fix any regressions that show up after the fact here. Thanks to the other reviewers for checking the output on other architectures. There is a specific regression on ARM that Tim already has a fix prepped to commit. Differential Revision: http://reviews.llvm.org/D4616 llvm-svn: 213727
2014-07-23 15:08:53 +08:00
; CHECK-T1: pop {r2, r3, r7}
; CHECK-T1: pop {[[POP_REG:r[0-3]]]}
; CHECK-T1: add sp, #16
; CHECK-T1: bx [[POP_REG]]
; CHECK-LABEL: test_varsize:
; CHECK: sub sp, #16
; CHECK: push {r5, r6, r7, lr}
; ...
; CHECK: pop.w {r2, r3, r7, lr}
; CHECK: add sp, #16
; CHECK: bx lr
%var = alloca i8, i32 8
call void @llvm.va_start(i8* %var)
call void @bar(i8* %var)
ret void
}
%"MyClass" = type { i8*, i32, i32, float, float, float, [2 x i8], i32, i32* }
declare float @foo()
declare void @bar3()
declare %"MyClass"* @bar2(%"MyClass"* returned, i16*, i32, float, float, i32, i32, i1 zeroext, i1 zeroext, i32)
define fastcc float @check_vfp_no_return_clobber2(i16* %r, i16* %chars, i32 %length, i1 zeroext %flag) minsize {
entry:
; CHECK-LINUX-LABEL: check_vfp_no_return_clobber2
; CHECK-LINUX: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8}
; CHECK-NOT: sub sp,
; ...
; CHECK-LINUX: add sp
; CHECK-LINUX: vpop {d8}
%run = alloca %"MyClass", align 4
%call = call %"MyClass"* @bar2(%"MyClass"* %run, i16* %chars, i32 %length, float 0.000000e+00, float 0.000000e+00, i32 1, i32 1, i1 zeroext false, i1 zeroext true, i32 3)
%call1 = call float @foo()
%cmp = icmp eq %"MyClass"* %run, null
br i1 %cmp, label %exit, label %if.then
if.then: ; preds = %entry
call void @bar3()
br label %exit
exit: ; preds = %if.then, %entry
ret float %call1
}
declare void @llvm.va_start(i8*) nounwind