llvm-project/llvm/test/CodeGen/PowerPC/ppc64-func-desc-hoist.ll

46 lines
1.4 KiB
LLVM
Raw Normal View History

; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s -check-prefix=INVFUNCDESC
; RUN: llc -verify-machineinstrs -mcpu=a2 -mattr=-invariant-function-descriptors < %s | FileCheck %s -check-prefix=NONINVFUNCDESC
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the POWER7, A2 and earlier cores) are really pointers to a function descriptor, a structure with three pointers: the actual pointer to the code to which to jump, the pointer to the TOC needed by the callee, and an environment pointer. We used to chain these loads, and make them opaque to the rest of the optimizer, so that they'd always occur directly before the call. This is not necessary, and in fact, highly suboptimal on embedded cores. Once the function pointer is known, the loads can be performed ahead of time; in fact, they can be hoisted out of loops. Now these function descriptors are almost always generated by the linker, and thus the contents of the descriptors are invariant. As a result, by default, we'll mark the associated loads as invariant (allowing them to be hoisted out of loops). I've added a target feature to turn this off, however, just in case someone needs that option (constructing an on-stack descriptor, casting it to a function pointer, and then calling it cannot be well-defined C/C++ code, but I can imagine some JIT-compilation system doing so). Consider this simple test: $ cat call.c typedef void (*fp)(); void bar(fp x) { for (int i = 0; i < 1600000000; ++i) x(); } $ cat main.c typedef void (*fp)(); void bar(fp x); void foo() {} int main() { bar(foo); } On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads as invariant brings the execution time down to ~8 seconds from ~32 seconds with the loads in the loop. The difference on the POWER7 is smaller. Compiling with: gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2] clang -O3 -mcpu=native call.c main.c : ~5.3 seconds clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds (looks like we'd benefit from additional loop unrolling here, as a first guess, because this is faster with the extra loads) The -mno-invariant-function-descriptors will be added to Clang shortly. llvm-svn: 226207
2015-01-16 05:17:34 +08:00
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind
define void @bar(void (...)* nocapture %x) #0 {
entry:
%callee.knr.cast = bitcast void (...)* %x to void ()*
br label %for.body
; INVFUNCDESC-LABEL: @bar
; INVFUNCDESC-DAG: ld [[REG1:[0-9]+]], 8(3)
; INVFUNCDESC-DAG: ld [[REG2:[0-9]+]], 16(3)
; INVFUNCDESC-DAG: ld [[REG3:[0-9]+]], 0(3)
; INVFUNCDESC: %for.body
; INVFUNCDESC-DAG: mtctr [[REG3]]
; INVFUNCDESC-DAG: mr 11, [[REG2]]
; INVFUNCDESC-DAG: mr 2, [[REG1]]
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the POWER7, A2 and earlier cores) are really pointers to a function descriptor, a structure with three pointers: the actual pointer to the code to which to jump, the pointer to the TOC needed by the callee, and an environment pointer. We used to chain these loads, and make them opaque to the rest of the optimizer, so that they'd always occur directly before the call. This is not necessary, and in fact, highly suboptimal on embedded cores. Once the function pointer is known, the loads can be performed ahead of time; in fact, they can be hoisted out of loops. Now these function descriptors are almost always generated by the linker, and thus the contents of the descriptors are invariant. As a result, by default, we'll mark the associated loads as invariant (allowing them to be hoisted out of loops). I've added a target feature to turn this off, however, just in case someone needs that option (constructing an on-stack descriptor, casting it to a function pointer, and then calling it cannot be well-defined C/C++ code, but I can imagine some JIT-compilation system doing so). Consider this simple test: $ cat call.c typedef void (*fp)(); void bar(fp x) { for (int i = 0; i < 1600000000; ++i) x(); } $ cat main.c typedef void (*fp)(); void bar(fp x); void foo() {} int main() { bar(foo); } On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads as invariant brings the execution time down to ~8 seconds from ~32 seconds with the loads in the loop. The difference on the POWER7 is smaller. Compiling with: gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2] clang -O3 -mcpu=native call.c main.c : ~5.3 seconds clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds (looks like we'd benefit from additional loop unrolling here, as a first guess, because this is faster with the extra loads) The -mno-invariant-function-descriptors will be added to Clang shortly. llvm-svn: 226207
2015-01-16 05:17:34 +08:00
; INVFUNCDESC: bctrl
; INVFUNCDESC-NEXT: ld 2, 40(1)
; NONINVFUNCDESC-LABEL: @bar
; NONINVFUNCDESC: %for.body
; NONINVFUNCDESC-DAG: ld 3, 0(30)
; NONINVFUNCDESC-DAG: ld 11, 16(30)
; NONINVFUNCDESC-DAG: ld 2, 8(30)
; NONINVFUNCDESC: mtctr 3
; NONINVFUNCDESC: bctrl
; NONINVFUNCDESC-NEXT: ld 2, 40(1)
for.body: ; preds = %for.body, %entry
%i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
tail call void %callee.knr.cast() #0
%inc = add nuw nsw i32 %i.02, 1
%exitcond = icmp eq i32 %inc, 1600000000
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
attributes #0 = { nounwind }