2016-08-04 02:17:35 +08:00
|
|
|
; RUN: llc -verify-machineinstrs -mcpu=a2 < %s | FileCheck %s -check-prefix=INVFUNCDESC
|
|
|
|
; RUN: llc -verify-machineinstrs -mcpu=a2 -mattr=-invariant-function-descriptors < %s | FileCheck %s -check-prefix=NONINVFUNCDESC
|
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls
Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the
POWER7, A2 and earlier cores) are really pointers to a function descriptor, a
structure with three pointers: the actual pointer to the code to which to jump,
the pointer to the TOC needed by the callee, and an environment pointer. We
used to chain these loads, and make them opaque to the rest of the optimizer,
so that they'd always occur directly before the call. This is not necessary,
and in fact, highly suboptimal on embedded cores. Once the function pointer is
known, the loads can be performed ahead of time; in fact, they can be hoisted
out of loops.
Now these function descriptors are almost always generated by the linker, and
thus the contents of the descriptors are invariant. As a result, by default,
we'll mark the associated loads as invariant (allowing them to be hoisted out
of loops). I've added a target feature to turn this off, however, just in case
someone needs that option (constructing an on-stack descriptor, casting it to a
function pointer, and then calling it cannot be well-defined C/C++ code, but I
can imagine some JIT-compilation system doing so).
Consider this simple test:
$ cat call.c
typedef void (*fp)();
void bar(fp x) {
for (int i = 0; i < 1600000000; ++i)
x();
}
$ cat main.c
typedef void (*fp)();
void bar(fp x);
void foo() {}
int main() {
bar(foo);
}
On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads
as invariant brings the execution time down to ~8 seconds from ~32 seconds with
the loads in the loop.
The difference on the POWER7 is smaller. Compiling with:
gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2]
clang -O3 -mcpu=native call.c main.c : ~5.3 seconds
clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds
(looks like we'd benefit from additional loop unrolling here, as a first
guess, because this is faster with the extra loads)
The -mno-invariant-function-descriptors will be added to Clang shortly.
llvm-svn: 226207
2015-01-16 05:17:34 +08:00
|
|
|
target datalayout = "E-m:e-i64:64-n32:64"
|
|
|
|
target triple = "powerpc64-unknown-linux-gnu"
|
|
|
|
|
|
|
|
; Function Attrs: nounwind
|
|
|
|
define void @bar(void (...)* nocapture %x) #0 {
|
|
|
|
entry:
|
|
|
|
%callee.knr.cast = bitcast void (...)* %x to void ()*
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
; INVFUNCDESC-LABEL: @bar
|
|
|
|
; INVFUNCDESC-DAG: ld [[REG1:[0-9]+]], 8(3)
|
|
|
|
; INVFUNCDESC-DAG: ld [[REG2:[0-9]+]], 16(3)
|
|
|
|
; INVFUNCDESC-DAG: ld [[REG3:[0-9]+]], 0(3)
|
|
|
|
|
|
|
|
; INVFUNCDESC: %for.body
|
|
|
|
; INVFUNCDESC-DAG: mtctr [[REG3]]
|
|
|
|
; INVFUNCDESC-DAG: mr 11, [[REG2]]
|
2015-01-19 15:20:27 +08:00
|
|
|
; INVFUNCDESC-DAG: mr 2, [[REG1]]
|
[PowerPC] Loosen ELFv1 PPC64 func descriptor loads for indirect calls
Function pointers under PPC64 ELFv1 (which is used on PPC64/Linux on the
POWER7, A2 and earlier cores) are really pointers to a function descriptor, a
structure with three pointers: the actual pointer to the code to which to jump,
the pointer to the TOC needed by the callee, and an environment pointer. We
used to chain these loads, and make them opaque to the rest of the optimizer,
so that they'd always occur directly before the call. This is not necessary,
and in fact, highly suboptimal on embedded cores. Once the function pointer is
known, the loads can be performed ahead of time; in fact, they can be hoisted
out of loops.
Now these function descriptors are almost always generated by the linker, and
thus the contents of the descriptors are invariant. As a result, by default,
we'll mark the associated loads as invariant (allowing them to be hoisted out
of loops). I've added a target feature to turn this off, however, just in case
someone needs that option (constructing an on-stack descriptor, casting it to a
function pointer, and then calling it cannot be well-defined C/C++ code, but I
can imagine some JIT-compilation system doing so).
Consider this simple test:
$ cat call.c
typedef void (*fp)();
void bar(fp x) {
for (int i = 0; i < 1600000000; ++i)
x();
}
$ cat main.c
typedef void (*fp)();
void bar(fp x);
void foo() {}
int main() {
bar(foo);
}
On the PPC A2 (the BG/Q supercomputer), marking the function-descriptor loads
as invariant brings the execution time down to ~8 seconds from ~32 seconds with
the loads in the loop.
The difference on the POWER7 is smaller. Compiling with:
gcc -std=c99 -O3 -mcpu=native call.c main.c : ~6 seconds [this is 4.8.2]
clang -O3 -mcpu=native call.c main.c : ~5.3 seconds
clang -O3 -mcpu=native call.c main.c -mno-invariant-function-descriptors : ~4 seconds
(looks like we'd benefit from additional loop unrolling here, as a first
guess, because this is faster with the extra loads)
The -mno-invariant-function-descriptors will be added to Clang shortly.
llvm-svn: 226207
2015-01-16 05:17:34 +08:00
|
|
|
; INVFUNCDESC: bctrl
|
|
|
|
; INVFUNCDESC-NEXT: ld 2, 40(1)
|
|
|
|
|
|
|
|
; NONINVFUNCDESC-LABEL: @bar
|
|
|
|
; NONINVFUNCDESC: %for.body
|
|
|
|
; NONINVFUNCDESC-DAG: ld 3, 0(30)
|
|
|
|
; NONINVFUNCDESC-DAG: ld 11, 16(30)
|
|
|
|
; NONINVFUNCDESC-DAG: ld 2, 8(30)
|
|
|
|
; NONINVFUNCDESC: mtctr 3
|
|
|
|
; NONINVFUNCDESC: bctrl
|
|
|
|
; NONINVFUNCDESC-NEXT: ld 2, 40(1)
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body, %entry
|
|
|
|
%i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
|
|
|
|
tail call void %callee.knr.cast() #0
|
|
|
|
%inc = add nuw nsw i32 %i.02, 1
|
|
|
|
%exitcond = icmp eq i32 %inc, 1600000000
|
|
|
|
br i1 %exitcond, label %for.end, label %for.body
|
|
|
|
|
|
|
|
for.end: ; preds = %for.body
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
attributes #0 = { nounwind }
|
|
|
|
|