2016-08-04 02:17:35 +08:00
|
|
|
; RUN: llc -verify-machineinstrs < %s | FileCheck %s
|
[PowerPC] Load two floats directly instead of using one 64-bit integer load
When dealing with complex<float>, and similar structures with two
single-precision floating-point numbers, especially when such things are being
passed around by value, we'll sometimes end up loading both float values by
extracting them from one 64-bit integer load. It looks like this:
t13: i64,ch = load<LD8[%ref.tmp]> t0, t6, undef:i64
t16: i64 = srl t13, Constant:i32<32>
t17: i32 = truncate t16
t18: f32 = bitcast t17
t19: i32 = truncate t13
t20: f32 = bitcast t19
The problem, especially before the P8 where those bitcasts aren't legal (and
get expanded via the stack), is that it would have been better to use two
floating-point loads directly. Here we add a target-specific DAGCombine to do
just that. In short, we turn:
ld 3, 0(5)
stw 3, -8(1)
rldicl 3, 3, 32, 32
stw 3, -4(1)
lfs 3, -4(1)
lfs 0, -8(1)
into:
lfs 3, 4(5)
lfs 0, 0(5)
llvm-svn: 264988
2016-03-31 10:56:05 +08:00
|
|
|
; Big-endian ("E"), ELF name mangling ("m:e"), i64 aligned to 64 bits,
; native integer widths of 32 and 64 bits ("n32:64").
target datalayout = "E-m:e-i64:64-n32:64"
|
|
|
|
; 64-bit PowerPC; the vendor and OS fields of the triple are "bgq" and "linux".
target triple = "powerpc64-bgq-linux"
|
|
|
|
|
|
|
|
; Two floats are read through a single 64-bit integer load and then split
; with shift/truncate/bitcast. The expectations at the end of the function
; require two direct float loads (lfs) from the incoming pointer and forbid
; the integer load, the stack stores and the rotate.
define void @_Z4testSt7complexIfE(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
  ; One 8-byte load that carries both 32-bit values.
  %v2 = load i64, i64* %ref.tmp, align 8

  ; High 32 bits reinterpreted as a float.
  %v3 = lshr i64 %v2, 32
  %v4 = trunc i64 %v3 to i32
  %v5 = bitcast i32 %v4 to float

  ; Low 32 bits reinterpreted as a float.
  %v6 = trunc i64 %v2 to i32
  %v7 = bitcast i32 %v6 to float

  ; Use both halves as floats so neither extraction is dead.
  %mul_ad.i.i = fmul fast float %v5, %v1
  %mul_bc.i.i = fmul fast float %v7, %v0
  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
  %mul_ac.i.i = fmul fast float %v5, %v0
  %mul_bd.i.i = fmul fast float %v7, %v1
  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
  ret void

; CHECK-LABEL: @_Z4testSt7complexIfE
; CHECK-NOT: ld {{[0-9]+}}, 0(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfs {{[0-9]+}}, 4(5)
; CHECK-DAG: lfs {{[0-9]+}}, 0(5)
; CHECK: blr
}
|
|
|
|
|
|
|
|
; Variant of the test above in which the 64-bit load goes through a pointer
; advanced by one i64 element, and that advanced pointer is also the return
; value. The expectations at the end require an update-form float load (lfsu)
; plus a plain lfs, and forbid the integer load, the stack stores and the
; rotate.
define i64* @_Z4testSt7complexIfE_idx(float %v0, float %v1, i64* %ref.tmp, float* %_M_value.realp.i.i, float* %_M_value.imagp.i.i) {
entry:
  ; Address of the next i64 element (byte offset 8); also returned below.
  %r = getelementptr i64, i64* %ref.tmp, i64 1

  ; One 8-byte load that carries both 32-bit values.
  %v2 = load i64, i64* %r, align 8

  ; High 32 bits reinterpreted as a float.
  %v3 = lshr i64 %v2, 32
  %v4 = trunc i64 %v3 to i32
  %v5 = bitcast i32 %v4 to float

  ; Low 32 bits reinterpreted as a float.
  %v6 = trunc i64 %v2 to i32
  %v7 = bitcast i32 %v6 to float

  ; Use both halves as floats so neither extraction is dead.
  %mul_ad.i.i = fmul fast float %v5, %v1
  %mul_bc.i.i = fmul fast float %v7, %v0
  %mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
  %mul_ac.i.i = fmul fast float %v5, %v0
  %mul_bd.i.i = fmul fast float %v7, %v1
  %mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
  store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
  store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
  ret i64* %r

; The label names this function specifically so that it cannot be confused
; with the shorter name of the previous function.
; NOTE(review): the negative patterns below still name base register 5 while
; the positive ones name register 3; confirm they still guard against the
; integer load.
; CHECK-LABEL: @_Z4testSt7complexIfE_idx
; CHECK-NOT: ld {{[0-9]+}}, 8(5)
; CHECK-NOT: ldu {{[0-9]+}}, 8(5)
; CHECK-NOT: stw
; CHECK-NOT: rldicl
; CHECK-DAG: lfsu {{[0-9]+}}, 8(3)
; CHECK-DAG: lfs {{[0-9]+}}, 4(3)
; CHECK: blr
}
|
|
|
|
|