2016-05-12 09:58:58 +08:00
|
|
|
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck %s
|
|
|
|
|
2018-12-06 01:34:59 +08:00
|
|
|
; Data layout used by every kernel in this file. The trailing "A5" component
; makes address space 5 the alloca address space, which is why each alloca
; below carries addrspace(5); "p3:32:32" and "p5:32:32" give 32-bit pointers
; for address spaces 3 and 5.
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
|
|
|
|
; CHECK: %alloca = alloca i32
|
2018-12-06 01:34:59 +08:00
|
|
|
; CHECK: select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel whose alloca feeds a select whose other pointer operand is undef.
; The FileCheck directives preceding this definition expect both the alloca
; and the addrspace(5) select to still be present in the output.
define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
|
2018-12-06 01:34:59 +08:00
|
|
|
; Single i32 slot in address space 5.
%alloca = alloca i32, align 4, addrspace(5)
|
|
|
|
; The true-side operand is undef; only the false side derives from %alloca.
%select = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %alloca
|
|
|
|
; The selected pointer is only used as the address of this store.
store i32 0, i32 addrspace(5)* %select, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; CHECK-LABEL: @lds_promote_alloca_select_two_derived_pointers(
|
|
|
|
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_pointers.alloca, i32 0, i32 %{{[0-9]+}}
|
|
|
|
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
|
|
|
|
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
|
|
|
|
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
|
|
|
|
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel selecting between two GEPs into the same [16 x i32] alloca, indexed
; by the runtime arguments %a and %b. The FileCheck directives preceding this
; definition expect the GEPs, the select and the store to be rewritten to
; addrspace(3) on a global named after this kernel with an ".alloca" suffix.
define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
|
2018-12-06 01:34:59 +08:00
|
|
|
; The one alloca that both select operands are derived from.
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
|
|
; Element pointer at runtime index %a.
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
|
|
|
; Element pointer at runtime index %b.
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
|
|
|
|
; Both operands point into %alloca; the condition is undef.
%select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
|
|
|
|
store i32 0, i32 addrspace(5)* %select, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; FIXME: This should be promotable but requires knowing that both will be promoted first.
|
|
|
|
|
|
|
|
; CHECK-LABEL: @lds_promote_alloca_select_two_allocas(
|
|
|
|
; CHECK: %alloca0 = alloca i32, i32 16, align 4
|
|
|
|
; CHECK: %alloca1 = alloca i32, i32 16, align 4
|
2018-12-06 01:34:59 +08:00
|
|
|
; CHECK: %ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
|
|
|
|
; CHECK: %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
|
|
|
|
; CHECK: %select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel selecting between pointers derived from two *different* allocas.
; The FIXME ahead of this test says this should be promotable; the FileCheck
; directives currently expect both allocas, both GEPs and the select to stay
; in addrspace(5).
define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
|
2018-12-06 01:34:59 +08:00
|
|
|
; Two separate 16-element i32 allocas (array-size form, not an array type).
%alloca0 = alloca i32, i32 16, align 4, addrspace(5)
|
|
|
|
%alloca1 = alloca i32, i32 16, align 4, addrspace(5)
|
|
|
|
; %ptr0 derives from %alloca0, %ptr1 from %alloca1.
%ptr0 = getelementptr inbounds i32, i32 addrspace(5)* %alloca0, i32 %a
|
|
|
|
%ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %alloca1, i32 %b
|
|
|
|
; The select mixes pointers from the two distinct allocas.
%select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
|
|
|
|
store i32 0, i32 addrspace(5)* %select, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; TODO: Maybe this should be canonicalized to select on the constant and GEP after.
|
|
|
|
; CHECK-LABEL: @lds_promote_alloca_select_two_derived_constant_pointers(
|
|
|
|
; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promote_alloca_select_two_derived_constant_pointers.alloca, i32 0, i32 %{{[0-9]+}}
|
|
|
|
; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 1
|
|
|
|
; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
|
|
|
|
; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
|
|
|
|
; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
|
2017-03-22 05:39:51 +08:00
|
|
|
; Same shape as the two-derived-pointers test, but the GEP indices are the
; constants 1 and 3 instead of kernel arguments. The FileCheck directives
; preceding this definition expect the GEPs, select and store in addrspace(3).
define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
|
2018-12-06 01:34:59 +08:00
|
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
|
|
; Constant element index 1.
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
|
|
|
|
; Constant element index 3.
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 3
|
|
|
|
; Both operands are constant-index GEPs of the same alloca.
%select = select i1 undef, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
|
|
|
|
store i32 0, i32 addrspace(5)* %select, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
[ValueTracking, BasicAA] Don't simplify instructions
GetUnderlyingObject() (and by required symmetry
DecomposeGEPExpression()) will call SimplifyInstruction() on the
passed value if other checks fail. This simplification is very
expensive, but has little effect in practice. This patch removes
the SimplifyInstruction() call, and replaces it with a check for
single-argument phis (which can occur in canonical IR in LCSSA
form), which is the only useful simplification case I was able to
identify.
At O3 the geomean CTMark improvement is -1.7%. The largest
improvement is SPASS with ThinLTO at -6%.
In test-suite, I see only two tests with a hash difference and
no code size difference (PAQ8p, Ptrdist), which indicates that
the simplification only ends up being useful very rarely. (I would
have liked to figure out which simplification is responsible here,
but wasn't able to spot it looking at transformation logs.)
The AMDGPU test case that is updated was using two selects with
undef condition, in which case GetUnderlyingObject will return
the first select operand as the underlying object. This will of
course not happen with non-undef conditions, so this was not
testing anything realistic. Additionally this illustrates potential
unsoundness: While GetUnderlyingObject will pick the first operand,
the select might be later replaced by the second operand, resulting
in inconsistent assumptions about the undef value.
Differential Revision: https://reviews.llvm.org/D82261
2020-06-20 19:59:24 +08:00
|
|
|
; FIXME: Can be promoted, but we'd have to recursively show that the select
|
|
|
|
; operands all point to the same alloca.
|
|
|
|
|
2016-05-12 09:58:58 +08:00
|
|
|
; CHECK-LABEL: @lds_promoted_alloca_select_input_select(
|
[ValueTracking, BasicAA] Don't simplify instructions
GetUnderlyingObject() (and by required symmetry
DecomposeGEPExpression()) will call SimplifyInstruction() on the
passed value if other checks fail. This simplification is very
expensive, but has little effect in practice. This patch removes
the SimplifyInstruction() call, and replaces it with a check for
single-argument phis (which can occur in canonical IR in LCSSA
form), which is the only useful simplification case I was able to
identify.
At O3 the geomean CTMark improvement is -1.7%. The largest
improvement is SPASS with ThinLTO at -6%.
In test-suite, I see only two tests with a hash difference and
no code size difference (PAQ8p, Ptrdist), which indicates that
the simplification only ends up being useful very rarely. (I would
have liked to figure out which simplification is responsible here,
but wasn't able to spot it looking at transformation logs.)
The AMDGPU test case that is updated was using two selects with
undef condition, in which case GetUnderlyingObject will return
the first select operand as the underlying object. This will of
course not happen with non-undef conditions, so this was not
testing anything realistic. Additionally this illustrates potential
unsoundness: While GetUnderlyingObject will pick the first operand,
the select might be later replaced by the second operand, resulting
in inconsistent assumptions about the undef value.
Differential Revision: https://reviews.llvm.org/D82261
2020-06-20 19:59:24 +08:00
|
|
|
; CHECK: alloca
|
|
|
|
; Kernel where one select feeds another; all three pointer inputs are GEPs of
; the same alloca and the conditions are the kernel arguments %c1 and %c2
; rather than undef. The FileCheck directive preceding this definition only
; expects an alloca to remain in the output (see the FIXME ahead of it).
define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c, i1 %c1, i1 %c2) #0 {
|
2018-12-06 01:34:59 +08:00
|
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
|
|
; Three element pointers into %alloca at runtime indices %a, %b and %c.
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
|
|
|
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
|
|
|
|
%ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
|
[ValueTracking, BasicAA] Don't simplify instructions
GetUnderlyingObject() (and by required symmetry
DecomposeGEPExpression()) will call SimplifyInstruction() on the
passed value if other checks fail. This simplification is very
expensive, but has little effect in practice. This patch removes
the SimplifyInstruction call(), and replaces it with a check for
single-argument phis (which can occur in canonical IR in LCSSA
form), which is the only useful simplification case I was able to
identify.
At O3 the geomean CTMark improvement is -1.7%. The largest
improvement is SPASS with ThinLTO at -6%.
In test-suite, I see only two tests with a hash difference and
no code size difference (PAQ8p, Ptrdist), which indicates that
the simplification only ends up being useful very rarely. (I would
have liked to figure out which simplification is responsible here,
but wasn't able to spot it looking at transformation logs.)
The AMDGPU test case that is update was using two selects with
undef condition, in which case GetUnderlyingObject will return
the first select operand as the underlying object. This will of
course not happen with non-undef conditions, so this was not
testing anything realistic. Additionally this illustrates potential
unsoundness: While GetUnderlyingObject will pick the first operand,
the select might be later replaced by the second operand, resulting
in inconsistent assumptions about the undef value.
Differential Revision: https://reviews.llvm.org/D82261
2020-06-20 19:59:24 +08:00
|
|
|
; Inner select: chooses between %ptr0 and %ptr1 on %c1.
%select0 = select i1 %c1, i32 addrspace(5)* %ptr0, i32 addrspace(5)* %ptr1
|
|
|
|
; Outer select: one operand is itself a select result, the other is %ptr2.
%select1 = select i1 %c2, i32 addrspace(5)* %select0, i32 addrspace(5)* %ptr2
|
2018-12-06 01:34:59 +08:00
|
|
|
store i32 0, i32 addrspace(5)* %select1, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel where a select operand comes from a phi, and one of the phi's
; incoming values is itself a select with an undef pointer operand.
; NOTE(review): no FileCheck directives for this kernel are visible in this
; file view, so the expected output is not pinned here - confirm whether that
; is intentional.
define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
|
2016-05-12 09:58:58 +08:00
|
|
|
entry:
|
2018-12-06 01:34:59 +08:00
|
|
|
%alloca = alloca [16 x i32], align 4, addrspace(5)
|
|
|
|
%ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
|
|
|
%ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b
|
|
|
|
; Direct store through %ptr0 before the branch.
store i32 0, i32 addrspace(5)* %ptr0
|
2016-05-12 09:58:58 +08:00
|
|
|
; Undef condition: either bb1 runs first or control goes straight to bb2.
br i1 undef, label %bb1, label %bb2
|
|
|
|
|
|
|
|
bb1:
|
2018-12-06 01:34:59 +08:00
|
|
|
%ptr2 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %c
|
|
|
|
; One operand is undef, the other derives from %alloca via %ptr2.
%select0 = select i1 undef, i32 addrspace(5)* undef, i32 addrspace(5)* %ptr2
|
|
|
|
store i32 0, i32 addrspace(5)* %ptr1
|
2016-05-12 09:58:58 +08:00
|
|
|
br label %bb2
|
|
|
|
|
|
|
|
bb2:
|
2018-12-06 01:34:59 +08:00
|
|
|
; Merges %ptr0 (from entry) with %select0 (from bb1).
%phi.ptr = phi i32 addrspace(5)* [ %ptr0, %entry ], [ %select0, %bb1 ]
|
|
|
|
; Final select takes the phi result as one operand and %ptr1 as the other.
%select1 = select i1 undef, i32 addrspace(5)* %phi.ptr, i32 addrspace(5)* %ptr1
|
|
|
|
store i32 0, i32 addrspace(5)* %select1, align 4
|
2016-05-12 09:58:58 +08:00
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2016-05-18 23:57:21 +08:00
|
|
|
; CHECK-LABEL: @select_null_rhs(
|
|
|
|
; CHECK-NOT: alloca
|
|
|
|
; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
|
2017-03-22 05:39:51 +08:00
|
|
|
; Kernel selecting between an alloca pointer (true side) and null (false
; side). The FileCheck directives preceding this definition expect no alloca
; in the output and a select between an addrspace(3) pointer and an
; addrspace(3) null. Uses attribute group #1 rather than #0.
define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
|
2016-05-18 23:57:21 +08:00
|
|
|
bb:
|
2018-12-06 01:34:59 +08:00
|
|
|
%tmp = alloca double, align 8, addrspace(5)
|
|
|
|
; Initialise the slot to 0.0 directly through the alloca pointer.
store double 0.000000e+00, double addrspace(5)* %tmp, align 8
|
2016-05-18 23:57:21 +08:00
|
|
|
%tmp2 = icmp eq i32 %arg1, 0
|
2018-12-06 01:34:59 +08:00
|
|
|
; %tmp when %arg1 == 0, otherwise null.
%tmp3 = select i1 %tmp2, double addrspace(5)* %tmp, double addrspace(5)* null
|
|
|
|
; Store 1.0 through the selected pointer.
store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
|
|
|
|
; Reload from the alloca itself, not from the selected pointer.
%tmp4 = load double, double addrspace(5)* %tmp, align 8
|
2016-05-18 23:57:21 +08:00
|
|
|
; Write the reloaded value out through the addrspace(1) argument.
store double %tmp4, double addrspace(1)* %arg
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
; CHECK-LABEL: @select_null_lhs(
|
|
|
|
; CHECK-NOT: alloca
|
|
|
|
; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
|
2017-03-22 05:39:51 +08:00
|
|
|
; Mirror of @select_null_rhs with the select operands swapped: null is the
; true-side operand and the alloca pointer is the false side. The FileCheck
; directives preceding this definition expect no alloca in the output and a
; select between an addrspace(3) null and an addrspace(3) pointer.
define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
|
2016-05-18 23:57:21 +08:00
|
|
|
bb:
|
2018-12-06 01:34:59 +08:00
|
|
|
%tmp = alloca double, align 8, addrspace(5)
|
|
|
|
; Initialise the slot to 0.0 directly through the alloca pointer.
store double 0.000000e+00, double addrspace(5)* %tmp, align 8
|
2016-05-18 23:57:21 +08:00
|
|
|
%tmp2 = icmp eq i32 %arg1, 0
|
2018-12-06 01:34:59 +08:00
|
|
|
; null when %arg1 == 0, otherwise %tmp.
%tmp3 = select i1 %tmp2, double addrspace(5)* null, double addrspace(5)* %tmp
|
|
|
|
; Store 1.0 through the selected pointer.
store double 1.000000e+00, double addrspace(5)* %tmp3, align 8
|
|
|
|
; Reload from the alloca itself, not from the selected pointer.
%tmp4 = load double, double addrspace(5)* %tmp, align 8
|
2016-05-18 23:57:21 +08:00
|
|
|
; Write the reloaded value out through the addrspace(1) argument.
store double %tmp4, double addrspace(1)* %arg
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2019-08-28 00:34:40 +08:00
|
|
|
; Attribute group #0 is used by the lds_promote*/lds_promoted* kernels in this
; file. It sets "amdgpu-waves-per-eu" to "1,1" and
; "amdgpu-flat-work-group-size" to "1,256".
; NOTE(review): presumably the 256 upper bound is what produces the
; [256 x ...] outer dimension in the expected addrspace(3) globals - confirm.
attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
|
2017-12-01 00:12:24 +08:00
|
|
|
; Attribute group #1 is used by the select_null_* kernels; it carries no
; AMDGPU-specific string attributes.
attributes #1 = { norecurse nounwind }
|