; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
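; The function tests bit (fptosi(%arg0) & 7) of the first dword returned by
; image_load and selects %arg1 when that bit is set (0.0 otherwise), then packs
; the result with v_cvt_pkrtz. The checks expect the shift to be applied to the
; loaded value rather than to the constant 1, i.e. the
; (X & (1 << Y)) == 0 --> ((X >> Y) & 1) == 0 codegen fold from D62871
; (v_lshr_b32 on SI, v_lshrrev_b32 on VI).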
define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
; SI-LABEL: main:
; SI: ; %bb.0: ; %bb
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_mov_b32 s2, s0
; SI-NEXT: s_mov_b32 s3, s0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s0
; SI-NEXT: s_mov_b32 s6, s0
; SI-NEXT: s_mov_b32 s7, s0
; SI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; SI-NEXT: v_and_b32_e32 v0, 7, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshr_b32_e32 v0, v2, v0
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; SI-NEXT: v_cvt_pkrtz_f16_f32_e32 v0, s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: main:
; VI: ; %bb.0: ; %bb
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s0, 0
; VI-NEXT: s_mov_b32 s1, s0
; VI-NEXT: s_mov_b32 s2, s0
; VI-NEXT: s_mov_b32 s3, s0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s0
; VI-NEXT: s_mov_b32 s6, s0
; VI-NEXT: s_mov_b32 s7, s0
; VI-NEXT: image_load v2, v0, s[0:7] dmask:0x1 unorm
; VI-NEXT: v_and_b32_e32 v0, 7, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; VI-NEXT: v_and_b32_e32 v0, 1, v0
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, s0, v0
; VI-NEXT: ; return to shader part epilog
bb:
%tmp = fptosi float %arg0 to i32
%tmp1 = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
%tmp2.f = extractelement <4 x float> %tmp1, i32 0
%tmp2 = bitcast float %tmp2.f to i32
%tmp3 = and i32 %tmp, 7
%tmp4 = shl i32 1, %tmp3
%tmp5 = and i32 %tmp2, %tmp4
%tmp6 = icmp eq i32 %tmp5, 0
%tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
%tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7)
%tmp9 = bitcast <2 x half> %tmp8 to float
ret float %tmp9
}
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readonly }