[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
|
|
|
|
# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefix=GFX10 %s
|
|
|
|
|
|
|
|
#
|
2020-05-15 23:02:39 +08:00
|
|
|
# COM: GFX9 tests
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
#
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3
|
|
|
|
name: gfx9_tbuffer_load_x_xyz
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_xyz_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3
|
|
|
|
name: gfx9_tbuffer_load_xyz_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_xy_xy
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3
|
|
|
|
name: gfx9_tbuffer_load_xy_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
%8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_x_xy
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2
|
|
|
|
name: gfx9_tbuffer_load_x_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_xy_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2
|
|
|
|
name: gfx9_tbuffer_load_xy_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_x_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
|
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_x_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_x_x_format_32_32_32_32
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
|
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_x_x_format_32_32_32_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_float_32
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_float_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_sint_32
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_sint_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_uint_32
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_uint_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_data_format_mismatch
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
|
|
|
|
|
|
|
|
name: gfx9_tbuffer_load_not_merged_data_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_num_format_mismatch
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# COM: Input for the num-format-mismatch case: nine TBUFFER_LOAD_FORMAT_X_OFFSET
# COM: loads from descriptor %5 at offsets 4, 8, 16, 20, 24, 28, 36, 40 and 44
# COM: whose format operand alternates between 116 and 84. The check lines for
# COM: this function list the same nine loads, i.e. none is expected to merge.
name: gfx9_tbuffer_load_not_merged_num_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_x_xyz
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input: an X store of %7 at offset 4 (format 116) followed by an XYZ store
# COM: of %14 at offset 8 (format 125), both through descriptor %13. The check
# COM: lines for this function expect a single XYZW store at offset 4 with
# COM: format 126 and a 16-byte memory operand.
name: gfx9_tbuffer_store_x_xyz
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_xyz_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input: an XYZ store of %14 at offset 4 (format 125) followed by an X store
# COM: of %7 at offset 16 (format 116), both through descriptor %13. The check
# COM: lines for this function expect a single XYZW store at offset 4 with
# COM: format 126, with the XYZ data in sub0_sub1_sub2 and the X data in sub3.
name: gfx9_tbuffer_store_xyz_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
|
|
|
|
TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_xy_xy
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input: two XY stores (format 123) of %14 and %15 at offsets 4 and 12,
# COM: both through descriptor %13. The check lines for this function expect a
# COM: single XYZW store at offset 4 with format 126, built from the two
# COM: vreg_64 halves placed in sub0_sub1 and sub2_sub3.
name: gfx9_tbuffer_store_xy_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
%15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_x_xy
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64, %subreg.sub1_sub2
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input: an X store of %7 at offset 4 (format 116) followed by an XY store
# COM: at offset 8 (format 123), both through descriptor %13. The check lines
# COM: for this function expect a single XYZ store at offset 4 with format 125.
# COM: NOTE(review): the XY store reads %15, which has no definition in this
# COM: function (only %14 is built by REG_SEQUENCE). The check line ending in
# COM: "%subreg.sub1_sub2" appears to rely on that undefined operand being
# COM: printed with its class - confirm before changing the operand.
name: gfx9_tbuffer_store_x_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_xy_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input: an XY store of %14 at offset 4 (format 123) followed by an X store
# COM: of %7 at offset 12 (format 116), both through descriptor %13. The check
# COM: lines for this function expect a single XYZ store at offset 4 with
# COM: format 125, with the XY data in sub0_sub1 and the X data in sub2.
name: gfx9_tbuffer_store_xy_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_x_x
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# COM: Input: two X stores (format 116) of %6 and %7 at offsets 4 and 8, both
# COM: through descriptor %13. The check lines for this function expect a
# COM: single XY store at offset 4 with format 123 and an 8-byte memory operand.
name: gfx9_tbuffer_store_x_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_x_x_format_32_32_32_32
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# COM: Input: two X stores of %6 and %7 at offsets 4 and 8 whose format operand
# COM: is 126 (rather than the 116 used by the plain x_x case), both through
# COM: descriptor %13. The check lines for this function still expect a single
# COM: XY store at offset 4 with format 123.
name: gfx9_tbuffer_store_x_x_format_32_32_32_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_float32
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 123, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 126, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 125, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input: nine X stores with format operand 116, storing %4..%12 through
# COM: descriptor %13 at offsets 4, 8, 16, 20, 24, 28, 36, 40 and 44. The check
# COM: lines for this function expect three merged stores: XY at offset 4
# COM: (format 123), XYZW at offset 16 (format 126) and XYZ at offset 36
# COM: (format 125).
name: gfx9_tbuffer_store_float32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_sint32
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 91, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 94, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 93, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input: nine X stores with format operand 84, storing %4..%12 through
# COM: descriptor %13 at offsets 4, 8, 16, 20, 24, 28, 36, 40 and 44. The check
# COM: lines for this function expect three merged stores: XY at offset 4
# COM: (format 91), XYZW at offset 16 (format 94) and XYZ at offset 36
# COM: (format 93).
name: gfx9_tbuffer_store_sint32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_uint32
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX9: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 78, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store r, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX9: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input for the uint32 store case: nine single-dword
# COM: TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to the same resource (%13),
# COM: all with format immediate 68, at offsets 4, 8, 16, 20, 24, 28, 36, 40, 44.
# COM: The offsets form three contiguous runs (4..8, 16..28, 36..44); the checks
# COM: preceding this function expect an XYZW store at offset 16 (format 78) and
# COM: an XYZ store at offset 36 (format 77).
name: gfx9_tbuffer_store_uint32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_data_format_mismatch
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# COM: Negative test: nine single-dword stores to the same resource (%13) at
# COM: offsets 4, 8, 16, 20, 24, 28, 36, 40, 44 whose format immediate alternates
# COM: between 116 and 84, so no two offset-adjacent stores share a format.
# COM: The checks preceding this function expect all nine X stores to remain.
# COM: NOTE(review): the checks use different register numbers than this body
# COM: (e.g. the offset-4 store data is %8 there and %4 here) - presumably the
# COM: MIR parser renumbers vregs in definition order; confirm.
# COM: NOTE(review): 116 and 84 have identical bits [3:0] and differ in bits
# COM: [6:4]. If GFX9 packs dfmt in [3:0] and nfmt in [6:4], this pair is an
# COM: nfmt mismatch, not the dfmt mismatch the name suggests - TODO confirm.
name: gfx9_tbuffer_store_not_merged_data_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 84, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_store_not_merged_num_format_mismatch
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX9: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX9: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# COM: Negative test: nine single-dword stores to the same resource (%13) at
# COM: offsets 4, 8, 16, 20, 24, 28, 36, 40, 44 whose format immediate alternates
# COM: between 116 and 114, so no two offset-adjacent stores share a format.
# COM: The checks preceding this function expect all nine X stores to remain.
# COM: NOTE(review): the checks use different register numbers than this body
# COM: (e.g. the offset-4 store data is %8 there and %4 here) - presumably the
# COM: MIR parser renumbers vregs in definition order; confirm.
# COM: NOTE(review): 116 and 114 have identical bits [6:4] and differ in bits
# COM: [3:0]. If GFX9 packs dfmt in [3:0] and nfmt in [6:4], this pair is a
# COM: dfmt mismatch, not the nfmt mismatch the name suggests - TODO confirm.
name: gfx9_tbuffer_store_not_merged_num_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 114, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_0
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# COM: Negative test: two single-dword loads from the same resource (%5) at
# COM: adjacent offsets 4 and 8 with the same format immediate (116). They
# COM: differ only in the last immediate operand, which is 1 on the first load
# COM: and 0 on the second - presumably the swizzle flag, going by the test
# COM: name (confirm against the instruction definition).
# COM: The checks preceding this function expect both X loads to remain.
name: gfx9_tbuffer_load_not_merged_swizzled_0
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX9-LABEL: name: gfx9_tbuffer_load_not_merged_swizzled_1
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# COM: Negative test: two single-dword loads from the same resource (%5) at
# COM: adjacent offsets 4 and 8 with the same format immediate (116). They
# COM: differ only in the last immediate operand, which is 0 on the first load
# COM: and 1 on the second - presumably the swizzle flag, going by the test
# COM: name (confirm against the instruction definition).
# COM: The checks preceding this function expect both X loads to remain.
name: gfx9_tbuffer_load_not_merged_swizzled_1
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 116, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 116, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
#
|
2020-05-15 23:02:39 +08:00
|
|
|
# COM: GFX10 tests
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
#
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_x_xyz
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = COPY killed %7.sub1_sub2_sub3
|
|
|
|
# COM: A one-dword X load at offset 4 (format 22) followed by a three-dword XYZ
# COM: load at offset 8 (format 74), both from resource %5. The checks preceding
# COM: this function expect one XYZW load at offset 4 with format 77, then
# COM: copies of sub0 and sub1_sub2_sub3 of the result.
name: gfx10_tbuffer_load_x_xyz
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_xyz_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = COPY %7.sub0_sub1_sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub3
|
|
|
|
# COM: A three-dword XYZ load at offset 4 (format 74) followed by a one-dword X
# COM: load at offset 16 (format 22), both from resource %5. The checks preceding
# COM: this function expect one XYZW load at offset 4 with format 77, then
# COM: copies of sub0_sub1_sub2 and sub3 of the result.
# COM: The memory operands give 12 bytes for the XYZ load and 4 bytes for the X
# COM: load, matching the other tests in this file; the sum stays 16.
name: gfx10_tbuffer_load_xyz_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %5:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_xy_xy
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub2_sub3
|
|
|
|
# COM: Two two-dword XY loads from resource %5 at offsets 4 and 12, both with
# COM: format 64. The checks preceding this function expect one XYZW load at
# COM: offset 4 with format 77, then copies of sub0_sub1 and sub2_sub3.
name: gfx10_tbuffer_load_xy_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
%8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_x_xy
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY killed %7.sub1_sub2
|
|
|
|
# COM: A one-dword X load at offset 4 (format 22) followed by a two-dword XY
# COM: load at offset 8 (format 64), both from resource %5. The checks preceding
# COM: this function expect one XYZ load at offset 4 with format 74, then
# COM: copies of sub0 and sub1_sub2 of the result.
name: gfx10_tbuffer_load_x_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_xy_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %7.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub2
|
|
|
|
# COM: A two-dword XY load at offset 4 (format 64) followed by a one-dword X
# COM: load at offset 12 (format 22), both from resource %5. The checks preceding
# COM: this function expect one XYZ load at offset 4 with format 74, then
# COM: copies of sub0_sub1 and sub2 of the result.
name: gfx10_tbuffer_load_xy_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %5:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_x_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
|
|
|
|
|
|
|
|
# COM: Two one-dword X loads from resource %5 at offsets 4 and 8, both with
# COM: format 22. The checks preceding this function expect one XY load at
# COM: offset 4 with format 64, then copies of sub0 and sub1 of the result.
name: gfx10_tbuffer_load_x_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_x_x_format_32_32_32_32
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %7.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %7.sub1
|
|
|
|
|
|
|
|
# COM: Two one-dword X loads from resource %5 at offsets 4 and 8 that both
# COM: carry format 77, the value the other tests in this file show on XYZW
# COM: loads. The checks preceding this function still expect one XY load at
# COM: offset 4 with format 64, then copies of sub0 and sub1 of the result.
name: gfx10_tbuffer_load_x_x_format_32_32_32_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_float_32
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
# COM: Nine one-dword X loads from resource %5, all with format 22, at offsets
# COM: 4, 8, 16, 20, 24, 28, 36, 40, 44. The offsets form three contiguous runs
# COM: (4..8, 16..28, 36..44). The checks preceding this function expect an XY
# COM: load at offset 4 (format 64), an XYZW load at offset 16 (format 77) and
# COM: an XYZ load at offset 36 (format 74), each followed by subregister copies.
name: gfx10_tbuffer_load_float_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_sint_32
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
# COM: Input for gfx10_tbuffer_load_sint_32: nine single-dword TBUFFER_LOAD_FORMAT_X_OFFSET loads from the
# COM: same %5 resource, each carrying 21 in the operand after the immediate (presumably the format
# COM: encoding; the test name suggests 32-bit SINT -- confirm against the GFX10 format table).
# COM: The immediates form three runs of adjacent dwords: 4,8 / 16,20,24,28 / 36,40,44.
name: gfx10_tbuffer_load_sint_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_uint_32
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %14.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %14.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET %4, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16, align 1, addrspace 4)
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = COPY %17.sub0_sub1_sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %17.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %16.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %16.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %15.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %15.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET %4, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 12, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = COPY %19.sub0_sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %19.sub2
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY %18.sub0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY killed %18.sub1
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
|
|
|
|
# COM: Input for gfx10_tbuffer_load_uint_32: nine single-dword TBUFFER_LOAD_FORMAT_X_OFFSET loads from the
# COM: same %5 resource, each carrying 20 in the operand after the immediate (presumably the format
# COM: encoding; the test name suggests 32-bit UINT -- confirm against the GFX10 format table).
# COM: The immediates form three runs of adjacent dwords: 4,8 / 16,20,24,28 / 36,40,44.
name: gfx10_tbuffer_load_uint_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_data_format_mismatch
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
|
|
|
|
|
|
|
|
# COM: Input for gfx10_tbuffer_load_not_merged_data_format_mismatch: nine single-dword loads from %5 whose
# COM: operand after the immediate alternates between 22 and 13 from one load to the next, so no two
# COM: neighbouring loads carry the same value there (presumably the format encoding; the test name says
# COM: the data format differs -- confirm against the GFX10 format table).
name: gfx10_tbuffer_load_not_merged_data_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_num_format_mismatch
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_load_not_merged_num_format_mismatch: nine single-dword loads from %5 whose
# COM: operand after the immediate alternates between 22 and 21 from one load to the next, so no two
# COM: neighbouring loads carry the same value there (presumably the format encoding; the test name says
# COM: the numeric format differs -- confirm against the GFX10 format table).
name: gfx10_tbuffer_load_not_merged_num_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%9:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%10:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%11:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%12:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%13:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%14:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%15:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_x_xyz
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %9, %subreg.sub1_sub2_sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_x_xyz: a 4-byte TBUFFER_STORE_FORMAT_X_OFFSET_exact of %7 with
# COM: immediate 4, followed by a 12-byte XYZ store of %14 (built from %4, %5, %6) with immediate 8.
# COM: Both stores use resource %13; the immediates are presumably byte offsets, which would make the
# COM: two ranges adjacent -- confirm operand order against the TBUFFER instruction definition.
name: gfx10_tbuffer_store_x_xyz
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_xyz_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1, %1, %subreg.sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1_sub2, %0, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %10, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_xyz_x: a 12-byte TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact of %14
# COM: (built from %4, %5, %6) with immediate 4, followed by a 4-byte X store of %7 with immediate 16.
# COM: Both stores use resource %13; the immediates are presumably byte offsets, which would make the
# COM: two ranges adjacent -- confirm operand order against the TBUFFER instruction definition.
name: gfx10_tbuffer_store_xyz_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
|
|
|
|
TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_xy_xy
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE %9, %subreg.sub0_sub1, %10, %subreg.sub2_sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %11, %8, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_xy_xy: two 8-byte TBUFFER_STORE_FORMAT_XY_OFFSET_exact stores to
# COM: resource %13 -- %14 (from %4, %5) with immediate 4 and %15 (from %6, %7) with immediate 12.
# COM: The immediates are presumably byte offsets, which would make the two ranges adjacent --
# COM: confirm operand order against the TBUFFER instruction definition.
name: gfx10_tbuffer_store_xy_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
%15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# COM: gfx10_tbuffer_store_x_xy: a 4-byte X store of %7 (immediate 4) followed by an 8-byte XY store of
# COM: %15 (immediate 8) to the same resource; the checks expect a single 12-byte XYZ store at 4.
# COM: Both vreg_64 REG_SEQUENCEs (%14, %15) must be present so that the merged vreg_96 is %11, as the
# COM: final store check requires.
# GFX10-LABEL: name: gfx10_tbuffer_store_x_xy
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %0, %subreg.sub0, %10, %subreg.sub1_sub2
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %11, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
name: gfx10_tbuffer_store_x_xy
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
%15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_xy_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %2, %subreg.sub1
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE %9, %subreg.sub0_sub1, %0, %subreg.sub2
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %10, %8, 0, 4, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_xy_x: an 8-byte TBUFFER_STORE_FORMAT_XY_OFFSET_exact of %14
# COM: (from %4, %5) with immediate 4, followed by a 4-byte X store of %7 with immediate 12, both to
# COM: resource %13. The block declares its live-ins once, like the other store tests in this file.
name: gfx10_tbuffer_store_xy_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
|
|
|
|
TBUFFER_STORE_FORMAT_XY_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_x_x
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_x_x: two 4-byte TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to
# COM: resource %13 -- %6 with immediate 4 and %7 with immediate 8, both carrying 22 in the following
# COM: operand (presumably the format encoding -- confirm). %4 and %5 are copied but not stored.
name: gfx10_tbuffer_store_x_x
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_x_x_format_32_32_32_32
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %1, %subreg.sub0, %0, %subreg.sub1
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %9, %8, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# COM: Input for gfx10_tbuffer_store_x_x_format_32_32_32_32: two 4-byte
# COM: TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to resource %13 -- %6 with immediate 4 and %7 with
# COM: immediate 8 -- that both carry 77 in the following operand instead of the 22 used by the other
# COM: X stores in this file (presumably a four-component 32-bit format encoding, per the test name --
# COM: confirm against the GFX10 format table).
name: gfx10_tbuffer_store_x_x_format_32_32_32_32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_float32
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 77, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 74, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 "float32" tbuffer store-merge test.
# Nine TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to the same resource (%13),
# every one carrying 22 in the immediate that follows the offset (presumably
# the buffer format -- confirm), at offsets 4/8, 16/20/24/28 and 36/40/44.
# The GFX10 check lines for this test expect the 16..28 group to become one
# XYZW store (immediate 77) and the 36..44 group one XYZ store (immediate 74).
# NOTE(review): the check lines reference register numbers that differ from
# the ones written here, so keep the definition order below unchanged.
name: gfx10_tbuffer_store_float32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_sint32
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 63, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 76, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 73, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 "sint32" tbuffer store-merge test.
# Nine TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to the same resource (%13),
# every one carrying 21 in the immediate that follows the offset (presumably
# the buffer format -- confirm), at offsets 4/8, 16/20/24/28 and 36/40/44.
# The GFX10 check lines for this test expect three merged stores:
#   XY   at offset 4  (immediate 63),
#   XYZW at offset 16 (immediate 76),
#   XYZ  at offset 36 (immediate 73).
# NOTE(review): the check lines use different register numbers than this
# input (e.g. the sgpr_128 REG_SEQUENCE is checked as %12, %11, %10, %9),
# presumably because registers are renumbered in definition order -- keep the
# definition order below unchanged.
name: gfx10_tbuffer_store_sint32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_uint32
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %8, %subreg.sub0, %7, %subreg.sub1
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XY_OFFSET_exact killed %14, %13, 0, 4, 62, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %6, %subreg.sub0, %5, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %15, %subreg.sub0_sub1, %4, %subreg.sub2
|
|
|
|
# GFX10: %{{[0-9]+}}:vreg_128 = REG_SEQUENCE killed %16, %subreg.sub0_sub1_sub2, %3, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed %17, %13, 0, 16, 75, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16, align 1, addrspace 4)
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %1, %subreg.sub1
|
AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:
store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr
Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:
store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr
With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block. We now transform this sequence into:
store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16
Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.
Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin
Reviewed By: arsenm, nhaehnle
Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D65966
2020-01-25 05:07:08 +08:00
|
|
|
# GFX10: %{{[0-9]+}}:vreg_96 = REG_SEQUENCE killed %18, %subreg.sub0_sub1, %0, %subreg.sub2
|
[AMDGPU][SILoadStoreOptimizer] Merge TBUFFER loads/stores
Summary: Extend SILoadStoreOptimizer to merge tbuffer loads and stores.
Reviewers: nhaehnle
Reviewed By: nhaehnle
Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69794
2019-11-21 05:30:02 +08:00
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_XYZ_OFFSET_exact killed %19, %13, 0, 36, 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 "uint32" tbuffer store-merge test.
# Nine TBUFFER_STORE_FORMAT_X_OFFSET_exact stores to the same resource (%13),
# every one carrying 20 in the immediate that follows the offset (presumably
# the buffer format -- confirm), at offsets 4/8, 16/20/24/28 and 36/40/44.
# The GFX10 check lines for this test expect three merged stores:
#   XY   at offset 4  (immediate 62),
#   XYZW at offset 16 (immediate 75),
#   XYZ  at offset 36 (immediate 72).
# NOTE(review): the check lines use different register numbers than this
# input (e.g. the sgpr_128 REG_SEQUENCE is checked as %12, %11, %10, %9),
# presumably because registers are renumbered in definition order -- keep the
# definition order below unchanged.
name: gfx10_tbuffer_store_uint32
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 20, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_data_format_mismatch
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 negative store test "data_format_mismatch".
# Same nine-store layout as the merge tests (offsets 4/8, 16..28, 36..44 on
# resource %13), but the immediate that follows the offset alternates between
# 22 and 21 from one store to the next (presumably two formats differing in
# data format, per the test name -- confirm).
# The GFX10 check lines for this test expect all nine stores to remain
# individual TBUFFER_STORE_FORMAT_X_OFFSET_exact instructions with their
# original offsets and immediates, i.e. nothing is merged.
# NOTE(review): the check lines use different register numbers than this
# input (the store of %4 is checked as %8, and so on) -- keep the definition
# order below unchanged.
name: gfx10_tbuffer_store_not_merged_data_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 21, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_store_not_merged_num_format_mismatch
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr8
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr7
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr6
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr5
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr4
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = COPY $vgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr3
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr2
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr1
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_32 = COPY $sgpr0
|
|
|
|
# GFX10: %{{[0-9]+}}:sgpr_128 = REG_SEQUENCE %12, %subreg.sub0, %11, %subreg.sub1, %10, %subreg.sub2, %9, %subreg.sub3
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %8, %13, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %7, %13, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %6, %13, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %5, %13, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %4, %13, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %3, %13, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %2, %13, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %1, %13, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# GFX10: TBUFFER_STORE_FORMAT_X_OFFSET_exact %0, %13, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 negative store test "num_format_mismatch".
# Same nine-store layout as the merge tests (offsets 4/8, 16..28, 36..44 on
# resource %13), but the immediate that follows the offset alternates between
# 22 and 13 from one store to the next (presumably two formats differing in
# numeric format, per the test name -- confirm).
# The GFX10 check lines for this test expect all nine stores to remain
# individual TBUFFER_STORE_FORMAT_X_OFFSET_exact instructions with their
# original offsets and immediates, i.e. nothing is merged.
# NOTE(review): the check lines use different register numbers than this
# input (the store of %4 is checked as %8, and so on) -- keep the definition
# order below unchanged.
name: gfx10_tbuffer_store_not_merged_num_format_mismatch
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
|
|
|
|
%12:vgpr_32 = COPY $vgpr8
|
|
|
|
%11:vgpr_32 = COPY $vgpr7
|
|
|
|
%10:vgpr_32 = COPY $vgpr6
|
|
|
|
%9:vgpr_32 = COPY $vgpr5
|
|
|
|
%8:vgpr_32 = COPY $vgpr4
|
|
|
|
%7:vgpr_32 = COPY $vgpr3
|
|
|
|
%6:vgpr_32 = COPY $vgpr2
|
|
|
|
%5:vgpr_32 = COPY $vgpr1
|
|
|
|
%4:vgpr_32 = COPY $vgpr0
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 13, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
TBUFFER_STORE_FORMAT_X_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_0
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 negative load test "swizzled_0".
# Two TBUFFER_LOAD_FORMAT_X_OFFSET loads from the same resource (%5) at
# adjacent offsets 4 and 8, both with 22 in the immediate after the offset.
# They differ only in the last immediate operand: 1 on the first load, 0 on
# the second (presumably the swizzle bit, per the test name -- confirm).
# The GFX10 check lines for this test expect both loads to survive as
# separate single-component loads with those operands unchanged.
name: gfx10_tbuffer_load_not_merged_swizzled_0
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|
|
|
|
|
|
|
|
# GFX10-LABEL: name: gfx10_tbuffer_load_not_merged_swizzled_1
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
# MIR input for the GFX10 negative load test "swizzled_1".
# Two TBUFFER_LOAD_FORMAT_X_OFFSET loads from the same resource (%5) at
# adjacent offsets 4 and 8, both with 22 in the immediate after the offset.
# They differ only in the last immediate operand: 0 on the first load, 1 on
# the second (presumably the swizzle bit, per the test name -- confirm).
# The GFX10 check lines for this test expect both loads to survive as
# separate single-component loads with those operands unchanged.
name: gfx10_tbuffer_load_not_merged_swizzled_1
|
|
|
|
body: |
|
|
|
|
bb.0.entry:
|
|
|
|
%0:sgpr_32 = COPY $sgpr0
|
|
|
|
%1:sgpr_32 = COPY $sgpr1
|
|
|
|
%2:sgpr_32 = COPY $sgpr2
|
|
|
|
%3:sgpr_32 = COPY $sgpr3
|
|
|
|
%5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
|
|
|
|
%7:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 4, 22, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
%8:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %5:sgpr_128, 0, 8, 22, 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable load 4, align 1, addrspace 4)
|
|
|
|
...
|
|
|
|
---
|
|
|
|
|