Revert "[AMDGPU/MemOpsCluster] Implement new heuristic for computing max mem ops cluster size"

This reverts commit cc9d693856.
hsmahesha 2020-07-17 11:40:10 +05:30
parent f76a0cd97a
commit 4905536086
14 changed files with 565 additions and 523 deletions
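For orientation (an editorial note, not part of the commit message): shouldClusterMemOps is the AMDGPU implementation of the target hook consulted by the machine scheduler's load/store-clustering mutation, so changing its policy reorders memory operations in many scheduling-sensitive codegen tests — which is why a one-function revert touches this many .ll files. Roughly, a target opts into that mutation as sketched below; the function name is invented for illustration and the exact wiring in the AMDGPU backend at this revision may differ.

// Sketch: how a target typically enables load clustering in the MachineScheduler.
// The mutation keeps two memory ops adjacent only when
// TII->shouldClusterMemOps(...) — the hook changed by this revert — agrees.
#include "llvm/CodeGen/MachineScheduler.h"
using namespace llvm;

static ScheduleDAGInstrs *createClusteringSchedulerSketch(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = createGenericSchedLive(C);
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
  return DAG;
}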

View File

@@ -474,27 +474,65 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                       ArrayRef<const MachineOperand *> BaseOps2,
                                       unsigned NumLoads,
                                       unsigned NumBytes) const {
-  // If current mem ops pair do not have same base pointer, then they cannot be
-  // clustered.
   assert(!BaseOps1.empty() && !BaseOps2.empty());
   const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
   const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
     return false;
 
-  // Compute max cluster size based on average number bytes clustered till now,
-  // and decide based on it, if current mem ops pair can be clustered or not.
-  assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
-         "Invalid NumLoads/NumBytes values");
-  unsigned MaxNumLoads;
-  if (NumBytes <= 4 * NumLoads) {
-    // Loads are dword or smaller (on average).
-    MaxNumLoads = 5;
-  } else {
-    // Loads are bigger than a dword (on average).
-    MaxNumLoads = 4;
+  const MachineOperand *FirstDst = nullptr;
+  const MachineOperand *SecondDst = nullptr;
+
+  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
+      (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) ||
+      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
+    const unsigned MaxGlobalLoadCluster = 7;
+    if (NumLoads > MaxGlobalLoadCluster)
+      return false;
+
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+    if (!FirstDst)
+      FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+    if (!SecondDst)
+      SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+  } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+  } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
   }
-  return NumLoads <= MaxNumLoads;
+
+  if (!FirstDst || !SecondDst)
+    return false;
+
+  // Try to limit clustering based on the total number of bytes loaded
+  // rather than the number of instructions. This is done to help reduce
+  // register pressure. The method used is somewhat inexact, though,
+  // because it assumes that all loads in the cluster will load the
+  // same number of bytes as FirstLdSt.
+
+  // The unit of this value is bytes.
+  // FIXME: This needs finer tuning.
+  unsigned LoadClusterThreshold = 16;
+
+  const MachineRegisterInfo &MRI =
+      FirstLdSt.getParent()->getParent()->getRegInfo();
+
+  const Register Reg = FirstDst->getReg();
+  const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
+                                         ? MRI.getRegClass(Reg)
+                                         : RI.getPhysRegClass(Reg);
+
+  // FIXME: NumLoads should not be subtracted 1. This is to match behavior
+  // of clusterNeighboringMemOps which was previosly passing cluster length
+  // less 1. LoadClusterThreshold should be tuned instead.
+  return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <=
+         LoadClusterThreshold;
 }
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
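To make the two policies in the hunk above easier to compare, here is a small standalone sketch of just the decision rules. It is illustrative, not the SIInstrInfo code: the register-class size lookup is replaced by a caller-supplied byte width, and all names are invented for this example.

#include <cassert>
#include <cstdio>

// Reverted heuristic: cap the cluster length by the average load size seen so
// far. Clusters averaging a dword (4 bytes) or less per load may grow to 5
// loads; anything larger is capped at 4.
static bool clusterByAverageSize(unsigned NumLoads, unsigned NumBytes) {
  assert(NumLoads > 0 && NumBytes >= NumLoads && "invalid cluster state");
  unsigned MaxNumLoads = (NumBytes <= 4 * NumLoads) ? 5 : 4;
  return NumLoads <= MaxNumLoads;
}

// Restored rule: cap the cluster by an estimate of the destination bytes,
// assuming every load is as wide as the first one (DstBytes), against a
// 16-byte threshold; NumLoads - 1 mirrors the FIXME in the code above.
static bool clusterByByteThreshold(unsigned NumLoads, unsigned DstBytes) {
  const unsigned LoadClusterThreshold = 16;
  return (NumLoads - 1) * DstBytes <= LoadClusterThreshold;
}

int main() {
  // Four dwordx2 loads (8-byte destinations, 32 bytes in total): the reverted
  // heuristic still clusters them, the restored rule stops at three.
  std::printf("average-size rule: %d\n", clusterByAverageSize(4, 32));  // 1
  std::printf("byte threshold:    %d\n", clusterByByteThreshold(4, 8)); // 0
}

Run on those numbers, the sketch is consistent with several of the test updates below checking shorter runs of adjacent wide loads after the revert.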

View File

@ -235,17 +235,17 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32]
; ;
; GFX8-LABEL: test_div_fmas_f32: ; GFX8-LABEL: test_div_fmas_f32:
; GFX8: ; %bb.0: ; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8 ; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c ; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 ; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94
; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 ; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, 1, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: s_and_b32 s2, 1, s5
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0
@ -527,43 +527,43 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o
define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) {
; GFX7-LABEL: test_div_fmas_f64: ; GFX7-LABEL: test_div_fmas_f64:
; GFX7: ; %bb.0: ; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 ; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v2, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: v_mov_b32_e32 v4, s10 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_and_b32 s0, 1, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: s_and_b32 s2, 1, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_mov_b32_e32 v5, s7
; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX7-NEXT: s_nop 3 ; GFX7-NEXT: s_nop 3
; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s1
; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_endpgm ; GFX7-NEXT: s_endpgm
; ;
; GFX8-LABEL: test_div_fmas_f64: ; GFX8-LABEL: test_div_fmas_f64:
; GFX8: ; %bb.0: ; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s10 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_and_b32 s0, 1, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_and_b32 s2, 1, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2
; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: s_nop 3
; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] ; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm ; GFX8-NEXT: s_endpgm
; ;
; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32-LABEL: test_div_fmas_f64:
; GFX10_W32: ; %bb.0: ; GFX10_W32: ; %bb.0:
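A rough reading of the scalar-load shuffling in this test (illustrative arithmetic only, assuming the byte threshold restored above and the usual SGPR destination sizes): an s_load_dword writes a 32-bit sdst, so (5 - 1) * 4 = 16 <= 16 still admits clusters of up to five scalar dword loads, whereas an s_load_dwordx8 writes 32 bytes and already fails for a pair, since (2 - 1) * 32 = 32 > 16. The reverted heuristic instead capped clusters at five loads when they averaged a dword or less and four otherwise, so mixed scalar loads could stay grouped differently — one plausible reason the order of the s_load checks changes here.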

View File

@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
 ; TRAP-HANDLER-ENABLE: NumSgprs: 61
-; TRAP-HANDLER-DISABLE: NumSgprs: 77
+; TRAP-HANDLER-DISABLE: NumSgprs: 79
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
   i32 addrspace(1)* %out0, i32 %in0,
   i32 addrspace(1)* %out1, i32 %in1,

View File

@@ -46,8 +46,8 @@ entry:
 ; Test various offset boundaries.
 ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
-; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
 ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}}
 %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
 %load11 = load i64, i64 addrspace(1)* %gep11
 %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023

View File

@@ -681,27 +681,27 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %
;
; VI-LABEL: dynamic_insertelement_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dword s4, s[4:5], 0x44
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s6, 3
; VI-NEXT: s_cselect_b32 s5, s4, s11
; VI-NEXT: s_cmp_eq_u32 s6, 2
; VI-NEXT: s_cselect_b32 s7, s4, s10
; VI-NEXT: s_cmp_eq_u32 s6, 1
; VI-NEXT: s_cselect_b32 s9, s4, s9
; VI-NEXT: s_cmp_eq_u32 s6, 0
; VI-NEXT: s_cselect_b32 s4, s4, s8
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
 ret void

View File

@@ -855,10 +855,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 ; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
 ; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
 %val0 = extractvalue <{i32, i64}> %arg0, 0
 %val1 = extractvalue <{i32, i64}> %arg0, 1

View File

@ -51,38 +51,38 @@ bb:
define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
; GCN-LABEL: scalar_clause: ; GCN-LABEL: scalar_clause:
; GCN: ; %bb.0: ; %bb ; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c ; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10
; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 ; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20
; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30
; GCN-NEXT: v_mov_b32_e32 v16, s18 ; GCN-NEXT: v_mov_b32_e32 v12, s18
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v4, s4 ; GCN-NEXT: v_mov_b32_e32 v4, s4
; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v8, s8
; GCN-NEXT: v_mov_b32_e32 v12, s12 ; GCN-NEXT: v_mov_b32_e32 v13, s19
; GCN-NEXT: v_mov_b32_e32 v17, s19 ; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v5, s5
; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6
; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7
; GCN-NEXT: v_mov_b32_e32 v7, s7 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off
; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v11, s11 ; GCN-NEXT: v_mov_b32_e32 v9, s9
; GCN-NEXT: v_mov_b32_e32 v13, s13 ; GCN-NEXT: v_mov_b32_e32 v10, s10
; GCN-NEXT: v_mov_b32_e32 v14, s14 ; GCN-NEXT: v_mov_b32_e32 v11, s11
; GCN-NEXT: v_mov_b32_e32 v15, s15 ; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off ; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 ; GCN-NEXT: v_mov_b32_e32 v3, s15
; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32
; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 ; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
bb: bb:
%tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16
%tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1 %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1

View File

@ -17,8 +17,8 @@ define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; ;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
@ -86,7 +86,6 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; ;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@ -96,8 +95,10 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; ;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@ -299,9 +300,9 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; ;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; ;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
@ -455,11 +456,11 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; ;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; ;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

View File

@@ -203,10 +203,10 @@ entry:
 ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
 ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480
-; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
+; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
 ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}

View File

@ -11,27 +11,28 @@
define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
; SI-LABEL: sgpr_if_else_salu_br: ; SI-LABEL: sgpr_if_else_salu_br:
; SI: ; %bb.0: ; %entry ; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s2, s[0:1], 0xf ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_load_dword s0, s[0:1], 0xf
; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s4, 0 ; SI-NEXT: s_cmp_lg_u32 s8, 0
; SI-NEXT: s_cbranch_scc0 BB0_2 ; SI-NEXT: s_cbranch_scc0 BB0_2
; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT:; %bb.1: ; %else
; SI-NEXT: s_add_i32 s2, s7, s2 ; SI-NEXT: s_add_i32 s0, s11, s0
; SI-NEXT: s_cbranch_execz BB0_3 ; SI-NEXT: s_cbranch_execz BB0_3
; SI-NEXT: s_branch BB0_4 ; SI-NEXT: s_branch BB0_4
; SI-NEXT: BB0_2: ; SI-NEXT:BB0_2:
; SI-NEXT: ; implicit-def: $sgpr2 ; SI-NEXT: ; implicit-def: $sgpr0
; SI-NEXT: BB0_3: ; %if ; SI-NEXT:BB0_3: ; %if
; SI-NEXT: s_sub_i32 s2, s5, s6 ; SI-NEXT: s_sub_i32 s0, s9, s10
; SI-NEXT: BB0_4: ; %endif ; SI-NEXT:BB0_4: ; %endif
; SI-NEXT: s_add_i32 s4, s2, s4 ; SI-NEXT: s_add_i32 s0, s0, s8
; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm ; SI-NEXT: s_endpgm
entry: entry:
%0 = icmp eq i32 %a, 0 %0 = icmp eq i32 %a, 0
br i1 %0, label %if, label %else br i1 %0, label %if, label %else
@ -54,32 +55,33 @@ endif:
define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) {
; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI-LABEL: sgpr_if_else_salu_br_opt:
; SI: ; %bb.0: ; %entry ; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dword s2, s[0:1], 0x13 ; SI-NEXT: s_load_dword s2, s[0:1], 0x13
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_lg_u32 s2, 0 ; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_cbranch_scc0 BB1_2 ; SI-NEXT: s_cbranch_scc0 BB1_2
; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT:; %bb.1: ; %else
; SI-NEXT: s_load_dword s3, s[0:1], 0x2e ; SI-NEXT: s_load_dword s3, s[0:1], 0x2e
; SI-NEXT: s_load_dword s6, s[0:1], 0x37 ; SI-NEXT: s_load_dword s6, s[0:1], 0x37
; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s3, s3, s6 ; SI-NEXT: s_add_i32 s3, s3, s6
; SI-NEXT: s_cbranch_execz BB1_3 ; SI-NEXT: s_cbranch_execz BB1_3
; SI-NEXT: s_branch BB1_4 ; SI-NEXT: s_branch BB1_4
; SI-NEXT: BB1_2: ; SI-NEXT:BB1_2:
; SI-NEXT: ; implicit-def: $sgpr3 ; SI-NEXT: ; implicit-def: $sgpr3
; SI-NEXT: BB1_3: ; %if ; SI-NEXT:BB1_3: ; %if
; SI-NEXT: s_load_dword s3, s[0:1], 0x1c ; SI-NEXT: s_load_dword s3, s[0:1], 0x1c
; SI-NEXT: s_load_dword s0, s[0:1], 0x25 ; SI-NEXT: s_load_dword s0, s[0:1], 0x25
; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_i32 s3, s3, s0 ; SI-NEXT: s_add_i32 s3, s3, s0
; SI-NEXT: BB1_4: ; %endif ; SI-NEXT:BB1_4: ; %endif
; SI-NEXT: s_add_i32 s0, s3, s2 ; SI-NEXT: s_add_i32 s0, s3, s2
; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm ; SI-NEXT: s_endpgm
entry: entry:
%cmp0 = icmp eq i32 %a, 0 %cmp0 = icmp eq i32 %a, 0
br i1 %cmp0, label %if, label %else br i1 %cmp0, label %if, label %else

View File

@ -446,68 +446,68 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss: ; GCN-LABEL: s_shl_v2i128ss:
; GCN: ; %bb.0: ; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16
; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 ; GCN-NEXT: s_sub_i32 s4, s16, 64
; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6
; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 ; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16
; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] ; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4
; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s11
; GCN-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s10
; GCN-NEXT: v_mov_b32_e32 v1, s10 ; GCN-NEXT: s_sub_i32 s6, 64, s20
; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 ; GCN-NEXT: s_sub_i32 s4, s20, 64
; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6
; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 ; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20
; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 ; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4
; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7]
; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] ; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, s15
; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s6
; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: v_mov_b32_e32 v1, s14 ; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] ; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16
; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 ; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc ; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20
; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20 ; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: v_mov_b32_e32 v4, s3 ; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: v_mov_b32_e32 v4, s2 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
%shift = shl <2 x i128> %lhs, %rhs %shift = shl <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void ret void
@ -516,68 +516,68 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss: ; GCN-LABEL: s_lshr_v2i128_ss:
; GCN: ; %bb.0: ; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s6, 64, s16
; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 ; GCN-NEXT: s_sub_i32 s4, s16, 64
; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6
; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 ; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16
; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] ; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4
; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 ; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s8
; GCN-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NEXT: s_sub_i32 s6, 64, s20
; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 ; GCN-NEXT: s_sub_i32 s4, s20, 64
; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6
; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 ; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20
; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 ; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4
; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 ; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] ; GCN-NEXT: v_mov_b32_e32 v2, s5
; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] ; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v3, s12
; GCN-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3]
; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] ; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16
; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 ; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20
; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 ; GCN-NEXT: v_mov_b32_e32 v6, s3
; GCN-NEXT: v_mov_b32_e32 v6, s3 ; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v6, s2
; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] ; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
%shift = lshr <2 x i128> %lhs, %rhs %shift = lshr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void ret void
@ -586,72 +586,72 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss: ; GCN-LABEL: s_ashr_v2i128_ss:
; GCN: ; %bb.0: ; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8
; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s6, 64, s16 ; GCN-NEXT: s_sub_i32 s6, 64, s16
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0
; GCN-NEXT: s_sub_i32 s4, s16, 64 ; GCN-NEXT: s_sub_i32 s4, s16, 64
; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 ; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6
; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 ; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16
; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] ; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7]
; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] ; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1]
; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] ; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19]
; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 ; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4
; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 ; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NEXT: v_mov_b32_e32 v2, s8
; GCN-NEXT: s_sub_i32 s6, 64, s20 ; GCN-NEXT: s_sub_i32 s6, 64, s20
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0
; GCN-NEXT: s_sub_i32 s4, s20, 64 ; GCN-NEXT: s_sub_i32 s4, s20, 64
; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 ; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6
; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 ; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20
; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 ; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4
; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] ; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7]
; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] ; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23]
; GCN-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NEXT: v_mov_b32_e32 v2, s5
; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, s13 ; GCN-NEXT: v_mov_b32_e32 v3, s13
; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3]
; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NEXT: v_mov_b32_e32 v3, s12
; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] ; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3]
; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 ; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16
; GCN-NEXT: s_ashr_i32 s4, s11, 31 ; GCN-NEXT: s_ashr_i32 s4, s11, 31
; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_mov_b32_e32 v6, s2 ; GCN-NEXT: v_mov_b32_e32 v6, s2
; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 ; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20
; GCN-NEXT: s_ashr_i32 s4, s15, 31 ; GCN-NEXT: s_ashr_i32 s4, s15, 31
; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GCN-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NEXT: v_mov_b32_e32 v6, s4
; GCN-NEXT: v_mov_b32_e32 v7, s3 ; GCN-NEXT: v_mov_b32_e32 v7, s3
; GCN-NEXT: v_mov_b32_e32 v10, s2 ; GCN-NEXT: v_mov_b32_e32 v10, s2
; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1]
; GCN-NEXT: v_mov_b32_e32 v10, 16 ; GCN-NEXT: v_mov_b32_e32 v10, 16
; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0
; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-NEXT: s_endpgm ; GCN-NEXT: s_endpgm
%shift = ashr <2 x i128> %lhs, %rhs %shift = ashr <2 x i128> %lhs, %rhs
store <2 x i128> %shift, <2 x i128> addrspace(1)* null store <2 x i128> %shift, <2 x i128> addrspace(1)* null
ret void ret void

View File

@ -6,14 +6,14 @@
define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
; CIVI-LABEL: local_store_i56: ; CIVI-LABEL: local_store_i56:
; CIVI: ; %bb.0: ; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: s_mov_b32 m0, -1 ; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 ; CIVI-NEXT: ds_write_b32 v0, v1
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 ; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; CIVI-NEXT: ds_write_b32 v0, v1 ; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31] ; CIVI-NEXT: s_setpc_b64 s[30:31]
; ;
; GFX9-LABEL: local_store_i56: ; GFX9-LABEL: local_store_i56:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55: ; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0: ; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_or_b32 s0, s4, 14 ; HAWAII-NEXT: s_or_b32 s0, s4, 14
; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s5 ; HAWAII-NEXT: v_mov_b32_e32 v1, s5
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 ; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 ; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 ; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v1, s0 ; HAWAII-NEXT: v_mov_b32_e32 v1, s0
; HAWAII-NEXT: v_mov_b32_e32 v2, s1 ; HAWAII-NEXT: v_mov_b32_e32 v3, s1
; HAWAII-NEXT: v_mov_b32_e32 v3, s2 ; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: s_waitcnt vmcnt(0) ; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4
; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; HAWAII-NEXT: s_waitcnt vmcnt(0)
; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 ; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0
; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 ; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6
; HAWAII-NEXT: ds_write_b32 v1, v2 ; HAWAII-NEXT: ds_write_b32 v1, v3
; HAWAII-NEXT: s_endpgm ; HAWAII-NEXT: s_endpgm
; ;
; FIJI-LABEL: local_store_i55: ; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0: ; FIJI: ; %bb.0:
; FIJI-NEXT: s_or_b32 s0, s4, 14 ; FIJI-NEXT: s_or_b32 s0, s4, 14
; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s5 ; FIJI-NEXT: v_mov_b32_e32 v1, s5
; FIJI-NEXT: flat_load_ubyte v0, v[0:1] ; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 ; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc ; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v1, s0 ; FIJI-NEXT: v_mov_b32_e32 v1, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1 ; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff ; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: v_mov_b32_e32 v2, s2 ; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 ; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 ; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b32 v1, v3 ; FIJI-NEXT: ds_write_b32 v1, v3
; FIJI-NEXT: s_endpgm ; FIJI-NEXT: s_endpgm
; ;
; GFX9-LABEL: local_store_i55: ; GFX9-LABEL: local_store_i55:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 ; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 ; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc ; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_and_b32 s3, s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 ; GFX9-NEXT: v_or_b32_e32 v1, s3, v2
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 ; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6
; GFX9-NEXT: ds_write_b32 v0, v3 ; GFX9-NEXT: ds_write_b32 v0, v3
; GFX9-NEXT: s_endpgm ; GFX9-NEXT: s_endpgm
store i55 %arg, i55 addrspace(3)* %ptr, align 8 store i55 %arg, i55 addrspace(3)* %ptr, align 8
ret void ret void
} }
@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 {
; HAWAII-LABEL: local_store_i48: ; HAWAII-LABEL: local_store_i48:
; HAWAII: ; %bb.0: ; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 ; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0
; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 ; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 ; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3
; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v2, s1 ; HAWAII-NEXT: v_mov_b32_e32 v1, s2
; HAWAII-NEXT: v_mov_b32_e32 v1, s2 ; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: ds_write_b32 v0, v2 ; HAWAII-NEXT: ds_write_b32 v0, v1
; HAWAII-NEXT: s_endpgm ; HAWAII-NEXT: s_endpgm
; ;
; FIJI-LABEL: local_store_i48: ; FIJI-LABEL: local_store_i48:
; FIJI: ; %bb.0: ; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 ; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 ; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc ; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v2, s1 ; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: v_mov_b32_e32 v1, s2 ; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 ; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: ds_write_b32 v0, v2 ; FIJI-NEXT: ds_write_b32 v0, v1
; FIJI-NEXT: s_endpgm ; FIJI-NEXT: s_endpgm
; ;
; GFX9-LABEL: local_store_i48: ; GFX9-LABEL: local_store_i48:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 {
; HAWAII-LABEL: local_store_i65: ; HAWAII-LABEL: local_store_i65:
; HAWAII: ; %bb.0: ; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 ; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 ; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4
; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v2, s2 ; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: v_mov_b32_e32 v0, s0 ; HAWAII-NEXT: s_and_b32 s3, s3, 1
; HAWAII-NEXT: s_and_b32 s3, s3, 1 ; HAWAII-NEXT: v_mov_b32_e32 v0, s3
; HAWAII-NEXT: v_mov_b32_e32 v3, s3 ; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8
; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
; HAWAII-NEXT: s_endpgm ; HAWAII-NEXT: s_endpgm
; ;
; FIJI-LABEL: local_store_i65: ; FIJI-LABEL: local_store_i65:
; FIJI: ; %bb.0: ; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 ; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0
; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 ; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 ; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10
; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s2 ; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: v_mov_b32_e32 v0, s0 ; FIJI-NEXT: s_and_b32 s3, s3, 1
; FIJI-NEXT: s_and_b32 s3, s3, 1 ; FIJI-NEXT: v_mov_b32_e32 v0, s3
; FIJI-NEXT: v_mov_b32_e32 v3, s3 ; FIJI-NEXT: ds_write_b8 v2, v0 offset:8
; FIJI-NEXT: v_mov_b32_e32 v1, s1 ; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: ds_write_b8 v2, v3 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: ds_write_b64 v2, v[0:1]
; FIJI-NEXT: s_endpgm ; FIJI-NEXT: s_endpgm
; ;
; GFX9-LABEL: local_store_i65: ; GFX9-LABEL: local_store_i65:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 {
define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
; CIVI-LABEL: local_store_i17: ; CIVI-LABEL: local_store_i17:
; CIVI: ; %bb.0: ; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1 ; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 ; CIVI-NEXT: ds_write_b16 v0, v1
; CIVI-NEXT: ds_write_b16 v0, v1 ; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1
; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 ; CIVI-NEXT: ds_write_b8 v0, v1 offset:2
; CIVI-NEXT: s_waitcnt lgkmcnt(0) ; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31] ; CIVI-NEXT: s_setpc_b64 s[30:31]
; ;
; GFX9-LABEL: local_store_i17: ; GFX9-LABEL: local_store_i17:
; GFX9: ; %bb.0: ; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 ; GFX9-NEXT: ds_write_b16 v0, v1
; GFX9-NEXT: ds_write_b16 v0, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1
; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 ; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2
; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31] ; GFX9-NEXT: s_setpc_b64 s[30:31]
store i17 %arg, i17 addrspace(3)* %ptr, align 8 store i17 %arg, i17 addrspace(3)* %ptr, align 8
ret void ret void
} }

View File

@@ -38,10 +38,10 @@ entry:
 }
 ; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32:
-; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:48
-; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:32
-; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16
-; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16
+; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off
 define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) {
 entry:
 %trunc = trunc <16 x i64> %in to <16 x i32>

View File

@ -36,40 +36,41 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32
; ;
; GFX6-LABEL: test_udivrem: ; GFX6-LABEL: test_udivrem:
; GFX6: ; %bb.0: ; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 ; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d ; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s10, s6
; GFX6-NEXT: s_mov_b32 s10, s6 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX6-NEXT: s_sub_i32 s2, 0, s3
; GFX6-NEXT: s_sub_i32 s2, 0, s3 ; GFX6-NEXT: s_mov_b32 s11, s7
; GFX6-NEXT: s_mov_b32 s11, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x1d
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: s_waitcnt expcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] ; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1]
; GFX6-NEXT: s_endpgm ; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_endpgm
; ;
; GFX8-LABEL: test_udivrem: ; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0: ; GFX8: ; %bb.0: