diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 84e091caed10..5d7cd5ffc4ce 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -474,27 +474,65 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1, ArrayRef BaseOps2, unsigned NumLoads, unsigned NumBytes) const { - // If current mem ops pair do not have same base pointer, then they cannot be - // clustered. assert(!BaseOps1.empty() && !BaseOps2.empty()); const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - // Compute max cluster size based on average number bytes clustered till now, - // and decide based on it, if current mem ops pair can be clustered or not. - assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && - "Invalid NumLoads/NumBytes values"); - unsigned MaxNumLoads; - if (NumBytes <= 4 * NumLoads) { - // Loads are dword or smaller (on average). - MaxNumLoads = 5; - } else { - // Loads are bigger than a dword (on average). 
- MaxNumLoads = 4; + const MachineOperand *FirstDst = nullptr; + const MachineOperand *SecondDst = nullptr; + + if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || + (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || + (isMIMG(FirstLdSt) && isMIMG(SecondLdSt)) || + (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { + const unsigned MaxGlobalLoadCluster = 7; + if (NumLoads > MaxGlobalLoadCluster) + return false; + + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); + if (!FirstDst) + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); + if (!SecondDst) + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); + } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); + } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { + FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); + SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); } - return NumLoads <= MaxNumLoads; + + if (!FirstDst || !SecondDst) + return false; + + // Try to limit clustering based on the total number of bytes loaded + // rather than the number of instructions. This is done to help reduce + // register pressure. The method used is somewhat inexact, though, + // because it assumes that all loads in the cluster will load the + // same number of bytes as FirstLdSt. + + // The unit of this value is bytes. + // FIXME: This needs finer tuning. + unsigned LoadClusterThreshold = 16; + + const MachineRegisterInfo &MRI = + FirstLdSt.getParent()->getParent()->getRegInfo(); + + const Register Reg = FirstDst->getReg(); + + const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) + ? MRI.getRegClass(Reg) + : RI.getPhysRegClass(Reg); + + // FIXME: NumLoads should not be subtracted 1. 
This is to match behavior + // of clusterNeighboringMemOps which was previously passing cluster length + // less 1. LoadClusterThreshold should be tuned instead. + return ((NumLoads - 1) * (RI.getRegSizeInBits(*DstRC) / 8)) <= + LoadClusterThreshold; } // FIXME: This behaves strangely. If, for example, you have 32 load + stores, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll index 7b375641f729..524482df5356 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -235,17 +235,17 @@ define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32] ; ; GFX8-LABEL: test_div_fmas_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dword s2, s[0:1], 0xb8 -; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c -; GFX8-NEXT: s_load_dword s4, s[0:1], 0x70 -; GFX8-NEXT: s_load_dword s5, s[0:1], 0x94 -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s2, 1, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x70 +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x94 +; GFX8-NEXT: s_load_dword s5, s[0:1], 0xb8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_and_b32 s2, 1, s5 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: v_div_fmas_f32 v2, v0, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -527,43 +527,43 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %o define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) { ;
GFX7-LABEL: test_div_fmas_f64: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dword s0, s[0:1], 0x11 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v2, s8 -; GFX7-NEXT: v_mov_b32_e32 v4, s10 -; GFX7-NEXT: s_and_b32 s0, 1, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s9 -; GFX7-NEXT: v_mov_b32_e32 v5, s11 -; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX7-NEXT: s_nop 3 -; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; GFX7-NEXT: s_endpgm +; GFX7-NEXT: s_load_dword s8, s[0:1], 0x11 +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 +; GFX7-NEXT: s_and_b32 s2, 1, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s7 +; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX7-NEXT: s_nop 3 +; GFX7-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: test_div_fmas_f64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 -; GFX8-NEXT: s_load_dword s0, s[0:1], 0x44 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: s_and_b32 s0, 1, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX8-NEXT: s_nop 3 -; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] -; 
GFX8-NEXT: s_endpgm +; GFX8-NEXT: s_load_dword s8, s[0:1], 0x44 +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: s_and_b32 s2, 1, s8 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 +; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: s_nop 3 +; GFX8-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm ; ; GFX10_W32-LABEL: test_div_fmas_f64: ; GFX10_W32: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 8f4f1c391535..3d75eca93cb4 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs ; TRAP-HANDLER-ENABLE: NumSgprs: 61 -; TRAP-HANDLER-DISABLE: NumSgprs: 77 +; TRAP-HANDLER-DISABLE: NumSgprs: 79 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr.ll b/llvm/test/CodeGen/AMDGPU/global-saddr.ll index 8a3d1d3053f9..136cfd63686c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr.ll @@ -46,8 +46,8 @@ entry: ; Test various offset boundaries. 
; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}} +; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}} ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}} -; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}} %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511 %load11 = load i64, i64 addrspace(1)* %gep11 %gep12 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 1023 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 15643d4b67f7..817e3e5ca28c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -681,27 +681,27 @@ define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* % ; ; VI-LABEL: dynamic_insertelement_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 -; VI-NEXT: s_load_dword s6, s[4:5], 0x20 -; VI-NEXT: s_load_dword s4, s[4:5], 0x44 -; VI-NEXT: s_mov_b32 s3, 0x1100f000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s6, 3 -; VI-NEXT: s_cselect_b32 s5, s4, s11 -; VI-NEXT: s_cmp_eq_u32 s6, 2 -; VI-NEXT: s_cselect_b32 s7, s4, s10 -; VI-NEXT: s_cmp_eq_u32 s6, 1 -; VI-NEXT: s_cselect_b32 s9, s4, s9 -; VI-NEXT: s_cmp_eq_u32 s6, 0 -; VI-NEXT: s_cselect_b32 s4, s4, s8 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s9 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: s_endpgm +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 +; VI-NEXT: s_load_dword s6, s[4:5], 0x20 +; VI-NEXT: s_load_dword s4, s[4:5], 0x44 +; 
VI-NEXT: s_mov_b32 s3, 0x1100f000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_cmp_eq_u32 s6, 3 +; VI-NEXT: s_cselect_b32 s5, s4, s11 +; VI-NEXT: s_cmp_eq_u32 s6, 2 +; VI-NEXT: s_cselect_b32 s7, s4, s10 +; VI-NEXT: s_cmp_eq_u32 s6, 1 +; VI-NEXT: s_cselect_b32 s9, s4, s9 +; VI-NEXT: s_cmp_eq_u32 s6, 0 +; VI-NEXT: s_cselect_b32 s4, s4, s8 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: s_endpgm %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 38fddcafaa29..49c2bf08ba3e 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -855,10 +855,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, ; multiple. 
; FUNC-LABEL: {{^}}packed_struct_argument_alignment: ; HSA-GFX9: kernarg_segment_byte_size = 28 -; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 ; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17 ; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13 +; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 +; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) { %val0 = extractvalue <{i32, i64}> %arg0, 0 %val1 = extractvalue <{i32, i64}> %arg0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index bc4c0d03db93..5a435f01925c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -51,38 +51,38 @@ bb: define amdgpu_kernel void @scalar_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { ; GCN-LABEL: scalar_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 -; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 -; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 -; GCN-NEXT: v_mov_b32_e32 v16, s18 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v8, s8 -; GCN-NEXT: v_mov_b32_e32 v12, s12 -; GCN-NEXT: v_mov_b32_e32 v17, s19 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v5, s5 -; GCN-NEXT: v_mov_b32_e32 v6, s6 -; GCN-NEXT: v_mov_b32_e32 v7, s7 -; GCN-NEXT: v_mov_b32_e32 v9, s9 -; 
GCN-NEXT: v_mov_b32_e32 v10, s10 -; GCN-NEXT: v_mov_b32_e32 v11, s11 -; GCN-NEXT: v_mov_b32_e32 v13, s13 -; GCN-NEXT: v_mov_b32_e32 v14, s14 -; GCN-NEXT: v_mov_b32_e32 v15, s15 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off -; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 -; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[16:17], 0x20 +; GCN-NEXT: s_load_dwordx4 s[12:15], s[16:17], 0x30 +; GCN-NEXT: v_mov_b32_e32 v12, s18 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: v_mov_b32_e32 v8, s8 +; GCN-NEXT: v_mov_b32_e32 v13, s19 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v5, s5 +; GCN-NEXT: v_mov_b32_e32 v6, s6 +; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16 +; GCN-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NEXT: v_mov_b32_e32 v9, s9 +; GCN-NEXT: v_mov_b32_e32 v10, s10 +; GCN-NEXT: v_mov_b32_e32 v11, s11 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[8:11], off offset:32 +; GCN-NEXT: global_store_dwordx4 v[12:13], v[0:3], off offset:48 +; GCN-NEXT: s_endpgm bb: %tmp = load <4 x i32>, <4 x i32> addrspace(1)* %arg, align 16 %tmp2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll 
b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 4c6fa2f0f4c8..c7ae08c839ee 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -17,8 +17,8 @@ define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) { ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 @@ -86,7 +86,6 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -96,8 +95,10 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) { ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], 
v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off ; +; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -299,9 +300,9 @@ define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) { ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048 @@ -455,11 +456,11 @@ define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) { ; ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 -; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} +; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} ; GFX9: 
global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048 ; ; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index 091ed34e1112..14635ab7e708 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -203,10 +203,10 @@ entry: ; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16: ; SI: s_mov_b32 {{s[0-9]+}}, 0x13480 -; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 +; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64 ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}} ; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}} ; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index d040a04877e6..f423672b8da5 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -11,27 +11,28 @@ define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; SI-NEXT: s_load_dword s2, s[0:1], 0xf -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s4, 0 -; SI-NEXT: s_cbranch_scc0 
BB0_2 -; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_add_i32 s2, s7, s2 -; SI-NEXT: s_cbranch_execz BB0_3 -; SI-NEXT: s_branch BB0_4 -; SI-NEXT: BB0_2: -; SI-NEXT: ; implicit-def: $sgpr2 -; SI-NEXT: BB0_3: ; %if -; SI-NEXT: s_sub_i32 s2, s5, s6 -; SI-NEXT: BB0_4: ; %endif -; SI-NEXT: s_add_i32 s4, s2, s4 -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb +; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s8, 0 +; SI-NEXT: s_cbranch_scc0 BB0_2 +; SI-NEXT:; %bb.1: ; %else +; SI-NEXT: s_add_i32 s0, s11, s0 +; SI-NEXT: s_cbranch_execz BB0_3 +; SI-NEXT: s_branch BB0_4 +; SI-NEXT:BB0_2: +; SI-NEXT: ; implicit-def: $sgpr0 +; SI-NEXT:BB0_3: ; %if +; SI-NEXT: s_sub_i32 s0, s9, s10 +; SI-NEXT:BB0_4: ; %endif +; SI-NEXT: s_add_i32 s0, s0, s8 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm + entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -54,32 +55,33 @@ endif: define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b, [8 x i32], i32 %c, [8 x i32], i32 %d, [8 x i32], i32 %e) { ; SI-LABEL: sgpr_if_else_salu_br_opt: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_load_dword s2, s[0:1], 0x13 -; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_lg_u32 s2, 0 -; SI-NEXT: s_cbranch_scc0 BB1_2 -; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s3, s[0:1], 0x2e -; SI-NEXT: s_load_dword s6, s[0:1], 0x37 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s3, s3, s6 -; SI-NEXT: s_cbranch_execz BB1_3 -; SI-NEXT: s_branch BB1_4 -; SI-NEXT: BB1_2: -; SI-NEXT: ; implicit-def: $sgpr3 -; SI-NEXT: BB1_3: ; %if -; 
SI-NEXT: s_load_dword s3, s[0:1], 0x1c -; SI-NEXT: s_load_dword s0, s[0:1], 0x25 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_i32 s3, s3, s0 -; SI-NEXT: BB1_4: ; %endif -; SI-NEXT: s_add_i32 s0, s3, s2 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_endpgm +; SI-NEXT: s_load_dword s2, s[0:1], 0x13 +; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_lg_u32 s2, 0 +; SI-NEXT: s_cbranch_scc0 BB1_2 +; SI-NEXT:; %bb.1: ; %else +; SI-NEXT: s_load_dword s3, s[0:1], 0x2e +; SI-NEXT: s_load_dword s6, s[0:1], 0x37 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_i32 s3, s3, s6 +; SI-NEXT: s_cbranch_execz BB1_3 +; SI-NEXT: s_branch BB1_4 +; SI-NEXT:BB1_2: +; SI-NEXT: ; implicit-def: $sgpr3 +; SI-NEXT:BB1_3: ; %if +; SI-NEXT: s_load_dword s3, s[0:1], 0x1c +; SI-NEXT: s_load_dword s0, s[0:1], 0x25 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_i32 s3, s3, s0 +; SI-NEXT:BB1_4: ; %endif +; SI-NEXT: s_add_i32 s0, s3, s2 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm + entry: %cmp0 = icmp eq i32 %a, 0 br i1 %cmp0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 59aebaeed56e..f2077aa2a1ad 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -446,68 +446,68 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_shl_v2i128ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; 
GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 -; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s11 -; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 -; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 -; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s15 -; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] -; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: 
s_lshl_b64 s[2:3], s[12:13], s20 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s2 -; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[8:9], s6 +; GCN-NEXT: s_lshl_b64 s[24:25], s[10:11], s16 +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_lshl_b64 s[4:5], s[8:9], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshr_b64 s[6:7], s[12:13], s6 +; GCN-NEXT: s_lshl_b64 s[10:11], s[14:15], s20 +; GCN-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: 
v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s15 +; GCN-NEXT: v_cndmask_b32_e64 v7, v0, v1, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NEXT: v_cndmask_b32_e64 v6, v0, v1, s[2:3] +; GCN-NEXT: s_lshl_b64 s[2:3], s[8:9], s16 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s20 +; GCN-NEXT: v_mov_b32_e32 v4, s3 +; GCN-NEXT: v_cndmask_b32_e64 v5, 0, v4, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = shl <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -516,68 +516,68 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_lshr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_lshr_b64 
s[4:5], s[10:11], s4 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 -; GCN-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 -; GCN-NEXT: v_mov_b32_e32 v6, s3 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 
0x0 +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 +; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 +; GCN-NEXT: s_lshr_b64 s[4:5], s[14:15], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] +; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s16 +; GCN-NEXT: 
v_mov_b32_e32 v2, s3 +; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshr_b64 s[2:3], s[14:15], s20 +; GCN-NEXT: v_mov_b32_e32 v6, s3 +; GCN-NEXT: v_cndmask_b32_e64 v7, 0, v6, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = lshr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void @@ -586,72 +586,72 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-LABEL: s_ashr_v2i128_ss: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 -; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mov_b32_e32 v9, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_sub_i32 s6, 64, s16 -; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 -; GCN-NEXT: s_sub_i32 s4, s16, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], s6 -; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 -; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] -; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] -; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NEXT: s_sub_i32 s6, 64, s20 -; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GCN-NEXT: v_cmp_lt_u64_e64 
s[0:1], s[20:21], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 -; GCN-NEXT: s_sub_i32 s4, s20, 64 -; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 -; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 -; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 -; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] -; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s7 -; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v3, s12 -; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 -; GCN-NEXT: s_ashr_i32 s4, s11, 31 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_mov_b32_e32 v6, s2 -; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 -; GCN-NEXT: s_ashr_i32 s4, s15, 31 -; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GCN-NEXT: v_mov_b32_e32 v6, s4 -; GCN-NEXT: v_mov_b32_e32 v7, s3 -; GCN-NEXT: v_mov_b32_e32 v10, s2 -; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-NEXT: s_endpgm +; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: v_mov_b32_e32 v9, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s6, 64, s16 +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 +; GCN-NEXT: s_sub_i32 s4, s16, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[10:11], 
s6 +; GCN-NEXT: s_lshr_b64 s[24:25], s[8:9], s16 +; GCN-NEXT: s_or_b64 s[6:7], s[24:25], s[6:7] +; GCN-NEXT: s_and_b64 vcc, s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[16:17], s[18:19] +; GCN-NEXT: s_ashr_i64 s[4:5], s[10:11], s4 +; GCN-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s9 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NEXT: s_sub_i32 s6, 64, s20 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[20:21], 64 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[22:23], 0 +; GCN-NEXT: s_sub_i32 s4, s20, 64 +; GCN-NEXT: s_lshl_b64 s[6:7], s[14:15], s6 +; GCN-NEXT: s_lshr_b64 s[8:9], s[12:13], s20 +; GCN-NEXT: s_ashr_i64 s[4:5], s[14:15], s4 +; GCN-NEXT: s_or_b64 s[6:7], s[8:9], s[6:7] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[2:3], s[20:21], s[22:23] +; GCN-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NEXT: v_mov_b32_e32 v3, s7 +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[2:3] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s12 +; GCN-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[2:3] +; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], s16 +; GCN-NEXT: s_ashr_i32 s4, s11, 31 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_mov_b32_e32 v6, s2 +; GCN-NEXT: s_ashr_i64 s[2:3], s[14:15], s20 +; GCN-NEXT: s_ashr_i32 s4, s15, 31 +; GCN-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GCN-NEXT: v_mov_b32_e32 v6, s4 +; GCN-NEXT: 
v_mov_b32_e32 v7, s3 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_cndmask_b32_e64 v7, v6, v7, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v10, 16 +; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-NEXT: s_endpgm %shift = ashr <2 x i128> %lhs, %rhs store <2 x i128> %shift, <2 x i128> addrspace(1)* null ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 9c673c74248b..2b8eba5f9014 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -6,14 +6,14 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { ; CIVI-LABEL: local_store_i56: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: ds_write_b8 v0, v3 offset:6 -; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 -; CIVI-NEXT: ds_write_b32 v0, v1 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v2 offset:4 +; CIVI-NEXT: ds_write_b32 v0, v1 +; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:6 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i56: ; GFX9: ; %bb.0: @@ -30,70 +30,70 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 { define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_or_b32 s0, s4, 14 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s5 -; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: 
s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v1, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s2 -; HAWAII-NEXT: s_waitcnt vmcnt(0) -; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 -; HAWAII-NEXT: ds_write_b16 v1, v3 offset:4 -; HAWAII-NEXT: ds_write_b32 v1, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_or_b32 s0, s4, 14 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s5 +; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v1, s0 +; HAWAII-NEXT: v_mov_b32_e32 v3, s1 +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: ds_write_b16 v1, v2 offset:4 +; HAWAII-NEXT: s_waitcnt vmcnt(0) +; HAWAII-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; HAWAII-NEXT: ds_write_b8 v1, v0 offset:6 +; HAWAII-NEXT: ds_write_b32 v1, v3 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_or_b32 s0, s4, 14 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s5 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v1, s0 -; FIJI-NEXT: v_mov_b32_e32 v3, s1 -; FIJI-NEXT: s_and_b32 s3, s2, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: s_waitcnt vmcnt(0) -; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 -; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 -; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 -; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 -; FIJI-NEXT: ds_write_b32 v1, v3 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_or_b32 s0, s4, 14 +; 
FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s5 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s0 +; FIJI-NEXT: v_mov_b32_e32 v3, s1 +; FIJI-NEXT: s_and_b32 s3, s2, 0xffff +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 +; FIJI-NEXT: s_waitcnt vmcnt(0) +; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 +; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 +; FIJI-NEXT: ds_write_b32 v1, v3 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i55: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 -; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_and_b32 s3, s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v2, s3, v2 -; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffff, v2 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:6 -; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 -; GFX9-NEXT: ds_write_b32 v0, v3 -; GFX9-NEXT: s_endpgm +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:14 +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x8 +; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_and_b32 s3, 
s2, 0xffff +; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s3, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 +; GFX9-NEXT: ds_write_b32 v0, v3 +; GFX9-NEXT: s_endpgm store i55 %arg, i55 addrspace(3)* %ptr, align 8 ret void } @@ -101,31 +101,31 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0 define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 { ; HAWAII-LABEL: local_store_i48: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v2, s1 -; HAWAII-NEXT: v_mov_b32_e32 v1, s2 -; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 -; HAWAII-NEXT: ds_write_b32 v0, v2 -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s0, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dword s1, s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x3 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s2 +; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b32 v0, v1 +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i48: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 -; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v2, s1 -; FIJI-NEXT: v_mov_b32_e32 v1, s2 -; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 -; FIJI-NEXT: ds_write_b32 v0, v2 -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0 +; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc +; 
FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: ds_write_b16 v0, v1 offset:4 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b32 v0, v1 +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i48: ; GFX9: ; %bb.0: @@ -146,35 +146,35 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0 define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0 { ; HAWAII-LABEL: local_store_i65: ; HAWAII: ; %bb.0: -; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 -; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 -; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 -; HAWAII-NEXT: s_mov_b32 m0, -1 -; HAWAII-NEXT: s_waitcnt lgkmcnt(0) -; HAWAII-NEXT: v_mov_b32_e32 v2, s2 -; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: s_and_b32 s3, s3, 1 -; HAWAII-NEXT: v_mov_b32_e32 v3, s3 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 -; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8 -; HAWAII-NEXT: ds_write_b64 v2, v[0:1] -; HAWAII-NEXT: s_endpgm +; HAWAII-NEXT: s_load_dword s2, s[4:5], 0x0 +; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; HAWAII-NEXT: s_load_dword s3, s[4:5], 0x4 +; HAWAII-NEXT: s_mov_b32 m0, -1 +; HAWAII-NEXT: s_waitcnt lgkmcnt(0) +; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: s_and_b32 s3, s3, 1 +; HAWAII-NEXT: v_mov_b32_e32 v0, s3 +; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 +; HAWAII-NEXT: v_mov_b32_e32 v0, s0 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 +; HAWAII-NEXT: ds_write_b64 v2, v[0:1] +; HAWAII-NEXT: s_endpgm ; ; FIJI-LABEL: local_store_i65: ; FIJI: ; %bb.0: -; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 -; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 -; FIJI-NEXT: s_mov_b32 m0, -1 -; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: v_mov_b32_e32 v2, s2 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: s_and_b32 s3, s3, 1 -; FIJI-NEXT: v_mov_b32_e32 v3, s3 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 -; 
FIJI-NEXT: ds_write_b8 v2, v3 offset:8 -; FIJI-NEXT: ds_write_b64 v2, v[0:1] -; FIJI-NEXT: s_endpgm +; FIJI-NEXT: s_load_dword s2, s[4:5], 0x0 +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; FIJI-NEXT: s_load_dword s3, s[4:5], 0x10 +; FIJI-NEXT: s_mov_b32 m0, -1 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: s_and_b32 s3, s3, 1 +; FIJI-NEXT: v_mov_b32_e32 v0, s3 +; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 +; FIJI-NEXT: v_mov_b32_e32 v0, s0 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 +; FIJI-NEXT: ds_write_b64 v2, v[0:1] +; FIJI-NEXT: s_endpgm ; ; GFX9-LABEL: local_store_i65: ; GFX9: ; %bb.0: @@ -218,22 +218,22 @@ define void @local_store_i13(i13 addrspace(3)* %ptr, i13 %arg) #0 { define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 { ; CIVI-LABEL: local_store_i17: ; CIVI: ; %bb.0: -; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CIVI-NEXT: s_mov_b32 m0, -1 -; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1 -; CIVI-NEXT: ds_write_b16 v0, v1 -; CIVI-NEXT: ds_write_b8 v0, v2 offset:2 -; CIVI-NEXT: s_waitcnt lgkmcnt(0) -; CIVI-NEXT: s_setpc_b64 s[30:31] +; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIVI-NEXT: s_mov_b32 m0, -1 +; CIVI-NEXT: ds_write_b16 v0, v1 +; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1 +; CIVI-NEXT: ds_write_b8 v0, v1 offset:2 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: local_store_i17: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v2, 0x1ffff, v1 -; GFX9-NEXT: ds_write_b16 v0, v1 -; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_write_b16 v0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x1ffff, v1 +; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] store i17 %arg, i17 addrspace(3)* %ptr, align 8 ret void } diff --git 
a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll index 1c52aac1068f..627ba9e0f717 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i64.ll @@ -38,10 +38,10 @@ entry: } ; GCN-LABEL: {{^}}trunc_store_v16i64_v16i32: -; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:48 -; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:32 -; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:16 -; GCN: global_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off +; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:48 +; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:32 +; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off offset:16 +; GCN: global_store_dwordx4 v{{\[[0-9]:[0-9]+\]}}, v{{\[[0-9]:[0-9]+\]}}, off define amdgpu_kernel void @trunc_store_v16i64_v16i32(< 16 x i32> addrspace(1)* %out, <16 x i64> %in) { entry: %trunc = trunc <16 x i64> %in to <16 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 10299b314e83..559f1092e6e4 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -36,40 +36,41 @@ define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, [8 x i32], i32 ; ; GFX6-LABEL: test_udivrem: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x1d -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s10, s6 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_sub_i32 s2, 0, s3 -; GFX6-NEXT: s_mov_b32 s11, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 -; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 -; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GFX6-NEXT: s_waitcnt expcnt(0) -; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dword s3, s[0:1], 0x26 +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x13 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s10, s6 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX6-NEXT: s_sub_i32 s2, 0, s3 +; GFX6-NEXT: s_mov_b32 s11, s7 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x1d +; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 
v1, v2, s[0:1] +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt expcnt(0) +; GFX6-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[0:1] +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: test_udivrem: ; GFX8: ; %bb.0: