Revert "AMDGPU/GlobalISel: Combine zext(trunc x) to x after RegBankSelect"

This reverts commit bf5a582650.
The reverted commit also depends on the now-reverted commit 4c8fb7ddd6.
This commit is contained in:
Nico Weber 2021-03-04 10:16:11 -05:00
parent 59beb1ef6d
commit e68de60bc4
11 changed files with 337 additions and 262 deletions

View File

@ -64,6 +64,6 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
}
def AMDGPURegBankCombinerHelper : GICombinerHelper<
"AMDGPUGenRegBankCombinerHelper", [zext_trunc_fold]> {
"AMDGPUGenRegBankCombinerHelper", []> {
let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
}

View File

@ -228,7 +228,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPURegBankCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
@ -808,7 +807,6 @@ public:
bool addLegalizeMachineIR() override;
void addPreRegBankSelect() override;
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
@ -1115,11 +1113,6 @@ bool GCNPassConfig::addRegBankSelect() {
return false;
}
void GCNPassConfig::addPreGlobalInstructionSelect() {
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
addPass(createAMDGPURegBankCombiner(IsOptNone));
}
bool GCNPassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect(getOptLevel()));
// TODO: Fix instruction selection to do the right thing for image

View File

@ -139,6 +139,9 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cselect_b32 s4, 1, 0
; CHECK-NEXT: s_and_b32 s4, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc1 BB4_4
; CHECK-NEXT: ; %bb.1: ; %bb2
; CHECK-NEXT: s_getpc_b64 s[6:7]
@ -158,6 +161,9 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_mov_b32 s4, 0
; CHECK-NEXT: BB4_3: ; %bb8
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cselect_b32 s4, 1, 0
; CHECK-NEXT: s_and_b32 s4, s4, 1
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc0 BB4_5
; CHECK-NEXT: BB4_4: ; %bb12
; CHECK-NEXT: s_setpc_b64 s[30:31]

View File

@ -9,6 +9,9 @@ define amdgpu_kernel void @test_wave32(i32 %arg0, [8 x i32], i32 %saved) {
; GCN-NEXT: s_load_dword s0, s[4:5], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cselect_b32 s1, 1, 0
; GCN-NEXT: s_and_b32 s1, s1, 1
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cbranch_scc1 BB0_2
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0

View File

@ -8,6 +8,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, i64 %saved) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 1, 0
; GCN-NEXT: s_and_b32 s2, s2, 1
; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 BB0_2
; GCN-NEXT: ; %bb.1: ; %mid
; GCN-NEXT: v_mov_b32_e32 v0, 0

View File

@ -53,6 +53,9 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; CI-NEXT: s_load_dword s0, s[4:5], 0x11
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_lg_u32 s1, s0
; CI-NEXT: s_cselect_b32 s0, 1, 0
; CI-NEXT: s_and_b32 s0, s0, 1
; CI-NEXT: s_cmp_lg_u32 s0, 0
; CI-NEXT: s_cbranch_scc1 BB1_2
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
@ -68,6 +71,9 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_cmp_lg_u32 s1, s0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 BB1_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0

View File

@ -53,6 +53,9 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; CI-NEXT: s_load_dword s0, s[4:5], 0x10
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_lg_u32 s1, s0
; CI-NEXT: s_cselect_b32 s0, 1, 0
; CI-NEXT: s_and_b32 s0, s0, 1
; CI-NEXT: s_cmp_lg_u32 s0, 0
; CI-NEXT: s_cbranch_scc1 BB1_2
; CI-NEXT: ; %bb.1: ; %bb0
; CI-NEXT: v_mov_b32_e32 v0, 0
@ -68,6 +71,9 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-NEXT: s_lshl_b32 s0, s0, 16
; GFX9-NEXT: s_cmp_lg_u32 s1, s0
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
; GFX9-NEXT: s_and_b32 s0, s0, 1
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cbranch_scc1 BB1_2
; GFX9-NEXT: ; %bb.1: ; %bb0
; GFX9-NEXT: v_mov_b32_e32 v0, 0

View File

@ -43,18 +43,19 @@ define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
define amdgpu_kernel void @set_inactive_scc(i32 addrspace(1)* %out, i32 %in, <4 x i32> inreg %desc) {
; GCN-LABEL: set_inactive_scc:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_load_dword s0, s[0:1], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s1, s[8:11], 0x0
; GCN-NEXT: s_cmp_lg_u32 s2, 56
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s1, 56
; GCN-NEXT: s_cselect_b32 s0, 1, 0
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_and_b32 s0, s0, 1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc0 BB2_2
; GCN-NEXT: ; %bb.1: ; %.one

View File

@ -21,12 +21,18 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_movk_i32 s32, 0x400
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s6, 1, 0
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.1: ; %bb.0
; GCN-NEXT: s_load_dword s6, s[4:5], 0xc
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s6, 1, 0
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cbranch_scc1 BB0_3
; GCN-NEXT: ; %bb.2: ; %bb.1
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
@ -96,6 +102,9 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
; GCN-NEXT: s_movk_i32 s32, 0x1000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_cselect_b32 s6, 1, 0
; GCN-NEXT: s_and_b32 s6, s6, 1
; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_mov_b32 s33, 0
; GCN-NEXT: s_cbranch_scc1 BB1_2
; GCN-NEXT: ; %bb.1: ; %bb.0

View File

@ -4824,9 +4824,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX6-NEXT: s_and_b32 s12, s12, 1
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: s_and_b32 s6, s13, 1
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
@ -4915,9 +4917,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_ashr_i32 s4, s11, 31
; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX8-NEXT: s_and_b32 s12, s12, 1
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: s_and_b32 s6, s13, 1
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX8-NEXT: s_mov_b32 s5, s4
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
@ -5006,9 +5010,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: s_and_b32 s6, s13, 1
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
@ -5079,53 +5085,55 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_sub_i32 s13, s12, 64
; GFX10-NEXT: s_and_b32 s14, 1, s1
; GFX10-NEXT: s_sub_i32 s15, 64, s12
; GFX10-NEXT: s_sub_i32 s2, 64, s12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cmp_lt_u32 s12, 64
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[6:7], 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s12, 0
; GFX10-NEXT: s_cmp_lt_u32 s12, 64
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s12
; GFX10-NEXT: s_lshl_b64 s[4:5], s[10:11], s15
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: s_ashr_i32 s6, s11, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[10:11], s12
; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s13
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_mov_b32 s7, s6
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12
; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2
; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12
; GFX10-NEXT: s_and_b32 s12, s15, 1
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s6, s16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s1, vcc_lo
; GFX10-NEXT: s_addc_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
@ -6115,9 +6123,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_ashr_i32 s8, s19, 31
; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX6-NEXT: s_and_b32 s23, s23, 1
; GFX6-NEXT: s_cmp_lg_u32 s23, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_and_b32 s10, s24, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s23, 0
; GFX6-NEXT: s_mov_b32 s9, s8
@ -6189,9 +6199,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_ashr_i32 s8, s3, 31
; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX6-NEXT: s_and_b32 s12, s12, 1
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: s_and_b32 s10, s13, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX6-NEXT: s_mov_b32 s9, s8
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
@ -6285,9 +6297,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_ashr_i32 s8, s19, 31
; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX8-NEXT: s_and_b32 s23, s23, 1
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_and_b32 s10, s24, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_mov_b32 s9, s8
@ -6366,9 +6380,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_ashr_i32 s8, s3, 31
; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX8-NEXT: s_and_b32 s12, s12, 1
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: s_and_b32 s10, s13, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b32 s9, s8
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
@ -6461,9 +6477,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_ashr_i32 s8, s19, 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_and_b32 s10, s24, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
; GFX9-NEXT: s_mov_b32 s9, s8
@ -6542,9 +6560,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_ashr_i32 s8, s3, 31
; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: s_and_b32 s10, s13, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX9-NEXT: s_mov_b32 s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
@ -6590,15 +6610,15 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
;
; GFX10-LABEL: s_saddsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s16, s0, s8
; GFX10-NEXT: s_add_u32 s28, s0, s8
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mov_b32 s46, s0
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_mov_b32 s47, s1
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_addc_u32 s17, s1, s9
; GFX10-NEXT: s_addc_u32 s29, s1, s9
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47]
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_addc_u32 s30, s2, s10
@ -6618,132 +6638,136 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_sub_i32 s21, s20, 64
; GFX10-NEXT: s_sub_i32 s22, 64, s20
; GFX10-NEXT: s_cmp_lt_u32 s20, 64
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s20, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20
; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_ashr_i32 s10, s31, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20
; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22
; GFX10-NEXT: s_and_b32 s24, s10, 1
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_ashr_i32 s2, s31, 31
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21
; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s17
; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_and_b32 s10, s23, 1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s29
; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s28
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s16
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_brev_b32 s23, 1
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s23
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: s_add_u32 s2, s4, s12
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
; GFX10-NEXT: s_and_b32 s3, s3, 1
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: s_addc_u32 s3, s5, s13
; GFX10-NEXT: s_addc_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[2:3], s[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: s_addc_u32 s3, s3, s23
; GFX10-NEXT: s_add_u32 s0, s4, s12
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
; GFX10-NEXT: s_and_b32 s1, s1, 1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_addc_u32 s1, s5, s13
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s8, s6, s14
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: v_mov_b32_e32 v7, s8
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_addc_u32 s9, s7, s15
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[6:7]
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v8, s9
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: s_and_b32 s2, 1, s2
; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[12:13], 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_and_b32 s16, 1, s1
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[12:13], 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: s_and_b32 s16, 1, s3
; GFX10-NEXT: s_cmp_lt_u32 s20, 64
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[14:15], 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s20, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s20
; GFX10-NEXT: s_lshl_b64 s[6:7], s[8:9], s22
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: s_ashr_i32 s10, s9, 31
; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[8:9], s20
; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s21
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20
; GFX10-NEXT: s_and_b32 s13, s10, 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20
; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX10-NEXT: s_and_b32 s10, s12, 1
; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[4:5], s[2:3], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_add_u32 s4, s4, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_mov_b32_e32 v6, s1
; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_mov_b32_e32 v5, s2
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s23
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s0, vcc_lo
; GFX10-NEXT: s_addc_u32 s4, s4, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
; GFX10-NEXT: s_addc_u32 s1, s5, s23
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
; GFX10-NEXT: v_readfirstlane_b32 s6, v6

View File

@ -4810,9 +4810,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX6-NEXT: s_and_b32 s12, s12, 1
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: s_and_b32 s6, s13, 1
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
@ -4901,9 +4903,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_ashr_i32 s4, s11, 31
; GFX8-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX8-NEXT: s_and_b32 s12, s12, 1
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: s_and_b32 s6, s13, 1
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX8-NEXT: s_mov_b32 s5, s4
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
@ -4992,9 +4996,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_ashr_i32 s4, s11, 31
; GFX9-NEXT: s_ashr_i64 s[6:7], s[10:11], s6
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[6:7]
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: s_and_b32 s6, s13, 1
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX9-NEXT: s_mov_b32 s5, s4
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
@ -5065,53 +5071,55 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_sub_i32 s13, s12, 64
; GFX10-NEXT: s_and_b32 s14, 1, s1
; GFX10-NEXT: s_sub_i32 s15, 64, s12
; GFX10-NEXT: s_sub_i32 s2, 64, s12
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cmp_lt_u32 s12, 64
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[6:7], 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s12, 0
; GFX10-NEXT: s_cmp_lt_u32 s12, 64
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s14
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[8:9], s12
; GFX10-NEXT: s_lshl_b64 s[4:5], s[10:11], s15
; GFX10-NEXT: s_cselect_b32 s15, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s12, 0
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: s_ashr_i32 s6, s11, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[10:11], s12
; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s13
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: s_mov_b32 s7, s6
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: s_lshr_b64 s[0:1], s[8:9], s12
; GFX10-NEXT: s_lshl_b64 s[2:3], s[10:11], s2
; GFX10-NEXT: s_ashr_i64 s[4:5], s[10:11], s12
; GFX10-NEXT: s_and_b32 s12, s15, 1
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
; GFX10-NEXT: s_ashr_i64 s[6:7], s[10:11], s13
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[6:7]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: s_and_b32 s6, s16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: s_cselect_b64 s[0:1], s[8:9], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[4:5], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s8
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s1, vcc_lo
; GFX10-NEXT: s_addc_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX10-NEXT: s_and_b32 s4, s4, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
; GFX10-NEXT: s_addc_u32 s3, s3, 0x80000000
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
@ -6101,9 +6109,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_ashr_i32 s8, s19, 31
; GFX6-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX6-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX6-NEXT: s_and_b32 s23, s23, 1
; GFX6-NEXT: s_cmp_lg_u32 s23, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s24, 0
; GFX6-NEXT: s_and_b32 s10, s24, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
; GFX6-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX6-NEXT: s_cmp_lg_u32 s23, 0
; GFX6-NEXT: s_mov_b32 s9, s8
@ -6175,9 +6185,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: s_ashr_i32 s8, s3, 31
; GFX6-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX6-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX6-NEXT: s_and_b32 s12, s12, 1
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX6-NEXT: s_cmp_lg_u32 s13, 0
; GFX6-NEXT: s_and_b32 s10, s13, 1
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
; GFX6-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX6-NEXT: s_mov_b32 s9, s8
; GFX6-NEXT: s_cmp_lg_u32 s12, 0
@ -6271,9 +6283,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_ashr_i32 s8, s19, 31
; GFX8-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX8-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX8-NEXT: s_and_b32 s23, s23, 1
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
; GFX8-NEXT: s_and_b32 s10, s24, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
; GFX8-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
; GFX8-NEXT: s_mov_b32 s9, s8
@ -6352,9 +6366,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: s_ashr_i32 s8, s3, 31
; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX8-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX8-NEXT: s_and_b32 s12, s12, 1
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX8-NEXT: s_cmp_lg_u32 s13, 0
; GFX8-NEXT: s_and_b32 s10, s13, 1
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
; GFX8-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX8-NEXT: s_mov_b32 s9, s8
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
@ -6447,9 +6463,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_ashr_i32 s8, s19, 31
; GFX9-NEXT: s_ashr_i64 s[0:1], s[18:19], s20
; GFX9-NEXT: s_ashr_i64 s[10:11], s[18:19], s21
; GFX9-NEXT: s_and_b32 s23, s23, 1
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s24, 0
; GFX9-NEXT: s_and_b32 s10, s24, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
; GFX9-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
; GFX9-NEXT: s_mov_b32 s9, s8
@ -6528,9 +6546,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: s_ashr_i32 s8, s3, 31
; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], s20
; GFX9-NEXT: s_ashr_i64 s[10:11], s[2:3], s21
; GFX9-NEXT: s_and_b32 s12, s12, 1
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[6:7], s[10:11]
; GFX9-NEXT: s_cmp_lg_u32 s13, 0
; GFX9-NEXT: s_and_b32 s10, s13, 1
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
; GFX9-NEXT: s_cselect_b64 s[6:7], s[0:1], s[6:7]
; GFX9-NEXT: s_mov_b32 s9, s8
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
@ -6576,15 +6596,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
;
; GFX10-LABEL: s_ssubsat_v2i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_sub_u32 s16, s0, s8
; GFX10-NEXT: s_sub_u32 s28, s0, s8
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: s_mov_b32 s46, s0
; GFX10-NEXT: s_and_b32 s17, s17, 1
; GFX10-NEXT: s_mov_b32 s47, s1
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_subb_u32 s17, s1, s9
; GFX10-NEXT: s_subb_u32 s29, s1, s9
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[46:47]
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[28:29], s[46:47]
; GFX10-NEXT: s_and_b32 s18, s18, 1
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
; GFX10-NEXT: s_subb_u32 s30, s2, s10
@ -6604,132 +6624,136 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_sub_i32 s21, s20, 64
; GFX10-NEXT: s_sub_i32 s22, 64, s20
; GFX10-NEXT: s_cmp_lt_u32 s20, 64
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s20, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: s_cselect_b32 s24, 1, 0
; GFX10-NEXT: s_lshr_b64 s[2:3], s[16:17], s20
; GFX10-NEXT: s_lshl_b64 s[8:9], s[30:31], s22
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: s_cselect_b32 s23, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_ashr_i32 s10, s31, 31
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s21
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[8:9]
; GFX10-NEXT: s_lshr_b64 s[0:1], s[28:29], s20
; GFX10-NEXT: s_lshl_b64 s[2:3], s[30:31], s22
; GFX10-NEXT: s_and_b32 s24, s10, 1
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_ashr_i32 s2, s31, 31
; GFX10-NEXT: s_ashr_i64 s[8:9], s[30:31], s20
; GFX10-NEXT: s_ashr_i64 s[10:11], s[30:31], s21
; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[2:3], s[16:17], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s23, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s17
; GFX10-NEXT: s_mov_b32 s3, s2
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: s_and_b32 s10, s23, 1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s29
; GFX10-NEXT: s_cselect_b64 s[0:1], s[28:29], s[0:1]
; GFX10-NEXT: s_cmp_lg_u32 s24, 0
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: s_cselect_b64 s[2:3], s[8:9], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s28
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s16
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_brev_b32 s23, 1
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_mov_b32_e32 v3, s31
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s23
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s2, vcc_lo
; GFX10-NEXT: s_sub_u32 s2, s4, s12
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, vcc_lo
; GFX10-NEXT: s_and_b32 s3, s3, 1
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
; GFX10-NEXT: s_subb_u32 s3, s5, s13
; GFX10-NEXT: s_addc_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[2:3], s[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s30
; GFX10-NEXT: s_addc_u32 s3, s3, s23
; GFX10-NEXT: s_sub_u32 s0, s4, s12
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
; GFX10-NEXT: s_and_b32 s1, s1, 1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
; GFX10-NEXT: s_subb_u32 s1, s5, s13
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
; GFX10-NEXT: s_and_b32 s8, s8, 1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
; GFX10-NEXT: s_subb_u32 s8, s6, s14
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s1
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
; GFX10-NEXT: s_and_b32 s9, s9, 1
; GFX10-NEXT: v_mov_b32_e32 v7, s8
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
; GFX10-NEXT: s_subb_u32 s9, s7, s15
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[6:7]
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
; GFX10-NEXT: v_mov_b32_e32 v8, s9
; GFX10-NEXT: s_and_b32 s0, 1, s0
; GFX10-NEXT: s_and_b32 s2, 1, s2
; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[12:13], 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s1
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_and_b32 s16, 1, s1
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[12:13], 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
; GFX10-NEXT: s_and_b32 s16, 1, s3
; GFX10-NEXT: s_cmp_lt_u32 s20, 64
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[14:15], 0
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s2
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[14:15], 0
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
; GFX10-NEXT: s_cmp_eq_u32 s20, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s16
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
; GFX10-NEXT: s_lshr_b64 s[4:5], s[2:3], s20
; GFX10-NEXT: s_lshl_b64 s[6:7], s[8:9], s22
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: s_ashr_i32 s10, s9, 31
; GFX10-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_ashr_i64 s[0:1], s[8:9], s20
; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s21
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: s_mov_b32 s11, s10
; GFX10-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
; GFX10-NEXT: s_lshl_b64 s[4:5], s[8:9], s22
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2
; GFX10-NEXT: s_lshr_b64 s[2:3], s[0:1], s20
; GFX10-NEXT: s_and_b32 s13, s10, 1
; GFX10-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5]
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: s_ashr_i64 s[6:7], s[8:9], s20
; GFX10-NEXT: s_ashr_i64 s[10:11], s[8:9], s21
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: s_cselect_b64 s[2:3], s[2:3], s[10:11]
; GFX10-NEXT: s_and_b32 s10, s12, 1
; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
; GFX10-NEXT: s_cselect_b64 s[4:5], s[2:3], s[4:5]
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
; GFX10-NEXT: v_mov_b32_e32 v6, s3
; GFX10-NEXT: s_cselect_b64 s[0:1], s[0:1], s[10:11]
; GFX10-NEXT: s_add_u32 s4, s4, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
; GFX10-NEXT: v_mov_b32_e32 v6, s1
; GFX10-NEXT: s_cselect_b64 s[2:3], s[0:1], s[2:3]
; GFX10-NEXT: s_cmp_lg_u32 s13, 0
; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_mov_b32_e32 v5, s2
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5]
; GFX10-NEXT: s_add_u32 s2, s2, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_mov_b32_e32 v5, s0
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s0, s0, 0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: s_addc_u32 s1, s1, s23
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s0, vcc_lo
; GFX10-NEXT: s_addc_u32 s4, s4, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
; GFX10-NEXT: s_and_b32 s6, s6, 1
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
; GFX10-NEXT: s_addc_u32 s1, s5, s23
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
; GFX10-NEXT: v_readfirstlane_b32 s6, v6