forked from OSchip/llvm-project
GlobalISel: Implement computeKnownBits for overflow bool results
This commit is contained in:
parent
eee82dc66d
commit
1416744f84
|
@ -567,6 +567,26 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
|
|||
Known = KnownBits::ashr(KnownBits::shl(Known, ShiftKnown), ShiftKnown);
|
||||
break;
|
||||
}
|
||||
case TargetOpcode::G_UADDO:
|
||||
case TargetOpcode::G_UADDE:
|
||||
case TargetOpcode::G_SADDO:
|
||||
case TargetOpcode::G_SADDE:
|
||||
case TargetOpcode::G_USUBO:
|
||||
case TargetOpcode::G_USUBE:
|
||||
case TargetOpcode::G_SSUBO:
|
||||
case TargetOpcode::G_SSUBE:
|
||||
case TargetOpcode::G_UMULO:
|
||||
case TargetOpcode::G_SMULO: {
|
||||
if (MI.getOperand(1).getReg() == R) {
|
||||
// If we know the overflow result has the top bits zero, use this
|
||||
// info.
|
||||
if (TL.getBooleanContents(DstTy.isVector(), false) ==
|
||||
TargetLowering::ZeroOrOneBooleanContent &&
|
||||
BitWidth > 1)
|
||||
Known.Zero.setBitsFrom(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
|
||||
|
@ -673,6 +693,27 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
|
|||
MI.getOperand(3).getReg(), DemandedElts,
|
||||
Depth + 1);
|
||||
}
|
||||
case TargetOpcode::G_SADDO:
|
||||
case TargetOpcode::G_SADDE:
|
||||
case TargetOpcode::G_UADDO:
|
||||
case TargetOpcode::G_UADDE:
|
||||
case TargetOpcode::G_SSUBO:
|
||||
case TargetOpcode::G_SSUBE:
|
||||
case TargetOpcode::G_USUBO:
|
||||
case TargetOpcode::G_USUBE:
|
||||
case TargetOpcode::G_SMULO:
|
||||
case TargetOpcode::G_UMULO: {
|
||||
// If the overflow result is 0/-1, all bits are sign bits.
|
||||
// We know that we have an integer-based boolean since these operations
|
||||
// are only available for integer.
|
||||
if (MI.getOperand(1).getReg() == R) {
|
||||
if (TL.getBooleanContents(DstTy.isVector(), false) ==
|
||||
TargetLowering::ZeroOrNegativeOneBooleanContent)
|
||||
return TyBits;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case TargetOpcode::G_INTRINSIC:
|
||||
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
|
||||
default: {
|
||||
|
|
|
@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s1
|
||||
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX7-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_uaddo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
%uaddo = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
|
||||
|
@ -488,9 +485,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX7-LABEL: s_uaddo_i64:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX7-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -506,9 +500,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX8-LABEL: s_uaddo_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -524,9 +515,6 @@ define amdgpu_ps i64 @s_uaddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX9-LABEL: s_uaddo_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX7-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX7-NEXT: s_add_u32 s1, s1, s3
|
||||
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX7-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX7-NEXT: s_add_i32 s0, s0, s2
|
||||
; GFX7-NEXT: s_add_i32 s1, s1, s3
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
|
@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_add_u32 s1, s1, s3
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, s3
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
|
@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_uaddo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_add_u32 s1, s1, s3
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_add_i32 s0, s0, s2
|
||||
; GFX9-NEXT: s_add_i32 s1, s1, s3
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
|
@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX7-LABEL: s_saddo_i64:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX7-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX8-LABEL: s_saddo_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_saddo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX9-LABEL: s_saddo_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
|
|
@ -31,9 +31,6 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
|
|||
; GFX: ; %bb.0:
|
||||
; GFX-NEXT: s_ashr_i32 s2, s1, 31
|
||||
; GFX-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX-NEXT: s_mov_b32 s3, s2
|
||||
; GFX-NEXT: s_addc_u32 s1, s1, s2
|
||||
; GFX-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -4217,9 +4217,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX6-LABEL: s_saddsat_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4243,9 +4240,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX8-LABEL: s_saddsat_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4269,9 +4263,6 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX9-LABEL: s_saddsat_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4295,15 +4286,12 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX10-LABEL: s_saddsat_i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s4, s0, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: s_xor_b32 s2, s2, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
|
@ -4559,9 +4547,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX6-LABEL: s_saddsat_v2i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s8, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_addc_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4572,16 +4557,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX6-NEXT: s_brev_b32 s5, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX6-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX6-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4608,9 +4590,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX8-LABEL: s_saddsat_v2i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s8, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_addc_u32 s9, s1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4621,16 +4600,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX8-NEXT: s_brev_b32 s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX8-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4657,9 +4633,6 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX9-LABEL: s_saddsat_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s8, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_addc_u32 s9, s1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4670,16 +4643,13 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX9-NEXT: s_brev_b32 s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX9-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX9-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4706,32 +4676,26 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX10-LABEL: s_saddsat_v2i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s8, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: s_addc_u32 s9, s1, s5
|
||||
; GFX10-NEXT: s_brev_b32 s10, 1
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: s_brev_b32 s10, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX10-NEXT: s_xor_b32 s8, s4, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, s10
|
||||
; GFX10-NEXT: s_add_u32 s4, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s3, s7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
|
||||
; GFX10-NEXT: s_xor_b32 s2, s3, s2
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2
|
||||
|
@ -4750,19 +4714,10 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX6-LABEL: s_saddsat_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s4, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s1, s5
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: s_addc_u32 s5, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s9, s3, s7
|
||||
|
@ -4779,15 +4734,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4812,18 +4761,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-LABEL: s_saddsat_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s4, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s1, s5
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_addc_u32 s9, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -4845,17 +4785,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4880,18 +4814,9 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-LABEL: s_saddsat_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s4, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s1, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_addc_u32 s9, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -4913,17 +4838,11 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4948,60 +4867,45 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX10-LABEL: s_saddsat_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s4, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s10, s[6:7], 0
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s1, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s8, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], s[0:1]
|
||||
; GFX10-NEXT: s_addc_u32 s9, s3, s7
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[8:9], s[2:3]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s10
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: s_and_b32 s1, 1, s1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
|
||||
; GFX10-NEXT: s_mov_b32 s1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
|
||||
|
@ -5527,19 +5431,10 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-LABEL: s_saddsat_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX6-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX6-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX6-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX6-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s17, s3, s11
|
||||
|
@ -5551,50 +5446,35 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1]
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], 0
|
||||
; GFX6-NEXT: s_brev_b32 s10, 1
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
|
||||
; GFX6-NEXT: s_ashr_i32 s0, s17, 31
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_brev_b32 s10, 1
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s17
|
||||
; GFX6-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s3, s7, s15
|
||||
|
@ -5611,15 +5491,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX6-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX6-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s7, s4, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5648,18 +5522,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-LABEL: s_saddsat_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX8-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX8-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX8-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_addc_u32 s17, s3, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -5681,46 +5546,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_brev_b32 s10, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_brev_b32 s10, 1
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX8-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s17
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: s_addc_u32 s3, s7, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -5742,17 +5592,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
|
||||
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX8-NEXT: s_mov_b32 s5, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX8-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX8-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX8-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s7, s4, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5781,18 +5625,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-LABEL: s_saddsat_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX9-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX9-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX9-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_addc_u32 s17, s3, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -5814,46 +5649,31 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_brev_b32 s10, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_brev_b32 s10, 1
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GFX9-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s17
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-NEXT: s_addc_u32 s3, s7, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -5875,17 +5695,11 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[4:5]
|
||||
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX9-NEXT: s_mov_b32 s5, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX9-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX9-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s7, s4, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5914,25 +5728,16 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX10-LABEL: s_saddsat_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s8, s0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX10-NEXT: s_addc_u32 s9, s1, s9
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX10-NEXT: s_addc_u32 s16, s2, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_addc_u32 s17, s3, s11
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s17
|
||||
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s17
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s18
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
|
||||
|
@ -5940,91 +5745,70 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: s_and_b32 s1, 1, s1
|
||||
; GFX10-NEXT: s_brev_b32 s10, 1
|
||||
; GFX10-NEXT: s_ashr_i32 s2, s17, 31
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
|
||||
; GFX10-NEXT: s_mov_b32 s1, 0
|
||||
; GFX10-NEXT: s_brev_b32 s11, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s17, 31
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_mov_b32 s0, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s2, 0
|
||||
; GFX10-NEXT: s_addc_u32 s10, s2, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s2, s11
|
||||
; GFX10-NEXT: s_add_u32 s12, s4, s12
|
||||
; GFX10-NEXT: s_addc_u32 s13, s5, s13
|
||||
; GFX10-NEXT: s_addc_u32 s18, s6, s14
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5]
|
||||
; GFX10-NEXT: s_addc_u32 s19, s7, s15
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[6:7]
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, s10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s16
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[14:15], 0
|
||||
; GFX10-NEXT: s_addc_u32 s8, s6, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s8
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
|
||||
; GFX10-NEXT: s_addc_u32 s9, s7, s15
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s9
|
||||
; GFX10-NEXT: s_and_b32 s2, 1, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7]
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, 1, s3
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s2, 0, s3
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, 0, s2
|
||||
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s1
|
||||
; GFX10-NEXT: s_addc_u32 s3, s2, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
|
||||
; GFX10-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
|
||||
; GFX10-NEXT: s_addc_u32 s4, s2, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s2, s10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX10-NEXT: s_and_b32 s4, 1, s4
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s13
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s19
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo
|
||||
; GFX10-NEXT: s_mov_b32 s1, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s19, 31
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s12
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, s11
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v6
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
|
||||
|
|
|
@ -208,14 +208,8 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
|
||||
; CHECK-NEXT: s_ashr_i32 s8, s5, 31
|
||||
; CHECK-NEXT: s_add_u32 s0, s2, s6
|
||||
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s1, s1, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; CHECK-NEXT: s_addc_u32 s1, s3, s6
|
||||
; CHECK-NEXT: s_add_u32 s10, s4, s8
|
||||
; CHECK-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s3, s3, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; CHECK-NEXT: s_mov_b32 s9, s8
|
||||
; CHECK-NEXT: s_addc_u32 s11, s5, s8
|
||||
; CHECK-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9]
|
||||
|
@ -226,21 +220,18 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: s_sub_u32 s0, 0, s10
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
|
||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s1, s1, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; CHECK-NEXT: s_subb_u32 s1, 0, s11
|
||||
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; CHECK-NEXT: v_trunc_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; CHECK-NEXT: s_subb_u32 s1, 0, s11
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1
|
||||
; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0
|
||||
; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0
|
||||
; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4
|
||||
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2
|
||||
|
@ -1196,43 +1187,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s10, 0x1000
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: s_mov_b32 s6, 0
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s6
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s9
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
|
||||
|
@ -1256,7 +1242,6 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
|
@ -1327,15 +1312,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
|
||||
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
|
||||
|
@ -1347,25 +1329,22 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s7
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
|
@ -1912,43 +1891,38 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: s_mov_b32 s6, 0
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s6
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s9
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
|
||||
|
@ -1972,7 +1946,6 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
|
@ -2043,15 +2016,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
|
||||
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
|
||||
|
@ -2063,25 +2033,22 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s7
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
|
|
|
@ -150,14 +150,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
|
||||
; GFX8-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX8-NEXT: s_add_u32 s8, s10, s12
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_mov_b32 s13, s12
|
||||
; GFX8-NEXT: s_addc_u32 s9, s11, s12
|
||||
; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
|
||||
|
@ -169,8 +163,7 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_sub_u32 s0, 0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -178,8 +171,6 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
|
||||
|
@ -329,14 +320,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX9-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX9-NEXT: s_ashr_i32 s12, s11, 31
|
||||
; GFX9-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX9-NEXT: s_add_u32 s8, s10, s12
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s13, s12
|
||||
; GFX9-NEXT: s_addc_u32 s9, s11, s12
|
||||
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
|
||||
|
@ -348,8 +333,8 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_sub_u32 s0, 0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s11
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -357,27 +342,24 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
|
||||
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, s11
|
||||
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
|
||||
; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
|
@ -499,27 +481,18 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX10-NEXT: s_ashr_i32 s12, s11, 31
|
||||
; GFX10-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s13, s12
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX10-NEXT: s_add_u32 s8, s10, s12
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_mov_b32 s13, s12
|
||||
; GFX10-NEXT: s_addc_u32 s9, s11, s12
|
||||
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
|
||||
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8
|
||||
; GFX10-NEXT: s_sub_u32 s10, 0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: s_subb_u32 s11, 0, s9
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
|
@ -1335,14 +1308,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX8-NEXT: s_ashr_i32 s6, s13, 31
|
||||
; GFX8-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX8-NEXT: s_add_u32 s8, s12, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_mov_b32 s7, s6
|
||||
; GFX8-NEXT: s_addc_u32 s9, s13, s6
|
||||
; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
|
||||
|
@ -1354,8 +1321,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_sub_u32 s0, 0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -1363,8 +1329,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
|
||||
|
@ -1496,14 +1460,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: s_add_u32 s0, s10, s6
|
||||
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s11, s6
|
||||
; GFX8-NEXT: s_add_u32 s10, s14, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_mov_b32 s9, s8
|
||||
; GFX8-NEXT: s_addc_u32 s11, s15, s8
|
||||
; GFX8-NEXT: s_xor_b64 s[10:11], s[10:11], s[8:9]
|
||||
|
@ -1516,8 +1474,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v4, v4
|
||||
; GFX8-NEXT: s_sub_u32 s0, 0, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
|
||||
; GFX8-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
|
||||
; GFX8-NEXT: v_mul_f32_e32 v6, 0x2f800000, v4
|
||||
; GFX8-NEXT: v_trunc_f32_e32 v6, v6
|
||||
|
@ -1525,17 +1483,14 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_add_f32_e32 v4, v7, v4
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v4
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX8-NEXT: v_mul_lo_u32 v4, s1, v7
|
||||
; GFX8-NEXT: v_mul_lo_u32 v8, s0, v6
|
||||
; GFX8-NEXT: v_mul_hi_u32 v10, s0, v7
|
||||
; GFX8-NEXT: v_mul_lo_u32 v9, s0, v7
|
||||
; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
||||
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v10
|
||||
; GFX8-NEXT: v_xor_b32_e32 v2, s2, v2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s2
|
||||
; GFX8-NEXT: v_mul_lo_u32 v10, v6, v9
|
||||
; GFX8-NEXT: v_mul_lo_u32 v11, v7, v8
|
||||
; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s2, v3
|
||||
|
@ -1683,14 +1638,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX9-NEXT: s_ashr_i32 s6, s13, 31
|
||||
; GFX9-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX9-NEXT: s_add_u32 s8, s12, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s7, s6
|
||||
; GFX9-NEXT: s_addc_u32 s9, s13, s6
|
||||
; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
|
||||
|
@ -1702,8 +1651,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_sub_u32 s0, 0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -1711,27 +1659,24 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s9
|
||||
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
|
||||
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s13
|
||||
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
|
||||
; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v1, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, v0, v2
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
|
@ -1745,6 +1690,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_mul_lo_u32 v3, s0, v1
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
|
||||
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v7, s13
|
||||
; GFX9-NEXT: v_add3_u32 v2, v2, v3, v4
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2
|
||||
|
@ -1826,14 +1772,8 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: s_ashr_i32 s6, s11, 31
|
||||
; GFX9-NEXT: s_ashr_i32 s8, s15, 31
|
||||
; GFX9-NEXT: s_add_u32 s12, s10, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_addc_u32 s13, s11, s6
|
||||
; GFX9-NEXT: s_add_u32 s10, s14, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_mov_b32 s9, s8
|
||||
; GFX9-NEXT: s_addc_u32 s11, s15, s8
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
|
||||
|
@ -1858,14 +1798,11 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_add_f32_e32 v4, v6, v4
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GFX9-NEXT: s_cselect_b32 s14, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s14, s14, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s14, 0
|
||||
; GFX9-NEXT: s_subb_u32 s14, 0, s11
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4
|
||||
; GFX9-NEXT: v_mul_lo_u32 v7, s3, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v8, s3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v9, s3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
|
||||
|
@ -2015,321 +1952,303 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
|
||||
; GFX10-NEXT: s_add_u32 s0, s8, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s7, s6
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s9, s2
|
||||
; GFX10-NEXT: s_add_u32 s8, s12, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_mov_b32 s7, s6
|
||||
; GFX10-NEXT: s_addc_u32 s9, s13, s6
|
||||
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7]
|
||||
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s9
|
||||
; GFX10-NEXT: s_sub_u32 s20, 0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8
|
||||
; GFX10-NEXT: s_and_b32 s12, s12, 1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s12, 0
|
||||
; GFX10-NEXT: s_subb_u32 s21, 0, s9
|
||||
; GFX10-NEXT: s_ashr_i32 s12, s11, 31
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s8
|
||||
; GFX10-NEXT: s_xor_b64 s[18:19], s[2:3], s[6:7]
|
||||
; GFX10-NEXT: s_ashr_i32 s16, s15, 31
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
|
||||
; GFX10-NEXT: s_add_u32 s6, s10, s12
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s17, s16
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_mov_b32 s13, s12
|
||||
; GFX10-NEXT: s_addc_u32 s7, s11, s12
|
||||
; GFX10-NEXT: s_add_u32 s10, s14, s16
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX10-NEXT: s_mov_b32 s17, s16
|
||||
; GFX10-NEXT: s_addc_u32 s11, s15, s16
|
||||
; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13]
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: s_xor_b64 s[10:11], s[10:11], s[16:17]
|
||||
; GFX10-NEXT: s_mov_b32 s13, s12
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11
|
||||
; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s10
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s10
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: s_xor_b64 s[14:15], s[6:7], s[12:13]
|
||||
; GFX10-NEXT: s_sub_u32 s3, 0, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
|
||||
; GFX10-NEXT: s_subb_u32 s6, 0, s11
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
|
||||
; GFX10-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, 0xcf800000, v2
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
|
||||
; GFX10-NEXT: s_subb_u32 s6, 0, s11
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v4, 0x2f800000, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v3, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s20, v2
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s20, v2
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v3, v4
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s21, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, s20, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s20, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1
|
||||
; GFX10-NEXT: v_add3_u32 v3, v4, v3, v5
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v4, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, v2, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, v0, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, v0, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3
|
||||
; GFX10-NEXT: v_mul_f32_e32 v9, 0xcf800000, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, v0, v3
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, v9, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s7, v10, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v7
|
||||
; GFX10-NEXT: v_mul_f32_e32 v4, 0xcf800000, v3
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s21, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s20, v0
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, v4, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s20, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s3, v3
|
||||
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s3, v4
|
||||
; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, v2, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, s6, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, s3, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, v0, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, s3, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
|
||||
; GFX10-NEXT: v_add3_u32 v7, v7, v8, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s7, v13, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, v3, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v15, v1, v7
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v11
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, s6, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v13, s3, v1
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, v8, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s3, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v6, v5
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, v1, v10
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, v3, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
|
||||
; GFX10-NEXT: v_add3_u32 v8, v12, v9, v13
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, v4, v11
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, v1, v11
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, v4, v11
|
||||
; GFX10-NEXT: v_add3_u32 v3, v7, v6, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, v1, v8
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, v4, v8
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, v1, v8
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, s21, v0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s7, v9, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v13, s20, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v14, s20, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s7, v7, v11
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v14
|
||||
; GFX10-NEXT: v_mul_lo_u32 v14, v3, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s7, v6, v10
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s20, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v12, v6
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v11, v13, v11
|
||||
; GFX10-NEXT: v_mul_hi_u32 v16, v1, v7
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s7, v14, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s7, v4, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v7, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
|
||||
; GFX10-NEXT: v_add3_u32 v12, v12, v14, v13
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v8, v4, v8
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, v0, v12
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v5, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, v0, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s7
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, v2, v12
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s7, v10, v11
|
||||
; GFX10-NEXT: v_add3_u32 v6, v7, v6, v8
|
||||
; GFX10-NEXT: v_mul_hi_u32 v14, v0, v12
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, s6, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s6, v5, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v3, s7, v13, v3
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, s3, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v3, s6, v3, v14
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s3, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s7, v8, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, v2, v12
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v5
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v8, v8, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: v_add3_u32 v9, v10, v13, v11
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, v4, v6
|
||||
; GFX10-NEXT: v_add3_u32 v5, v8, v5, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, v1, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, v1, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, v4, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v9, s7, v10, v16
|
||||
; GFX10-NEXT: v_add3_u32 v5, v11, v6, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, v12, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v9
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s1, v0
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s3, v10, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s0, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s7, v9, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v13, v10
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s20, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s21, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s20, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s20, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s7
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v4, v2, v5
|
||||
; GFX10-NEXT: v_add3_u32 v6, v6, v8, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, v2, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v8, v0, v5
|
||||
; GFX10-NEXT: v_add3_u32 v5, v9, v11, v10
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s6, v1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s3, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, v0, v5
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, v2, v5
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s3, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s3, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v12
|
||||
; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s3, v8, v6
|
||||
; GFX10-NEXT: v_mul_hi_u32 v12, s0, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v14, s1, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v3, s3, v6, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v13
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s3, v13, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s3, v14, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v15, s0, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s3, v5, v12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v15
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, v11, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, v4, v9
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s3, v0, v5
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v8, v13, v12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v3, s3, v3, v7
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s3, v7, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
|
||||
; GFX10-NEXT: v_add3_u32 v2, v8, v5, v2
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
|
||||
; GFX10-NEXT: v_add3_u32 v5, v6, v7, v9
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s8, v2
|
||||
; GFX10-NEXT: v_mul_lo_u32 v3, s8, v0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v5, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s15, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s15, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s14, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v17, s14, v4
|
||||
; GFX10-NEXT: v_add3_u32 v6, v6, v9, v7
|
||||
; GFX10-NEXT: v_sub_co_u32 v3, vcc_lo, s0, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, s14, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s15, v4
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v11, s1, v6
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s0, s1, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v4, s15, v4
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v3, s8
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v6
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, v13, v12, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, -1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s0, v5, v7
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v15, v3, v6
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v10, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, v1, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v16, v1, v6
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v8, v11, v8
|
||||
; GFX10-NEXT: v_mul_hi_u32 v6, v3, v6
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, v3, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, v1, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v11, s3, v15, v12
|
||||
; GFX10-NEXT: v_add3_u32 v5, v8, v7, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s3, v13, v6
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s3, v11, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s3, v6, v10
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s1, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s0, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s1, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s1, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v12, v7
|
||||
; GFX10-NEXT: v_mul_hi_u32 v12, s0, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s3, v6, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s3, v11, v10
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s6, v6, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s3, v10, v12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s3, v4, v7
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s3, v10, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, v13, v5
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v6, v11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s3
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
|
||||
; GFX10-NEXT: v_add3_u32 v5, v5, v7, v9
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s9, v0
|
||||
; GFX10-NEXT: v_add3_u32 v2, v6, v8, v2
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, s15, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s8, v0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, s8, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s8, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, s14, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s14, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, s15, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v13, s14, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, s15, v3
|
||||
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX10-NEXT: v_add3_u32 v4, v4, v6, v5
|
||||
; GFX10-NEXT: v_sub_co_u32 v5, vcc_lo, s0, v8
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v7, v11
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v8, s1, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v10
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s1, v5, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v17
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v17, s0, v0, 1
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v5
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s9, v4
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s1, v6, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s8
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s0, 0, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v7, v6
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, v14, v11, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, -1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v12, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v13
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v13, s0, v0, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v1, v7, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v15
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v5, v5, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v9, s0, v9, v1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v16
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v9, v9, v12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v17, v14, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s0, v17, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v13, s0, 0, v18, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v12, s0, v13, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v18, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v7
|
||||
; GFX10-NEXT: v_add3_u32 v4, v5, v1, v4
|
||||
; GFX10-NEXT: v_sub_co_u32 v1, s0, v14, s8
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v5, s0, 0, v11, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v11, v18, v13, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s11, v9
|
||||
; GFX10-NEXT: v_mul_lo_u32 v16, s10, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v17, s10, v9
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v12
|
||||
; GFX10-NEXT: v_add3_u32 v3, v9, v1, v3
|
||||
; GFX10-NEXT: v_sub_co_u32 v1, s0, v15, s8
|
||||
; GFX10-NEXT: v_mul_hi_u32 v17, s10, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s11, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v14, s10, v3
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, s10, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v14, v1, s0
|
||||
; GFX10-NEXT: v_add3_u32 v10, v13, v16, v17
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v15, v5, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v7, s10, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
|
||||
; GFX10-NEXT: v_add3_u32 v9, v13, v14, v17
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v16, v8, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v7, s0, s14, v7
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v10, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc_lo
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v10
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v11, s1, s15, v9, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v1, s15, v9
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v11
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, s18, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s19, v2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v5, s2, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, vcc_lo, s11, v1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, vcc_lo, s11, v1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v13, vcc_lo, v7, s10
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v10, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s0, 0, v9, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v0, s0, v0, s18
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v11
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s2, v3
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, v12, s0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s2, v5
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, s11, v9, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v8, v12, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v13
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v15, s0, v9, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v4, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v15, s0, v6, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v12, s0, v15, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
|
||||
; GFX10-NEXT: v_sub_co_u32 v6, s0, v13, s10
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
|
||||
; GFX10-NEXT: v_sub_co_u32 v8, s0, v13, s10
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v9, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v12, v15, v12, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v17, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v6, v14, v10, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v12, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, v4, v15, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v6, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v13, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v8, v14, v9, vcc_lo
|
||||
; GFX10-NEXT: v_xor_b32_e32 v9, s2, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v12, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v5, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v11, v8, s0
|
||||
; GFX10-NEXT: s_xor_b64 s[0:1], s[12:13], s[16:17]
|
||||
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v2, s2
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s0, v9
|
||||
; GFX10-NEXT: v_xor_b32_e32 v7, s1, v10
|
||||
; GFX10-NEXT: v_xor_b32_e32 v9, s12, v3
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_xor_b32_e32 v10, s12, v6
|
||||
; GFX10-NEXT: v_xor_b32_e32 v2, s0, v6
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, s1, v3
|
||||
; GFX10-NEXT: v_xor_b32_e32 v6, s12, v7
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v5, vcc_lo, s2, v9, vcc_lo
|
||||
; GFX10-NEXT: v_xor_b32_e32 v7, s12, v8
|
||||
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v2, s0
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v7, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v9, s12
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v10, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v6, s12
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[6:7]
|
||||
; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5]
|
||||
; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%div = sdiv <2 x i64> %x, %y
|
||||
store <2 x i64> %div, <2 x i64> addrspace(1)* %out0
|
||||
|
|
|
@ -204,14 +204,8 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: s_ashr_i32 s6, s3, 31
|
||||
; CHECK-NEXT: s_ashr_i32 s0, s5, 31
|
||||
; CHECK-NEXT: s_add_u32 s10, s2, s6
|
||||
; CHECK-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s7, s7, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; CHECK-NEXT: s_addc_u32 s11, s3, s6
|
||||
; CHECK-NEXT: s_add_u32 s8, s4, s0
|
||||
; CHECK-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s3, s3, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; CHECK-NEXT: s_mov_b32 s1, s0
|
||||
; CHECK-NEXT: s_addc_u32 s9, s5, s0
|
||||
; CHECK-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1]
|
||||
|
@ -222,21 +216,18 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: s_sub_u32 s0, 0, s8
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
|
||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; CHECK-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; CHECK-NEXT: s_and_b32 s1, s1, 1
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; CHECK-NEXT: s_subb_u32 s1, 0, s9
|
||||
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; CHECK-NEXT: v_trunc_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; CHECK-NEXT: s_subb_u32 s1, 0, s9
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, s0, v1
|
||||
; CHECK-NEXT: v_mul_lo_u32 v2, s1, v0
|
||||
; CHECK-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; CHECK-NEXT: v_mul_hi_u32 v5, s0, v0
|
||||
; CHECK-NEXT: v_mul_lo_u32 v4, s0, v0
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
|
||||
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
|
||||
; CHECK-NEXT: v_mul_lo_u32 v3, v1, v4
|
||||
; CHECK-NEXT: v_mul_lo_u32 v5, v0, v2
|
||||
|
@ -1174,43 +1165,38 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_movk_i32 s10, 0x1000
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: s_mov_b32 s6, 0
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s6
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s9
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
|
||||
|
@ -1234,7 +1220,6 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
|
@ -1303,16 +1288,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
|
||||
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
|
||||
|
@ -1323,26 +1305,23 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s7
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
|
@ -1882,43 +1861,38 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: s_mov_b32 s6, 0
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: s_mov_b32 s7, s6
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s9
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s9
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, v5, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v5, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v6, v9
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
|
||||
|
@ -1942,7 +1916,6 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
|
@ -2011,16 +1984,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s9, v8
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v7
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], s9, v8
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
|
||||
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: s_add_u32 s4, s10, 0
|
||||
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
|
||||
; GISEL-NEXT: s_addc_u32 s5, 0, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
|
||||
|
@ -2031,26 +2001,23 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s6
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s7
|
||||
; GISEL-NEXT: s_sub_u32 s4, 0, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s5, s5, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
|
||||
; GISEL-NEXT: s_subb_u32 s5, 0, s7
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
|
||||
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v4
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v7, s5, v5
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s4, v6
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, s4, v5
|
||||
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
|
||||
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s4, v5
|
||||
; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
|
||||
|
|
|
@ -4203,9 +4203,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX6-LABEL: s_ssubsat_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4229,9 +4226,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX8-LABEL: s_ssubsat_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4255,9 +4249,6 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX9-LABEL: s_ssubsat_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4281,15 +4272,12 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX10-LABEL: s_ssubsat_i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s3, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX10-NEXT: s_xor_b32 s2, s2, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
|
@ -4545,9 +4533,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX6-LABEL: s_ssubsat_v2i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4558,16 +4543,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX6-NEXT: s_brev_b32 s5, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX6-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX6-NEXT: s_subb_u32 s1, s3, s7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4594,9 +4576,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX8-LABEL: s_ssubsat_v2i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4607,16 +4586,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX8-NEXT: s_brev_b32 s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX8-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NEXT: s_subb_u32 s1, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4643,9 +4619,6 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX9-LABEL: s_ssubsat_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -4656,16 +4629,13 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
|
||||
; GFX9-NEXT: s_brev_b32 s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX9-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_addc_u32 s1, s4, s5
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX9-NEXT: s_subb_u32 s1, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -4692,32 +4662,26 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX10-LABEL: s_ssubsat_v2i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX10-NEXT: s_brev_b32 s10, 1
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_mov_b32 s11, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: s_brev_b32 s10, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GFX10-NEXT: s_xor_b32 s8, s4, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, s10
|
||||
; GFX10-NEXT: s_sub_u32 s4, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_subb_u32 s5, s3, s7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s5, 31
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8
|
||||
; GFX10-NEXT: s_xor_b32 s2, s3, s2
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2
|
||||
|
@ -4736,19 +4700,10 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX6-LABEL: s_ssubsat_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX6-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX6-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX6-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3]
|
||||
; GFX6-NEXT: s_subb_u32 s11, s3, s7
|
||||
|
@ -4761,21 +4716,15 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
|
||||
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; GFX6-NEXT: s_ashr_i32 s0, s11, 31
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[6:7], 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4800,18 +4749,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-LABEL: s_ssubsat_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX8-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX8-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -4835,17 +4775,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s11, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4870,18 +4804,9 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-LABEL: s_ssubsat_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX9-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX9-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -4905,17 +4830,11 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s11, 31
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -4940,62 +4859,47 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX10-LABEL: s_ssubsat_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX10-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1]
|
||||
; GFX10-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[10:11], s[2:3]
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s11
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s1, s[4:5], 0
|
||||
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s12
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s1, 1, s1
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s11, 31
|
||||
; GFX10-NEXT: s_and_b32 s1, 1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
|
||||
; GFX10-NEXT: s_mov_b32 s1, 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs)
|
||||
|
@ -5553,19 +5457,10 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-LABEL: s_ssubsat_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX6-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX6-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX6-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX6-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3]
|
||||
; GFX6-NEXT: s_subb_u32 s19, s3, s11
|
||||
|
@ -5578,51 +5473,36 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
|
||||
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[10:11], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; GFX6-NEXT: s_ashr_i32 s0, s19, 31
|
||||
; GFX6-NEXT: s_mov_b32 s1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[10:11], 0
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_brev_b32 s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX6-NEXT: s_brev_b32 s8, 1
|
||||
; GFX6-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v4, s17
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX6-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_subb_u32 s3, s7, s15
|
||||
|
@ -5635,21 +5515,15 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
|
||||
; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], s[14:15], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX6-NEXT: s_mov_b32 s5, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX6-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e64 vcc, s[14:15], 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX6-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX6-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX6-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX6-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX6-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s7, s4, s8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5678,18 +5552,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-LABEL: s_ssubsat_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX8-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX8-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX8-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -5713,46 +5578,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_brev_b32 s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX8-NEXT: s_brev_b32 s8, 1
|
||||
; GFX8-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s17
|
||||
; GFX8-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: s_subb_u32 s3, s7, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -5776,17 +5626,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
|
||||
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX8-NEXT: s_mov_b32 s5, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX8-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX8-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX8-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX8-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX8-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX8-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s7, s4, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5815,18 +5659,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-LABEL: s_ssubsat_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX9-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX9-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX9-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
|
@ -5850,46 +5685,31 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
|
||||
; GFX9-NEXT: s_mov_b32 s1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_brev_b32 s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX9-NEXT: s_brev_b32 s8, 1
|
||||
; GFX9-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s3, s0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s16
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, s17
|
||||
; GFX9-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-NEXT: s_subb_u32 s3, s7, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -5913,17 +5733,11 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s4
|
||||
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
|
||||
; GFX9-NEXT: s_mov_b32 s5, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX9-NEXT: s_cselect_b32 s6, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s6, s6, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s6, 0
|
||||
; GFX9-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
; GFX9-NEXT: s_cselect_b32 s7, 1, 0
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX9-NEXT: s_and_b32 s7, s7, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s7, 0
|
||||
; GFX9-NEXT: s_addc_u32 s5, s4, 0
|
||||
; GFX9-NEXT: s_addc_u32 s6, s4, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s7, s4, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
|
@ -5952,120 +5766,90 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX10-LABEL: s_ssubsat_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX10-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
|
||||
; GFX10-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX10-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1]
|
||||
; GFX10-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
|
||||
; GFX10-NEXT: s_brev_b32 s21, 1
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
|
||||
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3]
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s20
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_mov_b32 s10, 0
|
||||
; GFX10-NEXT: s_mov_b32 s20, 0
|
||||
; GFX10-NEXT: s_and_b32 s1, 1, s1
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s19, 31
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX10-NEXT: s_brev_b32 s11, 1
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s19
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, s21
|
||||
; GFX10-NEXT: s_sub_u32 s8, s4, s12
|
||||
; GFX10-NEXT: s_subb_u32 s9, s5, s13
|
||||
; GFX10-NEXT: s_subb_u32 s10, s6, s14
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5]
|
||||
; GFX10-NEXT: s_subb_u32 s11, s7, s15
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7]
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7]
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, s11
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s3, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: v_cmp_gt_u64_e64 s4, s[12:13], 0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
|
||||
; GFX10-NEXT: s_subb_u32 s8, s6, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s3
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[14:15], 0
|
||||
; GFX10-NEXT: s_subb_u32 s9, s7, s15
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[8:9], s[6:7]
|
||||
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[8:9], s[6:7]
|
||||
; GFX10-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
|
||||
; GFX10-NEXT: s_and_b32 s2, 1, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4
|
||||
; GFX10-NEXT: s_and_b32 s4, 1, s16
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s3
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_ashr_i32 s2, s9, 31
|
||||
; GFX10-NEXT: s_and_b32 s3, 1, s3
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s3
|
||||
; GFX10-NEXT: s_addc_u32 s3, s2, 0
|
||||
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, s9
|
||||
; GFX10-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc_lo
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s1
|
||||
; GFX10-NEXT: s_addc_u32 s4, s2, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6
|
||||
; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_xor_b32_e32 v4, v5, v4
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s8
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
|
||||
; GFX10-NEXT: s_addc_u32 s1, s2, s11
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v6, s3, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, s4, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, s1, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_and_b32 s5, 1, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s18
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s19
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s9
|
||||
; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo
|
||||
; GFX10-NEXT: s_ashr_i32 s0, s11, 31
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, 1, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s8
|
||||
; GFX10-NEXT: s_addc_u32 s1, s0, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX10-NEXT: s_addc_u32 s2, s0, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s0, s21
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v6
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
|
||||
|
|
|
@ -457,7 +457,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_sub_u32 s0, s0, s1
|
||||
; GFX7-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX7-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
@ -465,7 +464,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s0, s0, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
@ -473,7 +471,6 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s0, s0, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_sub_i32 s0, s0, s1
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
%usubo = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
|
||||
|
@ -487,13 +484,10 @@ define amdgpu_ps i32 @s_usubo_i32(i32 inreg %a, i32 inreg %b) {
|
|||
define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
|
||||
; GFX7-LABEL: s_usubo_i64:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX7-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX7-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX7-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
|
@ -505,13 +499,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
;
|
||||
; GFX8-LABEL: s_usubo_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
||||
|
@ -523,13 +514,10 @@ define amdgpu_ps i64 @s_usubo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
;
|
||||
; GFX9-LABEL: s_usubo_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
|
@ -553,8 +541,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX7-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX7-NEXT: s_sub_u32 s1, s1, s3
|
||||
; GFX7-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX7-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX7-NEXT: s_sub_i32 s0, s0, s2
|
||||
; GFX7-NEXT: s_sub_i32 s1, s1, s3
|
||||
; GFX7-NEXT: ; return to shader part epilog
|
||||
|
@ -565,8 +551,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_sub_u32 s1, s1, s3
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_sub_i32 s0, s0, s2
|
||||
; GFX8-NEXT: s_sub_i32 s1, s1, s3
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
|
@ -577,8 +561,6 @@ define amdgpu_ps <2 x i32> @s_usubo_v2i32(<2 x i32> inreg %a, <2 x i32> inreg %b
|
|||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_sub_u32 s1, s1, s3
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_sub_i32 s0, s0, s2
|
||||
; GFX9-NEXT: s_sub_i32 s1, s1, s3
|
||||
; GFX9-NEXT: ; return to shader part epilog
|
||||
|
@ -728,9 +710,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX7-LABEL: s_ssubo_i64:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX7-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX7-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX7-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -748,9 +727,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX8-LABEL: s_ssubo_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -768,9 +744,6 @@ define amdgpu_ps i64 @s_ssubo_i64(i64 inreg %a, i64 inreg %b) {
|
|||
; GFX9-LABEL: s_ssubo_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
|
|
@ -2591,9 +2591,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX6-LABEL: s_uaddsat_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -2609,9 +2606,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX8-LABEL: s_uaddsat_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -2627,9 +2621,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX9-LABEL: s_uaddsat_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
|
@ -2645,9 +2636,6 @@ define amdgpu_ps i64 @s_uaddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX10-LABEL: s_uaddsat_i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, s3
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[0:1], s[2:3]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s2
|
||||
|
@ -2816,20 +2804,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX6-LABEL: s_uaddsat_v2i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX6-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
|
||||
|
@ -2848,20 +2830,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX8-LABEL: s_uaddsat_v2i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX8-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
|
||||
|
@ -2880,20 +2856,14 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX9-LABEL: s_uaddsat_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX9-NEXT: s_add_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: s_addc_u32 s1, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc
|
||||
|
@ -2912,23 +2882,17 @@ define amdgpu_ps <2 x i64> @s_uaddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX10-LABEL: s_uaddsat_v2i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, s7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, s4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, s5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
|
@ -2940,19 +2904,10 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX6-LABEL: s_uaddsat_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX6-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s3, s3, s7
|
||||
|
@ -2981,18 +2936,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-LABEL: s_uaddsat_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: s_addc_u32 s3, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -3025,18 +2971,9 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-LABEL: s_uaddsat_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -3069,26 +3006,17 @@ define amdgpu_ps i128 @s_uaddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX10-LABEL: s_uaddsat_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, s7
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s5, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s4, 1, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
|
||||
; GFX10-NEXT: s_and_b32 s4, 1, s8
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s5
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
|
@ -3450,19 +3378,10 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-LABEL: s_uaddsat_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_add_u32 s0, s0, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s9
|
||||
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX6-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX6-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX6-NEXT: s_addc_u32 s1, s1, s9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s3, s3, s11
|
||||
|
@ -3472,30 +3391,21 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_addc_u32 s3, s7, s15
|
||||
|
@ -3528,18 +3438,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-LABEL: s_uaddsat_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_add_u32 s0, s0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s1, s9
|
||||
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NEXT: s_addc_u32 s3, s3, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s10
|
||||
|
@ -3552,28 +3453,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: s_and_b32 s8, 1, s10
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
|
||||
|
@ -3612,18 +3504,9 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-LABEL: s_uaddsat_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_add_u32 s0, s0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s1, s9
|
||||
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX9-NEXT: s_addc_u32 s3, s3, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
||||
|
@ -3636,28 +3519,19 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: s_and_b32 s8, 1, s10
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_add_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_addc_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_addc_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc
|
||||
|
@ -3696,69 +3570,51 @@ define amdgpu_ps <2 x i128> @s_uaddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX10-LABEL: s_uaddsat_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_add_u32 s0, s0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX10-NEXT: s_addc_u32 s1, s1, s9
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX10-NEXT: s_addc_u32 s2, s2, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s16, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, s11
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s10, s[2:3], s[10:11]
|
||||
; GFX10-NEXT: s_cselect_b32 s16, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8
|
||||
; GFX10-NEXT: s_and_b32 s8, 1, s16
|
||||
; GFX10-NEXT: s_add_u32 s4, s4, s12
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s10
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: s_addc_u32 s5, s5, s13
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13]
|
||||
; GFX10-NEXT: s_addc_u32 s6, s6, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_and_b32 s8, s8, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s8, 0
|
||||
; GFX10-NEXT: s_addc_u32 s7, s7, s15
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15]
|
||||
; GFX10-NEXT: s_cselect_b32 s8, 1, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_and_b32 s8, 1, s8
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s1, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s2, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, s3, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s5, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s6, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s7, -1, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
|
||||
ret <2 x i128> %result
|
||||
|
|
|
@ -194,14 +194,11 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3
|
||||
; CHECK-NEXT: s_sub_u32 s4, 0, s2
|
||||
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2
|
||||
; CHECK-NEXT: s_and_b32 s5, s5, 1
|
||||
; CHECK-NEXT: s_subb_u32 s5, 0, s3
|
||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; CHECK-NEXT: s_subb_u32 s5, 0, s3
|
||||
; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
|
||||
; CHECK-NEXT: v_trunc_f32_e32 v2, v2
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
|
||||
|
|
|
@ -117,13 +117,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
|
||||
; GFX8-NEXT: s_sub_u32 s0, 0, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -140,19 +137,19 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2
|
||||
; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
|
||||
; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
|
||||
; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
|
||||
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
|
@ -269,13 +266,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10
|
||||
; GFX9-NEXT: s_sub_u32 s0, 0, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -296,16 +290,16 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3
|
||||
; GFX9-NEXT: v_add_u32_e32 v5, v5, v6
|
||||
; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
|
||||
|
@ -412,11 +406,8 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
|
|||
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11
|
||||
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10
|
||||
; GFX10-NEXT: s_sub_u32 s0, 0, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, 0, s11
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
|
@ -1026,13 +1017,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
|
||||
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
|
||||
; GFX8-NEXT: s_sub_u32 s0, 0, s12
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX8-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX8-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX8-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -1040,7 +1030,6 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX8-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX8-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX8-NEXT: v_mul_hi_u32 v5, s0, v0
|
||||
|
@ -1050,19 +1039,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_mul_lo_u32 v3, v1, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v5, v0, v2
|
||||
; GFX8-NEXT: v_mul_hi_u32 v6, v0, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX8-NEXT: v_mul_hi_u32 v4, v1, v4
|
||||
; GFX8-NEXT: v_mul_lo_u32 v7, v1, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
|
||||
; GFX8-NEXT: v_mul_hi_u32 v8, v0, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v6
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v3
|
||||
; GFX8-NEXT: v_mul_hi_u32 v5, v0, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v6
|
||||
; GFX8-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
|
@ -1171,23 +1160,19 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX8-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GFX8-NEXT: v_mul_f32_e32 v7, 0xcf800000, v6
|
||||
; GFX8-NEXT: v_add_f32_e32 v3, v7, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3
|
||||
; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GFX8-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX8-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[0:1]
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
|
||||
; GFX8-NEXT: v_mul_lo_u32 v7, s3, v3
|
||||
; GFX8-NEXT: v_mul_lo_u32 v8, s2, v6
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
|
||||
; GFX8-NEXT: v_mul_hi_u32 v10, s2, v3
|
||||
; GFX8-NEXT: v_mul_lo_u32 v9, s2, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
|
||||
; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v8
|
||||
; GFX8-NEXT: v_add_u32_e64 v7, s[0:1], v7, v10
|
||||
; GFX8-NEXT: v_mul_lo_u32 v8, v6, v9
|
||||
; GFX8-NEXT: v_mul_lo_u32 v10, v3, v7
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v2, vcc
|
||||
; GFX8-NEXT: v_mul_hi_u32 v2, v3, v9
|
||||
; GFX8-NEXT: v_mul_hi_u32 v9, v6, v9
|
||||
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v10
|
||||
|
@ -1318,13 +1303,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12
|
||||
; GFX9-NEXT: s_sub_u32 s0, 0, s12
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15
|
||||
; GFX9-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX9-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
|
||||
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
|
||||
|
@ -1332,14 +1317,12 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_add_f32_e32 v0, v2, v0
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s15
|
||||
; GFX9-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, s1, v0
|
||||
; GFX9-NEXT: v_mul_hi_u32 v4, s0, v0
|
||||
; GFX9-NEXT: v_mul_lo_u32 v5, s0, v0
|
||||
; GFX9-NEXT: v_mul_f32_e32 v14, 0x4f800000, v14
|
||||
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
|
||||
; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4
|
||||
; GFX9-NEXT: v_mul_lo_u32 v3, v1, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2
|
||||
|
@ -1349,16 +1332,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v5
|
||||
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v3, v6, v3
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3
|
||||
; GFX9-NEXT: v_add_u32_e32 v5, v5, v6
|
||||
; GFX9-NEXT: v_add_u32_e32 v5, v7, v5
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
|
||||
; GFX9-NEXT: v_add3_u32 v2, v5, v4, v2
|
||||
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
|
||||
|
@ -1455,20 +1438,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX9-NEXT: v_add_f32_e32 v5, v13, v5
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
|
||||
; GFX9-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX9-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
|
||||
; GFX9-NEXT: v_mul_lo_u32 v13, s3, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v14, s2, v12
|
||||
; GFX9-NEXT: v_mul_hi_u32 v16, s2, v5
|
||||
; GFX9-NEXT: v_mul_lo_u32 v17, s2, v5
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
|
||||
; GFX9-NEXT: v_add3_u32 v4, v13, v14, v16
|
||||
; GFX9-NEXT: v_mul_lo_u32 v9, v12, v17
|
||||
; GFX9-NEXT: v_mul_lo_u32 v13, v5, v4
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
|
||||
; GFX9-NEXT: v_mul_hi_u32 v10, v5, v17
|
||||
; GFX9-NEXT: v_mul_hi_u32 v14, v12, v17
|
||||
; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], v9, v13
|
||||
|
@ -1600,19 +1579,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX10-NEXT: s_sub_u32 s0, 0, s12
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX10-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX10-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
|
||||
; GFX10-NEXT: v_add_f32_e32 v1, v2, v3
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, 0, s13
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
|
||||
; GFX10-NEXT: s_sub_u32 s2, 0, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX10-NEXT: s_subb_u32 s3, 0, s15
|
||||
; GFX10-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
|
||||
|
@ -1690,174 +1663,174 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
|
|||
; GFX10-NEXT: v_mul_lo_u32 v11, s2, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s0, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s2, v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GFX10-NEXT: v_add3_u32 v5, v6, v5, v7
|
||||
; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
|
||||
; GFX10-NEXT: v_mul_lo_u32 v12, v2, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, v2, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, v0, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v13, v0, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v14, v0, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v4, v2, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, v2, v5
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, v3, v8
|
||||
; GFX10-NEXT: v_mul_lo_u32 v15, v1, v9
|
||||
; GFX10-NEXT: v_mul_lo_u32 v16, v1, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v8
|
||||
; GFX10-NEXT: v_mul_hi_u32 v8, v3, v8
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s0, v12, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4
|
||||
; GFX10-NEXT: v_mul_lo_u32 v16, v3, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v15
|
||||
; GFX10-NEXT: v_mul_hi_u32 v14, v0, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v15, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v13
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s0, v16, v8
|
||||
; GFX10-NEXT: v_mul_lo_u32 v17, v3, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s0, v13, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v14
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v10, v12, v10
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v11, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s0, v17, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s0, v10, v14
|
||||
; GFX10-NEXT: v_mul_hi_u32 v18, v1, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v10
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v18
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, v2, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v10
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v11, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v17, v1, v9
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v15, v6
|
||||
; GFX10-NEXT: v_add3_u32 v5, v7, v10, v5
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v16, v6
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v11, v11, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v9, v3, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s9, v0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v12, 0
|
||||
; GFX10-NEXT: v_add_co_u32 v8, s0, v8, v17
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v8, v6
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v10, v17, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
|
||||
; GFX10-NEXT: v_add3_u32 v5, v11, v7, v5
|
||||
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4
|
||||
; GFX10-NEXT: v_add3_u32 v4, v10, v8, v9
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v8, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s9, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v13, v11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s9, v2
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v4
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8
|
||||
; GFX10-NEXT: v_add3_u32 v5, v7, v5, v9
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s8, v2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s1, v6, v0
|
||||
; GFX10-NEXT: v_add_co_u32 v9, s0, v11, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s9, v0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s8, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s8, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v2, s9, v2
|
||||
; GFX10-NEXT: v_add_co_u32 v7, s0, v9, v7
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v0, v8, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s0, v7, v0
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v8, s10, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v9, s13, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v10, s12, v0
|
||||
; GFX10-NEXT: v_add3_u32 v2, v4, v5, v2
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v4, s11, v3
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s12, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v11, s12, v2
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v5, s10, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v4, v1
|
||||
; GFX10-NEXT: v_add3_u32 v9, v9, v11, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
|
||||
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, s8, v13
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v7
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v9
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s0, s9, v9, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v10
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v9
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v8, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, v10, s12
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v5
|
||||
; GFX10-NEXT: v_mul_hi_u32 v8, s10, v1
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v5, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v9
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v7, vcc_lo
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, -1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s0, v9, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v4, s0, v4, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, s10, v3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, v5, v4
|
||||
; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s11, v3
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
|
||||
; GFX10-NEXT: v_mul_hi_u32 v11, s10, v3
|
||||
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v10
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v5, v1
|
||||
; GFX10-NEXT: v_add3_u32 v2, v7, v4, v2
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s13, v0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v7, s12, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s12, v0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v10, s12, v2
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v6, v8
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v1, v11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
|
||||
; GFX10-NEXT: v_mul_hi_u32 v3, s11, v3
|
||||
; GFX10-NEXT: v_add3_u32 v5, v5, v10, v7
|
||||
; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v13
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v4, v4, v8
|
||||
; GFX10-NEXT: v_add_nc_u32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v8, s9, v5
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v5, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v7
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s13, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_sub_co_u32 v11, vcc_lo, v7, s12
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v8, vcc_lo, s13, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v9, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v11
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s13, v13
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, -1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v6, s0, v1, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: v_add_co_u32 v5, s0, v0, 1
|
||||
; GFX10-NEXT: v_add_co_u32 v15, s0, v0, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s0, 0, v2, s0
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v15
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v13
|
||||
; GFX10-NEXT: v_add3_u32 v3, v4, v1, v3
|
||||
; GFX10-NEXT: v_mul_hi_u32 v18, s14, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v13, s15, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, v14, v10, s0
|
||||
; GFX10-NEXT: v_mul_lo_u32 v14, s15, v6
|
||||
; GFX10-NEXT: v_mul_lo_u32 v17, s14, v3
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v5, 1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
|
||||
; GFX10-NEXT: v_add_co_u32 v1, s0, v15, 1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v10
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, 0, v16, s0
|
||||
; GFX10-NEXT: v_sub_co_u32 v19, s0, v14, s12
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v5, s14, v6
|
||||
; GFX10-NEXT: v_sub_co_u32 v19, s0, v11, s12
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v15, v1, vcc_lo
|
||||
; GFX10-NEXT: v_mul_lo_u32 v15, s14, v6
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo
|
||||
; GFX10-NEXT: v_add3_u32 v13, v13, v17, v18
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
|
||||
; GFX10-NEXT: v_add3_u32 v14, v14, v17, v18
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v9
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v13
|
||||
; GFX10-NEXT: v_sub_co_u32 v11, s0, s10, v5
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v13, s0
|
||||
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s11, v14
|
||||
; GFX10-NEXT: v_sub_co_u32 v9, s0, s10, v15
|
||||
; GFX10-NEXT: v_sub_co_ci_u32_e64 v15, s1, s11, v14, s0
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v11
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v8
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s0
|
||||
; GFX10-NEXT: v_sub_co_u32 v13, s0, v11, s14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v19, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, -1, s2
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v10
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v9
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s15, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, v11, v19, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
|
||||
; GFX10-NEXT: v_sub_co_u32 v14, s0, v9, s14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s2
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v16, s2, 0, v2, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v15
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc_lo
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v8, s1
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, s1
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v13
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v11, s1
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s1
|
||||
; GFX10-NEXT: v_add_co_u32 v15, s1, v6, 1
|
||||
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s14, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s1
|
||||
; GFX10-NEXT: v_add_co_u32 v13, s1, v6, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v14
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v10, s1
|
||||
; GFX10-NEXT: v_add_co_u32 v10, s1, v15, 1
|
||||
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s15, v16
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s1
|
||||
; GFX10-NEXT: v_add_co_u32 v11, s1, v13, 1
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
|
||||
; GFX10-NEXT: v_sub_co_u32 v8, s1, v13, s14
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v10
|
||||
; GFX10-NEXT: v_sub_co_u32 v10, s1, v14, s14
|
||||
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, v15, v10, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v15, v17, v18, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v5
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v8, v13, v8, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, v14, v2, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v10, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v15, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v11, v8, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v13, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v11, v13, v11, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v13, v17, v18, s0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s1, 0, v7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v14, v10, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v10, v16, v2, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v11, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v13, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v7, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v10, s1
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5]
|
||||
; GFX10-NEXT: global_store_dwordx4 v12, v[4:7], s[6:7]
|
||||
|
|
|
@ -191,14 +191,11 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
|
|||
; CHECK-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s3
|
||||
; CHECK-NEXT: s_sub_u32 s4, 0, s2
|
||||
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0x4f800000, v2
|
||||
; CHECK-NEXT: s_and_b32 s5, s5, 1
|
||||
; CHECK-NEXT: s_subb_u32 s5, 0, s3
|
||||
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v0
|
||||
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; CHECK-NEXT: s_subb_u32 s5, 0, s3
|
||||
; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
|
||||
; CHECK-NEXT: v_trunc_f32_e32 v2, v2
|
||||
; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
|
||||
|
@ -1103,226 +1100,220 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
|
|||
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb
|
||||
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, s8
|
||||
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8
|
||||
; GISEL-NEXT: s_sub_u32 s6, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GISEL-NEXT: v_madmk_f32 v6, v4, 0x4f800000, v5
|
||||
; GISEL-NEXT: s_and_b32 s4, s4, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v6
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v4
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6
|
||||
; GISEL-NEXT: s_subb_u32 s7, 0, 0
|
||||
; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
|
||||
; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000
|
||||
; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
|
||||
; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5
|
||||
; GISEL-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7
|
||||
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
|
||||
; GISEL-NEXT: s_sub_u32 s9, 0, s8
|
||||
; GISEL-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v8, v8
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
|
||||
; GISEL-NEXT: s_and_b32 s4, s4, 1
|
||||
; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
|
||||
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, s6, v8
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
|
||||
; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
|
||||
; GISEL-NEXT: s_subb_u32 s10, 0, 0
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, s9, v9
|
||||
; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
|
||||
; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, s6, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, s7, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, s6, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, s9, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, s10, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v17, s9, v7
|
||||
; GISEL-NEXT: v_mov_b32_e32 v18, s4
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, v8, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v19, v6, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v9, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v7, v15
|
||||
; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v7, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, v6, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v8, v10
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7
|
||||
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
|
||||
; GISEL-NEXT: v_mov_b32_e32 v10, s4
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v8, v8
|
||||
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
|
||||
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
|
||||
; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8
|
||||
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13
|
||||
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
|
||||
; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16
|
||||
; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
|
||||
; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
|
||||
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v11
|
||||
; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v16, v12
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v16, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
|
||||
; GISEL-NEXT: v_mul_hi_u32 v17, v7, v11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v19, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
|
||||
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v10, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, s6, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, s7, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, s6, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
|
||||
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, s9, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, s10, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v15, s9, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, s6, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v8, v10
|
||||
; GISEL-NEXT: v_mul_hi_u32 v19, v6, v10
|
||||
; GISEL-NEXT: v_mul_hi_u32 v10, v8, v10
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, s9, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v9, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v7, v11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, v6, v12
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19
|
||||
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v6, v12
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v17, s[4:5], v17, v19
|
||||
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v14
|
||||
; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v15, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v15, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
|
||||
; GISEL-NEXT: v_mul_hi_u32 v16, v7, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v19, v11
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
|
||||
; GISEL-NEXT: v_mov_b32_e32 v19, s11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v12, v8, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
|
||||
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v12, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v10, v1, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v12, v0, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
|
||||
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, v3, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, v0, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, v1, v8
|
||||
; GISEL-NEXT: v_mul_hi_u32 v16, v0, v8
|
||||
; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v2, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, v3, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v2, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
|
||||
; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v14
|
||||
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v15, v6
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v12
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17
|
||||
; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v17, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, s8, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, 0, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
|
||||
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16
|
||||
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
|
||||
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
|
||||
; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19
|
||||
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15
|
||||
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
|
||||
; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
|
||||
; GISEL-NEXT: v_mov_b32_e32 v19, s11
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
|
||||
; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
|
||||
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
|
||||
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
|
||||
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8
|
||||
; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8
|
||||
; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
|
||||
; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9
|
||||
; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
|
||||
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
|
||||
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
|
||||
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
|
||||
; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7
|
||||
; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7
|
||||
; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
|
||||
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
|
||||
; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6
|
||||
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6
|
||||
; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
|
||||
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
|
||||
; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8
|
||||
; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v14, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
|
||||
; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v6, vcc
|
||||
; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
|
||||
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8
|
||||
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
|
||||
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
|
||||
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
|
||||
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
|
||||
; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
|
||||
; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7
|
||||
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v13
|
||||
; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v7, s[4:5]
|
||||
; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v7
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
|
||||
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14
|
||||
; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
|
||||
; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6
|
||||
; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[6:7]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7]
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7]
|
||||
; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7]
|
||||
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v6, v18, v7, vcc
|
||||
; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
|
||||
; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
|
||||
; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
|
||||
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
|
||||
|
|
|
@ -2460,11 +2460,8 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
|
|||
define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
||||
; GFX6-LABEL: s_usubsat_i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX6-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX6-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX6-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
|
@ -2478,11 +2475,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
;
|
||||
; GFX8-LABEL: s_usubsat_i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX8-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
|
@ -2496,11 +2490,8 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
;
|
||||
; GFX9-LABEL: s_usubsat_i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX9-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: s_subb_u32 s5, s1, s3
|
||||
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
|
@ -2515,10 +2506,7 @@ define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
|
|||
; GFX10-LABEL: s_usubsat_i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s4, s0, s2
|
||||
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[2:3]
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s5, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, s1, s3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s4, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s0
|
||||
|
@ -2685,21 +2673,15 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
|
|||
define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) {
|
||||
; GFX6-LABEL: s_usubsat_v2i64:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX6-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
||||
|
@ -2717,21 +2699,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
;
|
||||
; GFX8-LABEL: s_usubsat_v2i64:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX8-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX8-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX8-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
||||
|
@ -2749,21 +2725,15 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
;
|
||||
; GFX9-LABEL: s_usubsat_v2i64:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX9-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX9-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX9-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
|
||||
; GFX9-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
|
||||
|
@ -2782,23 +2752,17 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
; GFX10-LABEL: s_usubsat_v2i64:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_subb_u32 s1, s1, s5
|
||||
; GFX10-NEXT: s_sub_u32 s0, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: s_and_b32 s4, s4, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s4, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, s1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, 0, s4
|
||||
; GFX10-NEXT: s_subb_u32 s1, s3, s7
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s0, 0, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, s2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
|
@ -2809,28 +2773,19 @@ define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
|
|||
define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
||||
; GFX6-LABEL: s_usubsat_i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX6-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX6-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX6-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX6-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX6-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s8
|
||||
|
@ -2851,18 +2806,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX8-LABEL: s_usubsat_i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX8-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX8-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX8-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX8-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX8-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -2895,18 +2841,9 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX9-LABEL: s_usubsat_i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX9-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX9-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX9-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX9-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX9-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX9-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX9-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s6
|
||||
|
@ -2939,33 +2876,24 @@ define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
|
|||
; GFX10-LABEL: s_usubsat_i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s8, s0, s4
|
||||
; GFX10-NEXT: s_cselect_b32 s9, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_and_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s9, 0
|
||||
; GFX10-NEXT: s_subb_u32 s9, s1, s5
|
||||
; GFX10-NEXT: s_cselect_b32 s10, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s10, s10, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s10, 0
|
||||
; GFX10-NEXT: s_subb_u32 s10, s2, s6
|
||||
; GFX10-NEXT: s_cselect_b32 s11, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s11, s11, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s11, 0
|
||||
; GFX10-NEXT: s_subb_u32 s1, s3, s7
|
||||
; GFX10-NEXT: s_subb_u32 s11, s3, s7
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[6:7]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[6:7]
|
||||
; GFX10-NEXT: s_cselect_b32 s12, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s12
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s9, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s1, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s11, 0, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
|
@ -3319,61 +3247,43 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
|
|||
define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
|
||||
; GFX6-LABEL: s_usubsat_v2i128:
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX6-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX6-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX6-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s10
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s11
|
||||
; GFX6-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX6-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1]
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX6-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX6-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX6-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX6-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX6-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
|
||||
; GFX6-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX6-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s18
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s19
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
|
||||
; GFX6-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3]
|
||||
; GFX6-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s15
|
||||
; GFX6-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX6-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX6-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
|
||||
; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
|
||||
; GFX6-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX6-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX6-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
|
||||
; GFX6-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX6-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX6-NEXT: s_subb_u32 s3, s7, s15
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
|
@ -3398,18 +3308,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-LABEL: s_usubsat_v2i128:
|
||||
; GFX8: ; %bb.0:
|
||||
; GFX8-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX8-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX8-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX8-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX8-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX8-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX8-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s10
|
||||
|
@ -3422,28 +3323,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX8-NEXT: s_and_b32 s0, 1, s10
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX8-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX8-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX8-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX8-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX8-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX8-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX8-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s18
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s19
|
||||
; GFX8-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX8-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
|
||||
; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
|
||||
|
@ -3482,18 +3374,9 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-LABEL: s_usubsat_v2i128:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX9-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX9-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX9-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX9-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX9-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX9-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX9-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
||||
|
@ -3506,28 +3389,19 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX9-NEXT: s_and_b32 s0, 1, s10
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX9-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX9-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX9-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
|
||||
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GFX9-NEXT: s_sub_u32 s0, s4, s12
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 1
|
||||
; GFX9-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s18
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s19
|
||||
; GFX9-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GFX9-NEXT: s_subb_u32 s2, s6, s14
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s13
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc
|
||||
; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc
|
||||
|
@ -3566,69 +3440,51 @@ define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
|
|||
; GFX10-LABEL: s_usubsat_v2i128:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_sub_u32 s16, s0, s8
|
||||
; GFX10-NEXT: s_cselect_b32 s17, 1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_and_b32 s17, s17, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; GFX10-NEXT: s_subb_u32 s17, s1, s9
|
||||
; GFX10-NEXT: s_cselect_b32 s18, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s18, s18, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; GFX10-NEXT: s_subb_u32 s18, s2, s10
|
||||
; GFX10-NEXT: s_cselect_b32 s19, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s19, s19, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; GFX10-NEXT: s_subb_u32 s19, s3, s11
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s2, s[2:3], s[10:11]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11]
|
||||
; GFX10-NEXT: s_cselect_b32 s20, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s20
|
||||
; GFX10-NEXT: s_sub_u32 s8, s4, s12
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: s_sub_u32 s2, s4, s12
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13]
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: s_subb_u32 s3, s5, s13
|
||||
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s1, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[4:5], s[12:13]
|
||||
; GFX10-NEXT: s_subb_u32 s10, s6, s14
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s1, s[6:7], s[14:15]
|
||||
; GFX10-NEXT: s_subb_u32 s9, s7, s15
|
||||
; GFX10-NEXT: s_subb_u32 s1, s5, s13
|
||||
; GFX10-NEXT: s_subb_u32 s8, s6, s14
|
||||
; GFX10-NEXT: s_subb_u32 s3, s7, s15
|
||||
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15]
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15]
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s0, 1, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, 1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s16, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s17, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s18, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, s19, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v3
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s8, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s3, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s10, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s9, 0, vcc_lo
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s3, v3
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s4, v4
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s5, v5
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s6, v6
|
||||
; GFX10-NEXT: v_readfirstlane_b32 s7, v7
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
|
||||
ret <2 x i128> %result
|
||||
|
|
|
@ -190,9 +190,6 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
|
|||
; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
|
||||
; GCN-NEXT: s_not_b64 s[4:5], s[2:3]
|
||||
; GCN-NEXT: s_add_u32 s2, s2, s0
|
||||
; GCN-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GCN-NEXT: s_and_b32 s0, s0, 1
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GCN-NEXT: s_addc_u32 s3, s3, s1
|
||||
; GCN-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NEXT: s_mov_b32 s1, s5
|
||||
|
@ -203,11 +200,8 @@ define amdgpu_ps <2 x i64> @scalar_xnor_i64_mul_use(i64 inreg %a, i64 inreg %b)
|
|||
; GFX10-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
|
||||
; GFX10-NEXT: s_not_b64 s[4:5], s[2:3]
|
||||
; GFX10-NEXT: s_add_u32 s2, s2, s0
|
||||
; GFX10-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GFX10-NEXT: s_and_b32 s0, s0, 1
|
||||
; GFX10-NEXT: s_cmp_lg_u32 s0, 0
|
||||
; GFX10-NEXT: s_mov_b32 s0, s4
|
||||
; GFX10-NEXT: s_addc_u32 s3, s3, s1
|
||||
; GFX10-NEXT: s_mov_b32 s0, s4
|
||||
; GFX10-NEXT: s_mov_b32 s1, s5
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
%xor = xor i64 %a, %b
|
||||
|
|
|
@ -1616,9 +1616,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
|
||||
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1635,9 +1632,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
|
||||
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1710,9 +1704,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
|
||||
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1729,9 +1720,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
|
||||
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1804,9 +1792,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
|
||||
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1823,9 +1808,6 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
|
|||
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
|
||||
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1902,9 +1884,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
|
|||
; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
|
||||
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX8-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX8-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX8-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
@ -1922,9 +1901,6 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
|
|||
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
|
||||
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
|
||||
; GFX10-GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GFX10-GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GFX10-GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
|
||||
|
|
|
@ -283,14 +283,8 @@ define amdgpu_ps i64 @s_csh_64_0(i64 inreg %a, i64 inreg %b) {
|
|||
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
|
||||
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
|
||||
; GISEL-NEXT: s_add_u32 s2, s4, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s3, s3, 1
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GISEL-NEXT: s_addc_u32 s3, s5, s7
|
||||
; GISEL-NEXT: s_add_u32 s0, s2, s0
|
||||
; GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GISEL-NEXT: s_addc_u32 s1, s3, s1
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%and = and i64 %b, 63
|
||||
|
@ -322,14 +316,8 @@ define amdgpu_ps i64 @s_csh_64_1(i64 inreg %a, i64 inreg %b) {
|
|||
; GISEL-NEXT: s_lshr_b64 s[6:7], s[0:1], s2
|
||||
; GISEL-NEXT: s_ashr_i64 s[0:1], s[0:1], s2
|
||||
; GISEL-NEXT: s_add_u32 s2, s4, s6
|
||||
; GISEL-NEXT: s_cselect_b32 s3, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s3, s3, 1
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s3, 0
|
||||
; GISEL-NEXT: s_addc_u32 s3, s5, s7
|
||||
; GISEL-NEXT: s_add_u32 s0, s2, s0
|
||||
; GISEL-NEXT: s_cselect_b32 s2, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s2, s2, 1
|
||||
; GISEL-NEXT: s_cmp_lg_u32 s2, 0
|
||||
; GISEL-NEXT: s_addc_u32 s1, s3, s1
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%and = and i64 %b, 255
|
||||
|
|
|
@ -1972,3 +1972,24 @@ TEST_F(AMDGPUGISelMITest, TestKnownBitsAssertAlign) {
|
|||
CheckBits(30, Copies.size() - 2);
|
||||
CheckBits(5, Copies.size() - 1);
|
||||
}
|
||||
|
||||
// Verifies known-bits analysis on the overflow (second) result of a scalar
// G_UADDO: the s32 overflow value is copied and the copy is queried. The test
// expects no bit to be known-one and the upper 31 bits to be known-zero, i.e.
// the overflow flag behaves as a 0/1 boolean held in a 32-bit register.
TEST_F(AArch64GISelMITest, TestKnownBitsUADDO) {
|
||||
StringRef MIRString = R"(
|
||||
%ptr:_(p0) = G_IMPLICIT_DEF
|
||||
%ld0:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
|
||||
%ld1:_(s32) = G_LOAD %ptr(p0) :: (load (s16))
|
||||
|
||||
%add:_(s32), %overflow:_(s32) = G_UADDO %ld0, %ld1
|
||||
%copy_overflow:_(s32) = COPY %overflow
|
||||
)";
|
||||
|
||||
setUp(MIRString);
|
||||
// NOTE(review): a null TM presumably means the AArch64 target is not
// available in this build, so the test exits early without checking anything
// - confirm against the fixture.
if (!TM)
|
||||
return;
|
||||
|
||||
// The last entry of Copies is presumably the register defined by the final
// COPY in the MIR above (%copy_overflow) - confirm against the fixture.
Register CopyOverflow = Copies[Copies.size() - 1];
|
||||
GISelKnownBits Info(*MF);
|
||||
KnownBits Res = Info.getKnownBits(CopyOverflow);
|
||||
// No bit of the overflow value may be reported as known-one.
EXPECT_EQ(0u, Res.One.getZExtValue());
|
||||
// Exactly 31 leading known-zero bits: everything above bit 0 is zero, and
// bit 0 itself (the actual overflow flag) stays unknown.
EXPECT_EQ(31u, Res.Zero.countLeadingOnes());
|
||||
}
|
||||
|
|
|
@ -1527,3 +1527,24 @@ TEST_F(AArch64GISelMITest, TestKnownBitsVectorAssertZext) {
|
|||
EXPECT_EQ(0u, Res.One.getZExtValue());
|
||||
EXPECT_EQ(0xFFFFFFFFFFFFFFF8u, Res.Zero.getZExtValue());
|
||||
}
|
||||
|
||||
// Verifies computeNumSignBits on the overflow (second) result of a vector
// G_UADDO: two s64 values are packed into a <2 x s64> vector, added to itself
// with overflow detection, and the <2 x s32> overflow result is copied and
// queried. The test expects all 32 bits of each element to be sign bits.
TEST_F(AArch64GISelMITest, TestNumSignBitsUAddoOverflow) {
|
||||
StringRef MIRString = R"(
|
||||
%copy_x0:_(s64) = COPY $x0
|
||||
%copy_x1:_(s64) = COPY $x1
|
||||
%x0_x1:_(<2 x s64>) = G_BUILD_VECTOR %copy_x0, %copy_x1
|
||||
%uaddo:_(<2 x s64>), %overflow:_(<2 x s32>) = G_UADDO %x0_x1, %x0_x1
|
||||
%result:_(<2 x s32>) = COPY %overflow
|
||||
)";
|
||||
|
||||
setUp(MIRString);
|
||||
// NOTE(review): a null TM presumably means the AArch64 target is not
// available in this build, so the test exits early without checking anything
// - confirm against the fixture.
if (!TM)
|
||||
return;
|
||||
|
||||
// The last entry of Copies is presumably the register defined by the final
// COPY in the MIR above (%result) - confirm against the fixture.
Register CopyOverflow = Copies[Copies.size() - 1];
|
||||
|
||||
GISelKnownBits Info(*MF);
|
||||
|
||||
// The overflow result is a vector boolean. Expecting the full element width
// (32) as the sign-bit count means every bit is a copy of the sign bit, i.e.
// the value is treated as sign-extended 0 / -1 per element.
|
||||
EXPECT_EQ(32u, Info.computeNumSignBits(CopyOverflow));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue