[AMDGPU][MC][GFX10][GFX90A] Corrected _e32/_e64 suffixes

Fixed bugs https://bugs.llvm.org/show_bug.cgi?id=49643, https://bugs.llvm.org/show_bug.cgi?id=49644, https://bugs.llvm.org/show_bug.cgi?id=49645.

Differential Revision: https://reviews.llvm.org/D99413
This commit is contained in:
Dmitry Preobrazhensky 2021-04-01 14:21:00 +03:00
parent abbe80fa52
commit cd953434f2
66 changed files with 2039 additions and 1754 deletions

View File

@ -1136,14 +1136,18 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
multiclass VOP3Only_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
VOP3e_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let IsSingle = 1;
}
}
//===---------------------------- VOP3beOnly ----------------------------===//
multiclass VOP3beOnly_Real_gfx10<bits<10> op> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX10>,
VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
VOP3be_gfx10<op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let IsSingle = 1;
}
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
@ -1191,7 +1195,10 @@ defm V_FMAAK_F16 : VOP2Only_Real_MADK_gfx10<0x038>;
defm V_MAX_F16 : VOP2_Real_gfx10<0x039>;
defm V_MIN_F16 : VOP2_Real_gfx10<0x03a>;
defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@ -1684,7 +1691,9 @@ let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
let SubtargetPredicate = isGFX90APlus in {
defm V_FMAC_F64 : VOP2_Real_e32e64_gfx90a <0x4>;
defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
let IsSingle = 1 in {
defm V_MUL_LEGACY_F32 : VOP2_Real_e64_gfx90a <0x2a1>;
}
} // End SubtargetPredicate = isGFX90APlus
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {

View File

@ -811,6 +811,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<op, !cast<VOP3_Pseudo>(opName#"_e64").Pfl> {
VOP3_Pseudo ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands;
let IsSingle = 1;
}
}
multiclass VOP3be_Real_gfx10<bits<10> op> {

View File

@ -32,7 +32,7 @@ define i8 @v_ashr_i8(i8 %value, i8 %amount) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_ashrrev_i16_e64 v0, v1, v0
; GFX10-NEXT: v_ashrrev_i16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = ashr i8 %value, %amount
ret i8 %result
@ -65,7 +65,7 @@ define i8 @v_ashr_i8_7(i8 %value) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 7, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 7, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = ashr i8 %value, 7
ret i8 %result
@ -595,7 +595,7 @@ define i16 @v_ashr_i16(i16 %value, i16 %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_ashrrev_i16_e64 v0, v1, v0
; GFX10-NEXT: v_ashrrev_i16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = ashr i16 %value, %amount
ret i16 %result
@ -684,7 +684,7 @@ define amdgpu_ps half @ashr_i16_sv(i16 inreg %value, i16 %amount) {
;
; GFX10-LABEL: ashr_i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_ashrrev_i16_e64 v0, v0, s0
; GFX10-NEXT: v_ashrrev_i16 v0, v0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = ashr i16 %value, %amount
%cast = bitcast i16 %result to half
@ -711,7 +711,7 @@ define amdgpu_ps half @ashr_i16_vs(i16 %value, i16 inreg %amount) {
;
; GFX10-LABEL: ashr_i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_ashrrev_i16_e64 v0, s0, v0
; GFX10-NEXT: v_ashrrev_i16 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = ashr i16 %value, %amount
%cast = bitcast i16 %result to half

View File

@ -11,7 +11,7 @@ declare i64 @llvm.smin.i64(i64, i64)
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_mov_b32_e32 [[B]], 0x7fff
; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
define i16 @v_clamp_i64_i16(i64 %in) #0 {
@ -28,7 +28,7 @@ entry:
; GFX6789: v_mov_b32_e32 [[B]], 0xffff8000
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x7fff
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_mov_b32_e32 [[B]], 0x7fff
; GFX10: v_med3_i32 [[A]], 0xffff8000, [[A]], [[B]]
define i16 @v_clamp_i64_i16_reverse(i64 %in) #0 {
@ -72,7 +72,7 @@ entry:
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_mov_b32_e32 [[B]], 0x100
; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
define i16 @v_clamp_i64_i16_lower_than_short(i64 %in) #0 {
@ -89,7 +89,7 @@ entry:
; GFX6789: v_mov_b32_e32 [[B]], 0xffffff01
; GFX6789: v_mov_b32_e32 [[C:v[0-9]+]], 0x100
; GFX6789: v_med3_i32 [[A]], [[B]], [[A]], [[C]]
; GFX10: v_cvt_pk_i16_i32_e64 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_cvt_pk_i16_i32 [[A:v[0-9]+]], [[A]], [[B:v[0-9]+]]
; GFX10: v_mov_b32_e32 [[B]], 0x100
; GFX10: v_med3_i32 [[A]], 0xffffff01, [[A]], [[B]]
define i16 @v_clamp_i64_i16_lower_than_short_reverse(i64 %in) #0 {

View File

@ -4175,7 +4175,7 @@ define i32 @v_extract_v64i32_37(<64 x i32> addrspace(1)* %ptr) {
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: v_mov_b32_e32 v2, s4
; GFX10-NEXT: v_mov_b32_e32 v3, s5
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -127,11 +127,11 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_nc_u16_e64 v1, 6, v0
; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0
; GFX10-NEXT: v_and_b32_e32 v0, s3, v0
; GFX10-NEXT: v_and_b32_e32 v1, s3, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v1, s1
; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0
; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
@ -245,7 +245,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_mul_lo_u32 v4, s4, v3
@ -261,11 +261,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f
; GFX10-NEXT: v_sub_nc_u16_e64 v4, 6, v2
; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2
; GFX10-NEXT: v_and_b32_e32 v2, v2, v3
; GFX10-NEXT: v_and_b32_e32 v3, v4, v3
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.fshl.i7(i7 %lhs, i7 %rhs, i7 %amt)
@ -371,9 +371,9 @@ define i8 @v_fshl_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 %amt)
@ -452,8 +452,8 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 4)
@ -532,8 +532,8 @@ define i8 @v_fshl_i8_5(i8 %lhs, i8 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 5, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 3, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 5, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 3, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshl.i8(i8 %lhs, i8 %rhs, i8 5)
@ -756,13 +756,13 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
; GFX10-NEXT: v_lshrrev_b16_e64 v4, 1, v4
; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v7, 7, v7
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v3, v3, v5
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v4, v6, v4
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v7, v1
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4
; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
; GFX10-NEXT: v_or_b32_e32 v2, v3, v4
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@ -1163,7 +1163,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v11, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8
; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1
@ -1174,25 +1174,25 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_and_b32_e32 v12, s4, v1
; GFX10-NEXT: v_and_b32_e32 v6, s4, v6
; GFX10-NEXT: v_and_b32_sdwa v1, v1, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_lshlrev_b16_e64 v3, v8, v3
; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_and_b32_e32 v9, 7, v9
; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
; GFX10-NEXT: v_lshrrev_b16_e64 v6, 1, v6
; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6
; GFX10-NEXT: v_and_b32_e32 v8, 7, v8
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v13, 7, v13
; GFX10-NEXT: v_lshrrev_b16_e64 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b16_e64 v6, v11, v6
; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, v5
; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7
; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6
; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5
; GFX10-NEXT: v_and_b32_e32 v10, 7, v10
; GFX10-NEXT: v_lshrrev_b16_e64 v12, 1, v12
; GFX10-NEXT: v_lshrrev_b16_e64 v5, v13, v7
; GFX10-NEXT: v_lshlrev_b16_e64 v4, v9, v4
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v8, v1
; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12
; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7
; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4
; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1
; GFX10-NEXT: v_or_b32_e32 v3, v3, v6
; GFX10-NEXT: v_lshrrev_b16_e64 v7, v10, v12
; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12
; GFX10-NEXT: v_or_b32_e32 v2, v2, v5
; GFX10-NEXT: v_mov_b32_e32 v6, 8
; GFX10-NEXT: v_or_b32_e32 v1, v4, v1
@ -2980,10 +2980,10 @@ define i16 @v_fshl_i16(i16 %lhs, i16 %rhs, i16 %amt) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v3, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
@ -3020,8 +3020,8 @@ define i16 @v_fshl_i16_4(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 12, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 12, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 4)
@ -3058,8 +3058,8 @@ define i16 @v_fshl_i16_5(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 5, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 11, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 5, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 11, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 5)
@ -3115,8 +3115,8 @@ define amdgpu_ps half @v_fshl_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
; GFX10-NEXT: s_lshr_b32 s1, s1, s2
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v1, s1
; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0
; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.fshl.i16(i16 %lhs, i16 %rhs, i16 %amt)
@ -3162,11 +3162,11 @@ define amdgpu_ps half @v_fshl_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
;
; GFX10-LABEL: v_fshl_i16_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshrrev_b16 v0, 1, v0
; GFX10-NEXT: s_andn2_b32 s2, 15, s1
; GFX10-NEXT: s_and_b32 s1, s1, 15
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX10-NEXT: v_lshrrev_b16_e64 v0, s2, v0
; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
@ -3221,7 +3221,7 @@ define amdgpu_ps half @v_fshl_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
; GFX10-NEXT: s_andn2_b32 s1, 15, s1
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000
; GFX10-NEXT: v_lshlrev_b16_e64 v0, s2, v0
; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshr_b32 s0, s0, s3
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX10-NEXT: s_lshr_b32 s0, s0, s1

View File

@ -124,11 +124,11 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_nc_u16_e64 v1, 6, v0
; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0
; GFX10-NEXT: v_and_b32_e32 v0, s3, v0
; GFX10-NEXT: v_and_b32_e32 v1, s3, v1
; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s1
; GFX10-NEXT: v_lshlrev_b16_e64 v1, v1, s0
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1
; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
@ -240,7 +240,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 7
; GFX10-NEXT: s_sub_i32 s4, 0, 7
; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
@ -258,11 +258,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f
; GFX10-NEXT: v_sub_nc_u16_e64 v4, 6, v2
; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2
; GFX10-NEXT: v_and_b32_e32 v2, v2, v3
; GFX10-NEXT: v_and_b32_e32 v7, v4, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v7, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v7, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt)
@ -365,10 +365,10 @@ define i8 @v_fshr_i8(i8 %lhs, i8 %rhs, i8 %amt) {
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 %amt)
@ -447,8 +447,8 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 4, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 4)
@ -527,8 +527,8 @@ define i8 @v_fshr_i8_5(i8 %lhs, i8 %rhs) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 3, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 5, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 3, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 5, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.fshr.i8(i8 %lhs, i8 %rhs, i8 5)
@ -749,15 +749,15 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX10-NEXT: v_and_b32_e32 v3, 7, v3
; GFX10-NEXT: v_and_b32_e32 v5, s4, v5
; GFX10-NEXT: v_lshlrev_b16_e64 v4, 1, v4
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_lshrrev_b16_e64 v3, v3, v5
; GFX10-NEXT: v_lshlrev_b16_e64 v4, v6, v4
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v7, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v2, v0
; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5
; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4
; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0
; GFX10-NEXT: v_or_b32_e32 v2, v4, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@ -1164,14 +1164,14 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-NEXT: v_and_b32_e32 v15, 7, v8
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v14, 7, v11
; GFX10-NEXT: v_lshlrev_b16_e64 v3, 1, v3
; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_mov_b32_e32 v15, 0xff
; GFX10-NEXT: v_lshlrev_b16_e64 v3, v14, v3
; GFX10-NEXT: v_lshlrev_b16 v3, v14, v3
; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1
@ -1181,17 +1181,17 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
; GFX10-NEXT: v_and_b32_e32 v6, 7, v6
; GFX10-NEXT: v_and_b32_e32 v7, s4, v7
; GFX10-NEXT: v_and_b32_e32 v11, 7, v11
; GFX10-NEXT: v_lshlrev_b16_e64 v4, 1, v4
; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4
; GFX10-NEXT: v_and_b32_e32 v15, 7, v14
; GFX10-NEXT: v_lshlrev_b16_e64 v5, 1, v5
; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5
; GFX10-NEXT: v_and_b32_e32 v12, 7, v12
; GFX10-NEXT: v_and_b32_e32 v2, 7, v2
; GFX10-NEXT: v_lshrrev_b16_e64 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16_e64 v4, v11, v4
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v5, v15, v5
; GFX10-NEXT: v_lshrrev_b16_e64 v7, v12, v9
; GFX10-NEXT: v_lshrrev_b16_e64 v2, v2, v8
; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4
; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v5, v15, v5
; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8
; GFX10-NEXT: v_or_b32_e32 v3, v3, v6
; GFX10-NEXT: v_or_b32_e32 v1, v4, v1
; GFX10-NEXT: v_mov_b32_e32 v6, 8
@ -2814,10 +2814,10 @@ define i16 @v_fshr_i16(i16 %lhs, i16 %rhs, i16 %amt) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
@ -2854,8 +2854,8 @@ define i16 @v_fshr_i16_4(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 12, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 4, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 4, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 4)
@ -2892,8 +2892,8 @@ define i16 @v_fshr_i16_5(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 11, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v1, 5, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 11, v0
; GFX10-NEXT: v_lshrrev_b16 v1, 5, v1
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 5)
@ -2946,8 +2946,8 @@ define amdgpu_ps half @v_fshr_i16_ssv(i16 inreg %lhs, i16 inreg %rhs, i16 %amt)
; GFX10-NEXT: s_bfe_u32 s2, 1, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: v_and_b32_e32 v1, 15, v1
; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s1
; GFX10-NEXT: v_lshlrev_b16_e64 v1, v1, s0
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1
; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0
; GFX10-NEXT: v_or_b32_e32 v0, v1, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.fshr.i16(i16 %lhs, i16 %rhs, i16 %amt)
@ -2998,7 +2998,7 @@ define amdgpu_ps half @v_fshr_i16_svs(i16 inreg %lhs, i16 %rhs, i16 inreg %amt)
; GFX10-NEXT: s_and_b32 s2, s1, 15
; GFX10-NEXT: s_andn2_b32 s1, 15, s1
; GFX10-NEXT: s_bfe_u32 s3, 1, 0x100000
; GFX10-NEXT: v_lshrrev_b16_e64 v0, s2, v0
; GFX10-NEXT: v_lshrrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s1
@ -3049,12 +3049,12 @@ define amdgpu_ps half @v_fshr_i16_vss(i16 %lhs, i16 inreg %rhs, i16 inreg %amt)
;
; GFX10-LABEL: v_fshr_i16_vss:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: s_andn2_b32 s2, 15, s1
; GFX10-NEXT: s_and_b32 s1, s1, 15
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX10-NEXT: v_lshlrev_b16_e64 v0, s2, v0
; GFX10-NEXT: v_lshlrev_b16 v0, s2, v0
; GFX10-NEXT: s_lshr_b32 s0, s0, s1
; GFX10-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog

View File

@ -83,13 +83,13 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GFX10-NEXT: global_load_dwordx4 v[40:43], v70, s[0:1] offset:32
; GFX10-NEXT: global_load_dwordx4 v[44:47], v70, s[0:1] offset:48
; GFX10-NEXT: global_load_dwordx4 v[48:51], v70, s[0:1] offset:64
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v5, v70
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, v70
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v64, vcc_lo, v0, 64
; GFX10-NEXT: v_add_co_u32 v64, vcc_lo, v0, 64
; GFX10-NEXT: v_add_co_ci_u32_e32 v65, vcc_lo, 0, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v66, vcc_lo, v0, v1
; GFX10-NEXT: v_add_co_u32 v66, vcc_lo, v0, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v67, vcc_lo, v5, v2, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v68, vcc_lo, v0, v3
; GFX10-NEXT: v_add_co_u32 v68, vcc_lo, v0, v3
; GFX10-NEXT: v_add_co_ci_u32_e32 v69, vcc_lo, v5, v4, vcc_lo
; GFX10-NEXT: s_clause 0xa
; GFX10-NEXT: global_load_dwordx4 v[52:55], v[64:65], off offset:16

View File

@ -1433,14 +1433,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v3
@ -1505,10 +1505,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 20
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GFX10-NEXT: s_endpgm
@ -1830,14 +1830,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v8, s3
; GFX10-NEXT: v_mov_b32_e32 v7, s2
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v7, v4
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v7, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@ -1905,11 +1905,11 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 40
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX10-NEXT: s_endpgm

View File

@ -23,7 +23,7 @@ define i32 @global_atomic_csub_offset(i32 addrspace(1)* %ptr, i32 %data) {
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -54,7 +54,7 @@ define void @global_atomic_csub_offset_nortn(i32 addrspace(1)* %ptr, i32 %data)
; GCN-NEXT: s_mov_b32 s5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s4
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v3
; GCN-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GCN-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo
; GCN-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GCN-NEXT: s_waitcnt vmcnt(0)

View File

@ -32,7 +32,7 @@ define i8 @v_lshr_i8(i8 %value, i8 %amount) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v0, v1, v0
; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i8 %value, %amount
ret i8 %result
@ -65,7 +65,7 @@ define i8 @v_lshr_i8_7(i8 %value) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 7, v0
; GFX10-NEXT: v_lshrrev_b16 v0, 7, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i8 %value, 7
ret i8 %result
@ -133,6 +133,16 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_lshr_i24:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i24 %value, %amount
ret i24 %result
}
@ -594,7 +604,7 @@ define i16 @v_lshr_i16(i16 %value, i16 %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b16_e64 v0, v1, v0
; GFX10-NEXT: v_lshrrev_b16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i16 %value, %amount
ret i16 %result
@ -688,7 +698,7 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
;
; GFX10-LABEL: lshr_i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshrrev_b16_e64 v0, v0, s0
; GFX10-NEXT: v_lshrrev_b16 v0, v0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = lshr i16 %value, %amount
%cast = bitcast i16 %result to half
@ -716,7 +726,7 @@ define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
;
; GFX10-LABEL: lshr_i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshrrev_b16_e64 v0, s0, v0
; GFX10-NEXT: v_lshrrev_b16 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = lshr i16 %value, %amount
%cast = bitcast i16 %result to half

View File

@ -63,7 +63,7 @@ define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
ret i16 %result
@ -130,7 +130,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
@ -203,7 +203,7 @@ define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u16_e64 v0, v0, v1
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i16 %num, %den
@ -570,6 +570,29 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v6, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v4
; GFX10-NEXT: v_mul_hi_u32 v8, v0, v3
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
; GFX10-NEXT: v_mul_lo_u32 v5, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
; GFX10-NEXT: v_add_co_u32 v6, s4, v6, v7
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v9
; GFX10-NEXT: v_add_co_u32 v1, s4, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v2, v2, v5, v7
; GFX10-NEXT: v_add_nc_u32_e32 v3, v11, v6
; GFX10-NEXT: v_add3_u32 v2, v2, v4, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i96 %num, %den
ret i96 %result
}
@ -951,6 +974,50 @@ define i128 @v_mul_i128(i128 %num, i128 %den) {
; GFX9-NEXT: v_mov_b32_e32 v1, v9
; GFX9-NEXT: v_mov_b32_e32 v2, v10
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v8, v2, v4
; GFX10-NEXT: v_mul_lo_u32 v9, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v10, v1, v4
; GFX10-NEXT: v_mul_lo_u32 v11, v0, v5
; GFX10-NEXT: v_mul_hi_u32 v12, v0, v4
; GFX10-NEXT: v_mul_lo_u32 v13, v0, v6
; GFX10-NEXT: v_mul_hi_u32 v15, v0, v5
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v0, v7
; GFX10-NEXT: v_add_co_u32 v8, s4, v8, v9
; GFX10-NEXT: v_add_co_u32 v9, s5, v10, v11
; GFX10-NEXT: v_mul_hi_u32 v11, v1, v4
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v13, s4, v8, v13
; GFX10-NEXT: v_add_co_u32 v8, s5, v9, v12
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v18, s4, v13, v11
; GFX10-NEXT: v_cndmask_b32_e64 v13, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v9, v10, v9
; GFX10-NEXT: v_mul_lo_u32 v10, v2, v5
; GFX10-NEXT: v_add_co_u32 v11, s4, v18, v15
; GFX10-NEXT: v_mul_hi_u32 v15, v2, v4
; GFX10-NEXT: v_add3_u32 v12, v14, v12, v13
; GFX10-NEXT: v_mul_lo_u32 v13, v1, v6
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v5
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v2, s4, v11, v9
; GFX10-NEXT: v_add_nc_u32_e32 v10, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v0, v6
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v10, v10, v13, v7
; GFX10-NEXT: v_add3_u32 v4, v12, v14, v5
; GFX10-NEXT: v_add3_u32 v1, v10, v15, v1
; GFX10-NEXT: v_add3_u32 v3, v1, v6, v4
; GFX10-NEXT: v_mov_b32_e32 v1, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i128 %num, %den
ret i128 %result
}
@ -2674,6 +2741,204 @@ define i256 @v_mul_i256(i256 %num, i256 %den) {
; GFX9-NEXT: v_mov_b32_e32 v5, v20
; GFX9-NEXT: v_mov_b32_e32 v6, v21
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i256:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mul_lo_u32 v16, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v17, v0, v9
; GFX10-NEXT: v_mul_hi_u32 v18, v0, v8
; GFX10-NEXT: v_mul_lo_u32 v19, v2, v8
; GFX10-NEXT: v_mul_lo_u32 v20, v1, v9
; GFX10-NEXT: v_mul_hi_u32 v21, v1, v8
; GFX10-NEXT: v_mul_lo_u32 v22, v3, v8
; GFX10-NEXT: v_mul_lo_u32 v25, v1, v10
; GFX10-NEXT: v_mul_hi_u32 v23, v0, v9
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v17
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v10
; GFX10-NEXT: v_cndmask_b32_e64 v17, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, v8
; GFX10-NEXT: v_mul_lo_u32 v15, v0, v15
; GFX10-NEXT: v_add_co_u32 v16, s4, v16, v18
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s4, v19, v20
; GFX10-NEXT: v_mul_lo_u32 v20, v2, v9
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_nc_u32_e32 v17, v17, v18
; GFX10-NEXT: v_mul_lo_u32 v18, v0, v10
; GFX10-NEXT: v_add_co_u32 v18, s4, v19, v18
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v22, v20
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v11
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v21
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v25
; GFX10-NEXT: v_add_co_u32 v18, s5, v18, v23
; GFX10-NEXT: v_mul_hi_u32 v23, v1, v9
; GFX10-NEXT: v_add3_u32 v19, v24, v19, v21
; GFX10-NEXT: v_mul_hi_u32 v21, v2, v8
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v22
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v17, s5, v18, v17
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v21
; GFX10-NEXT: v_add3_u32 v21, v26, v24, v25
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v8
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s4, v20, v23
; GFX10-NEXT: v_add3_u32 v18, v19, v29, v18
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v3, v9
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add3_u32 v30, v21, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v21, v2, v10
; GFX10-NEXT: v_add_co_u32 v22, s4, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v11
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v26, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v23, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v21, s4, v22, v21
; GFX10-NEXT: v_mul_lo_u32 v22, v0, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v18, s4, v20, v18
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v9
; GFX10-NEXT: v_add3_u32 v24, v25, v27, v24
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v19, v30, v23, v20
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v26
; GFX10-NEXT: v_mul_hi_u32 v20, v1, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v27, v0, v11
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_mul_lo_u32 v22, v5, v8
; GFX10-NEXT: v_add3_u32 v23, v24, v25, v26
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v9
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s4, v21, v20
; GFX10-NEXT: v_mul_lo_u32 v26, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v20, s5, v20, v27
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v24
; GFX10-NEXT: v_add3_u32 v35, v23, v30, v21
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_mul_lo_u32 v26, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v19, s5, v20, v19
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_mul_lo_u32 v23, v0, v13
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v22, s4, v31, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v4, v8
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v20, v35, v25, v20
; GFX10-NEXT: v_add_co_u32 v31, s4, v22, v23
; GFX10-NEXT: v_add3_u32 v23, v34, v27, v28
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v22, v2, v10
; GFX10-NEXT: v_mul_lo_u32 v28, v5, v9
; GFX10-NEXT: v_add_co_u32 v27, s4, v31, v26
; GFX10-NEXT: v_mul_hi_u32 v26, v1, v11
; GFX10-NEXT: v_add3_u32 v23, v23, v30, v24
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v21, s4, v27, v29
; GFX10-NEXT: v_mul_lo_u32 v27, v6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v29, v0, v12
; GFX10-NEXT: v_add_co_u32 v21, s4, v21, v22
; GFX10-NEXT: v_add3_u32 v23, v23, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v4, v10
; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v25, s4, v27, v28
; GFX10-NEXT: v_add_co_u32 v31, s5, v21, v26
; GFX10-NEXT: v_mul_lo_u32 v27, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v25, v24
; GFX10-NEXT: v_add_co_u32 v21, s5, v31, v29
; GFX10-NEXT: v_add3_u32 v39, v23, v33, v26
; GFX10-NEXT: v_mul_lo_u32 v23, v2, v12
; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v24, s4, v24, v27
; GFX10-NEXT: v_mul_lo_u32 v27, v1, v13
; GFX10-NEXT: v_cndmask_b32_e64 v29, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v31, v4, v9
; GFX10-NEXT: v_mul_hi_u32 v25, v3, v10
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s5
; GFX10-NEXT: v_add_co_u32 v34, s4, v24, v23
; GFX10-NEXT: v_mul_lo_u32 v24, v0, v14
; GFX10-NEXT: v_cndmask_b32_e64 v30, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v35, v28, v35, v29
; GFX10-NEXT: v_add_co_u32 v20, s5, v21, v20
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v27
; GFX10-NEXT: v_mul_hi_u32 v27, v5, v8
; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, 1, s5
; GFX10-NEXT: v_mul_lo_u32 v29, v3, v12
; GFX10-NEXT: v_add_co_u32 v34, s4, v23, v24
; GFX10-NEXT: v_mul_hi_u32 v3, v3, v11
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v35, v30, v32
; GFX10-NEXT: v_add3_u32 v21, v39, v26, v21
; GFX10-NEXT: v_add_co_u32 v34, s4, v34, v27
; GFX10-NEXT: v_mul_hi_u32 v26, v2, v11
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v34, v31
; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v22, v22, v28, v27
; GFX10-NEXT: v_mul_lo_u32 v28, v6, v9
; GFX10-NEXT: v_add_co_u32 v23, s4, v23, v25
; GFX10-NEXT: v_mul_hi_u32 v27, v1, v12
; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v6, v6, v8
; GFX10-NEXT: v_add_co_u32 v30, s4, v23, v26
; GFX10-NEXT: v_add3_u32 v33, v22, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10
; GFX10-NEXT: v_mul_lo_u32 v25, v4, v11
; GFX10-NEXT: v_add_nc_u32_e32 v7, v7, v28
; GFX10-NEXT: v_mul_lo_u32 v28, v2, v13
; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v23, s4, v30, v27
; GFX10-NEXT: v_mul_hi_u32 v5, v5, v9
; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4
; GFX10-NEXT: v_mul_hi_u32 v4, v4, v10
; GFX10-NEXT: v_add3_u32 v7, v7, v24, v25
; GFX10-NEXT: v_mul_lo_u32 v24, v1, v14
; GFX10-NEXT: v_mul_hi_u32 v25, v0, v13
; GFX10-NEXT: v_add3_u32 v33, v33, v26, v27
; GFX10-NEXT: v_mul_hi_u32 v2, v2, v12
; GFX10-NEXT: v_add3_u32 v26, v7, v29, v28
; GFX10-NEXT: v_mul_hi_u32 v1, v1, v13
; GFX10-NEXT: v_add3_u32 v7, v26, v24, v15
; GFX10-NEXT: v_add_co_u32 v11, s4, v23, v25
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v5, v7, v6, v5
; GFX10-NEXT: v_add_co_u32 v6, s4, v11, v21
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
; GFX10-NEXT: v_add3_u32 v3, v5, v4, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v14
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v5, v33, v10, v7
; GFX10-NEXT: v_add3_u32 v3, v3, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v16
; GFX10-NEXT: v_mov_b32_e32 v2, v17
; GFX10-NEXT: v_add3_u32 v7, v3, v4, v5
; GFX10-NEXT: v_mov_b32_e32 v3, v18
; GFX10-NEXT: v_mov_b32_e32 v4, v19
; GFX10-NEXT: v_mov_b32_e32 v5, v20
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = mul i256 %num, %den
ret i256 %result
}

View File

@ -48,10 +48,10 @@ define i7 @v_saddsat_i7(i7 %lhs, i7 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@ -111,7 +111,7 @@ define amdgpu_ps i7 @s_saddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i7 @llvm.sadd.sat.i7(i7 %lhs, i7 %rhs)
@ -162,10 +162,10 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -225,7 +225,7 @@ define amdgpu_ps i8 @s_saddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_add_nc_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
@ -4199,12 +4199,12 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
@ -4377,12 +4377,12 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
;
; GFX10-LABEL: saddsat_i64_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
; GFX10-NEXT: v_add_co_u32_e64 v0, s1, v4, 0
; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@ -4444,12 +4444,12 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
;
; GFX10-LABEL: saddsat_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v4, 0
; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@ -4548,18 +4548,18 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v17, v2
; GFX10-NEXT: v_mov_b32_e32 v18, v3
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_add_co_u32_e64 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_add_co_u32 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
; GFX10-NEXT: v_add_co_u32_e64 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v0, 0
; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
@ -5320,7 +5320,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
;
; GFX10-LABEL: saddsat_i128_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v4, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
@ -5362,7 +5362,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
@ -5574,7 +5574,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_add_co_u32_e64 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_add_co_u32 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
@ -5618,7 +5618,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
@ -5964,7 +5964,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v20, v2
; GFX10-NEXT: v_mov_b32_e32 v21, v3
; GFX10-NEXT: s_movk_i32 s5, 0x7f
; GFX10-NEXT: v_add_co_u32_e64 v16, vcc_lo, v22, v8
; GFX10-NEXT: v_add_co_u32 v16, vcc_lo, v22, v8
; GFX10-NEXT: s_sub_i32 s6, 64, s5
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: s_sub_i32 s7, s5, 64
@ -6010,12 +6010,12 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: v_add_co_u32_e64 v8, s4, v26, v12
; GFX10-NEXT: v_add_co_u32 v8, s4, v26, v12
; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s4, v27, v13, s4
; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s4, v24, v14, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
@ -6056,7 +6056,7 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5
; GFX10-NEXT: v_add_co_u32_e64 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4

View File

@ -264,7 +264,7 @@ define amdgpu_kernel void @mulu24_shl64(i32 addrspace(1)* nocapture %arg) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, s1
; GFX10-NEXT: v_mov_b32_e32 v4, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v4, v2
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo
; GFX10-NEXT: global_store_dword v[2:3], v1, off
; GFX10-NEXT: s_endpgm
@ -599,7 +599,7 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v0, 0x3fff, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 2, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 2, v0
; GFX10-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%and = and i16 %x, 16383

View File

@ -29,7 +29,7 @@ define i8 @v_shl_i8(i8 %value, i8 %amount) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = shl i8 %value, %amount
ret i8 %result
@ -58,7 +58,7 @@ define i8 @v_shl_i8_7(i8 %value) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 7, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 7, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = shl i8 %value, 7
ret i8 %result
@ -592,7 +592,7 @@ define i16 @v_shl_i16(i16 %value, i16 %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = shl i16 %value, %amount
ret i16 %result
@ -693,7 +693,7 @@ define amdgpu_ps half @shl_i16_sv(i16 inreg %value, i16 %amount) {
;
; GFX10-LABEL: shl_i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v0, s0
; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0
; GFX10-NEXT: ; return to shader part epilog
%result = shl i16 %value, %amount
%cast = bitcast i16 %result to half
@ -719,7 +719,7 @@ define amdgpu_ps half @shl_i16_vs(i16 %value, i16 inreg %amount) {
;
; GFX10-LABEL: shl_i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b16_e64 v0, s0, v0
; GFX10-NEXT: v_lshlrev_b16 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = shl i16 %value, %amount
%cast = bitcast i16 %result to half

View File

@ -48,10 +48,10 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@ -111,7 +111,7 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 9, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i7 @llvm.ssub.sat.i7(i7 %lhs, i7 %rhs)
@ -162,10 +162,10 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -225,7 +225,7 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_sub_nc_i16 v0, s0, s1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
@ -4185,12 +4185,12 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32_e64 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[10:11], v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_u32 v0, s5, v6, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s5, 0x80000000, v6, s5
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc_lo
@ -4363,12 +4363,12 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
;
; GFX10-LABEL: ssubsat_i64_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[0:1]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], v[2:3]
; GFX10-NEXT: v_add_co_u32_e64 v0, s1, v4, 0
; GFX10-NEXT: v_add_co_u32 v0, s1, v4, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@ -4430,12 +4430,12 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
;
; GFX10-LABEL: ssubsat_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v4, 0
; GFX10-NEXT: v_add_co_u32 v0, s0, v4, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0
; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
@ -4534,18 +4534,18 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v17, v2
; GFX10-NEXT: v_mov_b32_e32 v18, v3
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_sub_co_u32_e64 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v14, v4
; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v15, v5, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_sub_co_u32 v19, vcc_lo, v17, v6
; GFX10-NEXT: v_sub_co_ci_u32_e32 v20, vcc_lo, v18, v7, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v12, 31, v9
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[14:15]
; GFX10-NEXT: v_ashrrev_i32_e32 v0, 31, v20
; GFX10-NEXT: v_add_co_u32_e64 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_u32 v1, s5, v12, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s5, 0x80000000, v12, s5
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[19:20], v[17:18]
; GFX10-NEXT: v_add_co_u32_e64 v2, s7, v0, 0
; GFX10-NEXT: v_add_co_u32 v2, s7, v0, 0
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s7, 0x80000000, v0, s7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc_lo
@ -5306,7 +5306,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
;
; GFX10-LABEL: ssubsat_i128_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_co_u32_e64 v4, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v6, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_sub_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
@ -5348,7 +5348,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v15, v0, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v15, v1, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
@ -5560,7 +5560,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: s_cmp_eq_u64 s[2:3], 0
; GFX10-NEXT: v_sub_co_u32_e64 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_sub_co_u32 v15, vcc_lo, v5, s0
; GFX10-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v16, vcc_lo, s1, v6, vcc_lo
; GFX10-NEXT: s_cselect_b32 s4, 1, 0
@ -5604,7 +5604,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v0, s0
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_cndmask_b32_e64 v1, v7, v1, s0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v0, vcc_lo
@ -5950,7 +5950,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v20, v2
; GFX10-NEXT: v_mov_b32_e32 v21, v3
; GFX10-NEXT: s_movk_i32 s5, 0x7f
; GFX10-NEXT: v_sub_co_u32_e64 v16, vcc_lo, v22, v8
; GFX10-NEXT: v_sub_co_u32 v16, vcc_lo, v22, v8
; GFX10-NEXT: s_sub_i32 s6, 64, s5
; GFX10-NEXT: v_sub_co_ci_u32_e32 v17, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: s_sub_i32 s7, s5, 64
@ -5996,12 +5996,12 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, v11, v0, s4
; GFX10-NEXT: v_cndmask_b32_e64 v1, v11, v1, s4
; GFX10-NEXT: v_and_b32_e32 v8, 1, v9
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v20, vcc_lo, 0, v0, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v21, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
; GFX10-NEXT: v_sub_co_u32_e64 v8, s4, v26, v12
; GFX10-NEXT: v_sub_co_u32 v8, s4, v26, v12
; GFX10-NEXT: v_sub_co_ci_u32_e64 v9, s4, v27, v13, s4
; GFX10-NEXT: v_sub_co_ci_u32_e64 v10, s4, v24, v14, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
@ -6042,7 +6042,7 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_and_b32_e32 v7, 1, v7
; GFX10-NEXT: v_cndmask_b32_e64 v3, v18, v3, s5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, v4, s5
; GFX10-NEXT: v_add_co_u32_e64 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_u32 v5, s4, v5, 0
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s4, 0, v6, s4
; GFX10-NEXT: v_cmp_ne_u32_e64 s5, 0, v7
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, 0, v3, s4

View File

@ -38,10 +38,10 @@ define i7 @v_uaddsat_i7(i7 %lhs, i7 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@ -85,8 +85,8 @@ define amdgpu_ps i7 @s_uaddsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i7 @llvm.uadd.sat.i7(i7 %lhs, i7 %rhs)
@ -127,10 +127,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -174,8 +174,8 @@ define amdgpu_ps i8 @s_uaddsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs)
@ -1644,7 +1644,7 @@ define i16 @v_uaddsat_i16(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
@ -1677,7 +1677,7 @@ define amdgpu_ps i16 @s_uaddsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
;
; GFX10-LABEL: s_uaddsat_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_add_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
@ -1707,7 +1707,7 @@ define amdgpu_ps half @uaddsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
;
; GFX10-LABEL: uaddsat_i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_nc_u16_e64 v0, s0, v0 clamp
; GFX10-NEXT: v_add_nc_u16 v0, s0, v0 clamp
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
@ -1737,7 +1737,7 @@ define amdgpu_ps half @uaddsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
;
; GFX10-LABEL: uaddsat_i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, s0 clamp
; GFX10-NEXT: v_add_nc_u16 v0, v0, s0 clamp
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
@ -2599,7 +2599,7 @@ define i64 @v_uaddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
@ -2714,7 +2714,7 @@ define amdgpu_ps <2 x float> @uaddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
;
; GFX10-LABEL: uaddsat_i64_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc_lo
@ -2758,7 +2758,7 @@ define amdgpu_ps <2 x float> @uaddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
;
; GFX10-LABEL: uaddsat_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
@ -2823,9 +2823,9 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v11, v5
; GFX10-NEXT: v_mov_b32_e32 v15, v6
; GFX10-NEXT: v_mov_b32_e32 v16, v7
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v10
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v5, vcc_lo, v2, v15
; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v2, v15
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
@ -3203,7 +3203,7 @@ define amdgpu_ps <4 x float> @uaddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
;
; GFX10-LABEL: uaddsat_i128_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, s3, v3, vcc_lo
@ -3297,7 +3297,7 @@ define amdgpu_ps <4 x float> @uaddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
;
; GFX10-LABEL: uaddsat_i128_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
@ -3440,7 +3440,7 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v16, v10
; GFX10-NEXT: v_mov_b32_e32 v17, v11
; GFX10-NEXT: v_mov_b32_e32 v10, v12
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v18
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v18
; GFX10-NEXT: v_mov_b32_e32 v11, v13
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v19, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v20, v14
@ -3449,7 +3449,7 @@ define <2 x i128> @v_uaddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v17, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[18:19]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v4, vcc_lo, v4, v10
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v4, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v5, v11, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v20, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v21, vcc_lo

View File

@ -37,10 +37,10 @@ define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 9, v1
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 9, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 9, v1
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
ret i7 %result
@ -83,8 +83,8 @@ define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) {
; GFX10-NEXT: s_bfe_u32 s2, 9, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 9, v0
; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 9, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs)
@ -124,10 +124,10 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -170,8 +170,8 @@ define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) {
; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_lshrrev_b16 v0, 8, v0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
@ -1561,7 +1561,7 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
@ -1593,7 +1593,7 @@ define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) {
;
; GFX10-LABEL: s_usubsat_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, s1 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
@ -1622,7 +1622,7 @@ define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) {
;
; GFX10-LABEL: usubsat_i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_nc_u16_e64 v0, s0, v0 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, s0, v0 clamp
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
@ -1651,7 +1651,7 @@ define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) {
;
; GFX10-LABEL: usubsat_i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, s0 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, v0, s0 clamp
; GFX10-NEXT: ; return to shader part epilog
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
%cast = bitcast i16 %result to half
@ -2469,7 +2469,7 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32_e64 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc_lo
@ -2584,7 +2584,7 @@ define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
;
; GFX10-LABEL: usubsat_i64_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
@ -2628,7 +2628,7 @@ define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
;
; GFX10-LABEL: usubsat_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo
@ -2693,10 +2693,10 @@ define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_mov_b32_e32 v11, v1
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_mov_b32_e32 v1, v3
; GFX10-NEXT: v_sub_co_u32_e64 v8, vcc_lo, v10, v4
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v10, v4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, v11, v5, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[4:5]
; GFX10-NEXT: v_sub_co_u32_e64 v4, s4, v0, v6
; GFX10-NEXT: v_sub_co_u32 v4, s4, v0, v6
; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v1, v7, s4
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[6:7]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo
@ -3079,7 +3079,7 @@ define amdgpu_ps <4 x float> @usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
@ -3173,7 +3173,7 @@ define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[2:3], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX10-NEXT: v_and_b32_e32 v4, 1, v4
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo
@ -3326,13 +3326,13 @@ define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[24:25], v[14:15]
; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v16
; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v0, vcc_lo, v22, v8
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v22, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v23, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v8, v18, v17, s5
; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v20, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, s4
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v21, v11, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v4, vcc_lo, v26, v12
; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v26, v12
; GFX10-NEXT: v_and_b32_e32 v8, 1, v8
; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v27, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, s4

View File

@ -81,8 +81,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspac
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_2
@ -117,7 +117,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspac
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB0_2
@ -265,8 +265,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_2
@ -304,7 +304,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB1_2
@ -494,13 +494,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace
; GFX1064-NEXT: v_readlane_b32 s7, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s6, 16
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064-NEXT: v_readlane_b32 s9, v1, 63
; GFX1064-NEXT: v_writelane_b32 v3, s7, 32
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064-NEXT: s_mov_b32 s4, s9
; GFX1064-NEXT: v_writelane_b32 v3, s8, 48
@ -556,7 +556,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace
; GFX1032-NEXT: v_readlane_b32 s6, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
@ -680,8 +680,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
@ -718,7 +718,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
@ -899,8 +899,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB4_2
@ -935,7 +935,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s0, v0
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v0
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
@ -947,7 +947,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB4_2
@ -982,7 +982,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s0, v0
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
@ -1170,8 +1170,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspac
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB6_2
@ -1207,7 +1207,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspac
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB6_2
@ -1356,8 +1356,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB7_2
@ -1395,7 +1395,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB7_2
@ -1585,13 +1585,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace
; GFX1064-NEXT: v_readlane_b32 s7, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s6, 16
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_readlane_b32 s8, v1, 47
; GFX1064-NEXT: v_readlane_b32 s9, v1, 63
; GFX1064-NEXT: v_writelane_b32 v3, s7, 32
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX1064-NEXT: s_mov_b32 s4, s9
; GFX1064-NEXT: v_writelane_b32 v3, s8, 48
@ -1647,7 +1647,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace
; GFX1032-NEXT: v_readlane_b32 s6, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s4, -1
; GFX1032-NEXT: v_writelane_b32 v3, s5, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s4
@ -1811,8 +1811,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB9_2
@ -1840,7 +1840,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v2
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@ -1852,7 +1852,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB9_2
@ -1880,7 +1880,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac
; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v2
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@ -2036,8 +2036,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: s_mov_b64 s[8:9], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
@ -2072,7 +2072,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_mov_b32 s6, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s0, v0
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
@ -2084,7 +2084,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: s_mov_b32 s8, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
@ -2119,7 +2119,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm

View File

@ -108,8 +108,8 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_2
@ -139,7 +139,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB0_2
@ -274,8 +274,8 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_2
@ -309,7 +309,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB1_2
@ -484,13 +484,13 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -541,7 +541,7 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -671,12 +671,12 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -707,7 +707,7 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
@ -837,8 +837,8 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB4_2
@ -870,7 +870,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB4_2
@ -1038,8 +1038,8 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB5_2
@ -1069,7 +1069,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1064-NEXT: v_readfirstlane_b32 s4, v2
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@ -1080,7 +1080,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB5_2
@ -1110,7 +1110,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive
; GFX1032-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@ -1286,8 +1286,8 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB7_2
@ -1318,7 +1318,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB7_2
@ -1454,8 +1454,8 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz BB8_2
@ -1489,7 +1489,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB8_2
@ -1664,13 +1664,13 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -1721,7 +1721,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -1851,12 +1851,12 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX1064-NEXT: v_readlane_b32 s2, v1, 0
; GFX1064-NEXT: v_readlane_b32 s3, v1, 32
; GFX1064-NEXT: s_mov_b64 exec, s[0:1]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_add_i32 s0, s2, s3
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -1887,7 +1887,7 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: s_mov_b32 exec_lo, s0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v3, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
@ -2019,8 +2019,8 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB11_2
@ -2042,7 +2042,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1064-NEXT: v_readfirstlane_b32 s3, v2
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v1
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@ -2055,7 +2055,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s3, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB11_2
@ -2077,7 +2077,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0
; GFX1032-NEXT: v_readfirstlane_b32 s3, v2
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v1
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
@ -2226,8 +2226,8 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT: s_mov_b64 s[6:7], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB12_2
@ -2257,7 +2257,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1064-NEXT: v_readfirstlane_b32 s4, v2
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v0
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@ -2268,7 +2268,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT: s_mov_b32 s5, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB12_2
@ -2298,7 +2298,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive
; GFX1032-NEXT: v_readfirstlane_b32 s4, v2
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v0
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@ -2518,13 +2518,13 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -2575,7 +2575,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -2752,13 +2752,13 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -2809,7 +2809,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -2986,13 +2986,13 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -3043,7 +3043,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -3222,13 +3222,13 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -3281,7 +3281,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -3425,8 +3425,8 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-LABEL: max_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -3459,7 +3459,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-LABEL: max_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
@ -3639,13 +3639,13 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -3698,7 +3698,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -3842,8 +3842,8 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-LABEL: min_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -3876,7 +3876,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-LABEL: min_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
@ -4054,13 +4054,13 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -4111,7 +4111,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -4252,8 +4252,8 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-LABEL: umax_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -4286,7 +4286,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-LABEL: umax_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
@ -4464,13 +4464,13 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
@ -4521,7 +4521,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
@ -4662,8 +4662,8 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1064-LABEL: umin_i64_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
@ -4696,7 +4696,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) {
; GFX1032-LABEL: umin_i64_constant:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo

View File

@ -89,8 +89,8 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_mov_b64 s[12:13], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s12, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s13, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[28:29], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_3
@ -124,7 +124,7 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: s_mov_b32 s10, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s10, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s10, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB0_3
@ -316,13 +316,13 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1064-NEXT: v_readlane_b32 s13, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s12, 16
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT: v_readlane_b32 s12, v1, 63
; GFX1064-NEXT: v_readlane_b32 s14, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s13, 32
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[10:11], -1
; GFX1064-NEXT: v_writelane_b32 v3, s14, 48
; GFX1064-NEXT: s_mov_b64 exec, s[10:11]
@ -375,7 +375,7 @@ define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %in
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s10, v1, 15
; GFX1032-NEXT: s_mov_b32 exec_lo, s9
; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s9, -1
; GFX1032-NEXT: v_writelane_b32 v3, s10, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s9

View File

@ -51,7 +51,7 @@ entry:
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
;
; GFX1010: v_add_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vadd64rr(i64 addrspace(1)* %out, i64 %a) {
entry:
@ -81,7 +81,7 @@ entry:
; GFX9: v_mov_b32_e32 v1, 0x1234
; GFX9: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
;
; GFX1010: v_add_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0, 0x1234, [[CARRY]]
define amdgpu_kernel void @vadd64ri(i64 addrspace(1)* %out) {
entry:
@ -125,7 +125,7 @@ define amdgpu_kernel void @suaddo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
;
; GFX1010: v_add_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1010: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1010: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
define amdgpu_kernel void @uaddo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
%uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
@ -170,7 +170,7 @@ define amdgpu_kernel void @suaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v0
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
;
; GFX1010: v_add_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX1010: v_add_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX1010: v_add_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vuaddo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -236,7 +236,7 @@ entry:
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
;
; GFX1010: v_sub_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vsub64rr(i64 addrspace(1)* %out, i64 %a) {
entry:
@ -266,7 +266,7 @@ entry:
; GFX9: v_mov_b32_e32 v1, 0x1234
; GFX9: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
;
; GFX1010: v_sub_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], 0x56789876, v{{[0-9]+}}
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], 0x1234, 0, [[CARRY]]
define amdgpu_kernel void @vsub64ri(i64 addrspace(1)* %out) {
entry:
@ -310,7 +310,7 @@ define amdgpu_kernel void @susubo32(i32 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
;
; GFX1010: v_sub_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1010: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
; GFX1010: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, [[CARRY]]
define amdgpu_kernel void @usubo32_vcc_user(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
@ -355,7 +355,7 @@ define amdgpu_kernel void @susubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
; GFX9: v_sub_co_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v0
; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
;
; GFX1010: v_sub_co_u32_e64 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX1010: v_sub_co_u32 v{{[0-9]+}}, [[CARRY:s[0-9]+]], s{{[0-9]+}}, v0
; GFX1010: v_sub_co_ci_u32_e64 v{{[0-9]+}}, [[CARRY]], s{{[0-9]+}}, 0, [[CARRY]]
define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -26,7 +26,7 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(i32 add
; GCN-LABEL: test_sink_small_offset_global_atomic_csub_i32:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo

View File

@ -695,7 +695,7 @@ define <2 x i16> @chain_hi_to_lo_flat_other_dep(i16 addrspace(0)* %ptr) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v0, 2
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, 2
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ushort v2, v[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -452,7 +452,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1
; GFX10-NEXT: v_add_nc_u16_e64 v1, v1, -8
; GFX10-NEXT: v_add_nc_u16 v1, v1, -8
; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%val = load i8, i8 addrspace(1)* %valptr

View File

@ -998,14 +998,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
; GFX10-NEXT: v_add_nc_u16_e64 v4, v0, 9
; GFX10-NEXT: v_add_nc_u16_e64 v2, v2, 9
; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
; GFX10-NEXT: v_add_nc_u16 v2, v2, 9
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_add_nc_u16_e64 v1, v1, s0
; GFX10-NEXT: v_add_nc_u16_e64 v5, v2, s0
; GFX10-NEXT: v_add_nc_u16 v1, v1, s0
; GFX10-NEXT: v_add_nc_u16 v5, v2, s0
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0

View File

@ -45,7 +45,7 @@ define i32 @s_add_co_select_user() {
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s5, s4, s4
; GFX10-NEXT: v_add_co_u32 v0, s5, s4, s4
; GFX10-NEXT: s_cmpk_lg_u32 s5, 0x0
; GFX10-NEXT: s_addc_u32 s5, s4, 0
; GFX10-NEXT: s_cselect_b32 s6, 1, 0

View File

@ -671,10 +671,10 @@ define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v3, 15, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v3, v0
; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
@ -846,27 +846,27 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_and_b32_e32 v4, 15, v4
; GFX10-NEXT: v_and_b32_e32 v9, 15, v6
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v6
; GFX10-NEXT: v_and_b32_e32 v15, 15, v8
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_lshrrev_b16_e64 v2, v4, v2
; GFX10-NEXT: v_lshlrev_b16_e64 v10, 1, v10
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
; GFX10-NEXT: v_and_b32_e32 v19, 15, v6
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
; GFX10-NEXT: v_lshrrev_b16_e64 v4, v9, v7
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v6, v19, v10
; GFX10-NEXT: v_lshrrev_b16 v4, v9, v7
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v6, v19, v10
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_and_b32_e32 v7, 15, v11
; GFX10-NEXT: v_and_b32_e32 v2, 15, v5
; GFX10-NEXT: v_or_b32_e32 v11, v6, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, v7, v1
; GFX10-NEXT: v_lshrrev_b16_e64 v2, v2, v3
; GFX10-NEXT: v_lshlrev_b16 v1, v7, v1
; GFX10-NEXT: v_lshrrev_b16 v2, v2, v3
; GFX10-NEXT: v_lshl_or_b32 v0, v11, 16, v0
; GFX10-NEXT: v_or_b32_e32 v1, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -996,15 +996,15 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v6
; GFX10-NEXT: v_and_b32_e32 v6, 15, v6
; GFX10-NEXT: v_lshlrev_b16_e64 v8, 1, v8
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
; GFX10-NEXT: v_and_b32_e32 v13, 15, v10
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
; GFX10-NEXT: v_lshrrev_b16_e64 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 1, v0
; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; GFX10-NEXT: v_lshlrev_b16_e64 v11, 1, v11
; GFX10-NEXT: v_lshlrev_b16_e64 v7, v9, v8
; GFX10-NEXT: v_lshlrev_b16 v11, 1, v11
; GFX10-NEXT: v_lshlrev_b16 v7, v9, v8
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v10
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
@ -1013,12 +1013,12 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; GFX10-NEXT: v_and_b32_e32 v9, 15, v9
; GFX10-NEXT: v_and_b32_e32 v10, 15, v10
; GFX10-NEXT: v_and_b32_e32 v15, 15, v8
; GFX10-NEXT: v_lshrrev_b16_e64 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b16_e64 v3, v5, v3
; GFX10-NEXT: v_lshrrev_b16_e64 v4, v13, v12
; GFX10-NEXT: v_lshlrev_b16_e64 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, v15, v0
; GFX10-NEXT: v_lshlrev_b16_e64 v5, v9, v11
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_lshrrev_b16 v4, v13, v12
; GFX10-NEXT: v_lshlrev_b16 v1, v10, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v15, v0
; GFX10-NEXT: v_lshlrev_b16 v5, v9, v11
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff

View File

@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; GCN-LABEL: {{^}}test_add_lit:
; GFX10: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
; GFX10: v_add_co_u32 v{{[0-9]+}}, vcc_lo, 0x80992bff, v{{[0-9]+}}
; GFX10: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0xe7, v{{[0-9]+}}, vcc_lo
; GFX9: v_mov_b32_e32 [[C2:v[0-9]+]], 0xe7
; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x80992bff, v{{[0-9]+}}

View File

@ -120,9 +120,9 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %s
;
; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v2, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0

View File

@ -85,7 +85,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inr
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -111,7 +111,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inr
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -137,7 +137,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inr
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -241,7 +241,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inr
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -264,7 +264,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inr
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0xfffff800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -309,7 +309,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)*
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -333,7 +333,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)*
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -357,7 +357,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)*
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -382,7 +382,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)*
;
; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x1000, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x1000, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -407,7 +407,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0x800, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -431,7 +431,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -455,7 +455,7 @@ define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1
;
; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], 0, s2
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -496,9 +496,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -527,9 +527,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -553,9 +553,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspa
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -584,9 +584,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspa
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -626,9 +626,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -668,9 +668,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspa
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -694,9 +694,9 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8
;
; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -935,7 +935,7 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i3
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -960,9 +960,9 @@ define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)
;
; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_u32 v0, vcc, v0, s2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1001,7 +1001,7 @@ define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1087,7 +1087,7 @@ define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float ad
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc, s2, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
; GFX10-NEXT: global_load_dword v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -454,12 +454,12 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: BB4_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_and_b32_e32 v2, s1, v4
; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1
; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
@ -540,11 +540,11 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: BB5_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_and_b32_e32 v2, s1, v4
; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v2
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX10-NEXT: v_mul_f32_e32 v8, v7, v1
; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_trunc_f32_e32 v10, v8
; GFX10-NEXT: v_mad_f32 v7, -v10, v0, v7
@ -630,14 +630,14 @@ define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_bfe_i32 v5, v4, 0, 16
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_i32_e32 v7, v5
; GFX10-NEXT: v_xor_b32_e32 v8, s4, v5
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_mul_f32_e32 v2, v7, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v8, 30, v8
; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_trunc_f32_e32 v2, v2
; GFX10-NEXT: v_or_b32_e32 v8, 1, v8
@ -724,7 +724,7 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_bfe_i32 v7, v4, 0, 16
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX10-NEXT: v_add_nc_u16_e64 v4, v4, 1
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_i32_e32 v11, v7
; GFX10-NEXT: v_xor_b32_e32 v6, s1, v7
; GFX10-NEXT: v_mul_f32_e32 v8, v11, v1
@ -738,7 +738,7 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_add_nc_u32_e32 v2, v8, v9
; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v5
; GFX10-NEXT: v_add_co_u32 v5, s0, s2, v5
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, v6, s0
; GFX10-NEXT: v_mul_lo_u32 v2, v2, s1

View File

@ -2875,9 +2875,9 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7]
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, v1
; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v3), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0

View File

@ -905,9 +905,9 @@ define amdgpu_kernel void @idot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v0, 8, v1
; GFX10-DL-NEXT: v_lshrrev_b16 v0, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b16 v3, 8, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v7, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
@ -1120,30 +1120,30 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, v1
; GFX10-DL-NEXT: v_ashrrev_i16 v5, 8, v1
; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, v2
; GFX10-DL-NEXT: v_ashrrev_i16 v6, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, v1
; GFX10-DL-NEXT: v_ashrrev_i16 v7, 8, v1
; GFX10-DL-NEXT: v_and_b32_sdwa v1, v4, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 8, v2
; GFX10-DL-NEXT: v_ashrrev_i16 v8, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v2, v4, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v8, 16, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v4, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v2, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,

View File

@ -2020,10 +2020,10 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, v1
; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v6, 8, v2
; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@ -2038,12 +2038,12 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v5, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v2, v3, v4
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@ -2225,23 +2225,23 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v8, 8, v2
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v4, v5
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, v1
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v6, v7
; GFX10-DL-NEXT: v_lshrrev_b16 v8, 8, v2
; GFX10-DL-NEXT: v_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
; GFX10-DL-NEXT: v_mul_lo_u16 v9, v6, v7
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v1, v2, v3
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v8
; GFX10-DL-NEXT: v_lshlrev_b16 v4, 8, v4
; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v8
; GFX10-DL-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5
; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v8, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v4
; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,

View File

@ -646,51 +646,51 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v15
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
@ -724,51 +724,51 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
@ -1220,51 +1220,51 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v15
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v9
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v3, v2, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v8, v15, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v6, v7, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v7, v1
; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
@ -1298,51 +1298,51 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v18, v17, v3
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v1, v10, v16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v3, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v15, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v6, v7, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v5, v0
; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
@ -2574,30 +2574,30 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v3, v4, v5
; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, v4, v19
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v9, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v10
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v10
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v2, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v6
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v6
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v2, 12, v2 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v7, v1, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v7, v1, v4
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v2, v2, v3
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v7, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v7, v5
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@ -2662,30 +2662,30 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v3, v4, v5
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v5, 12, v8 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, v4, v19
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v1, v1, v10
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v1, v1, v10
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v3, v6, 16, v3
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v5, 12, v5 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v14, 16, v4
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v1, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_lshlrev_b16 v3, 12, v4 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v7, v0, v6
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v7, v0, v6
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v7, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v7, v4
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v5
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v3
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_acc16_vecMul:
@ -3215,80 +3215,80 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v0, 20, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v8, v8, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v8, v8, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v0, 12, v0
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v17
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v9, v9, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v8, 8, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v0, 12, v0
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v10, v10, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v17
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v9, v16
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v6, v6, v13
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v10, v10, v15
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v9, v0, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16_e64 v23, 12, v12
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v11, v7, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v6, 8, v6
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v10, 8, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v9, v0, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v23, 12, v12
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v11, v7, v14
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 8, v6
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v1, v1, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16_e64 v2, v5, v23
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16_e64 v9, 8, v9
; GFX10-DL-XNACK-NEXT: v_mul_lo_u16 v2, v5, v23
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v6, v11, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v11, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v3, v1, v3
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v1, v3
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 8, v11
; GFX10-DL-XNACK-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v9, v3, v10
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v9, v3, v10
; GFX10-DL-XNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v0, v9, v8
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v0, v0, v2
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v5, v23, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-XNACK-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
@ -3319,77 +3319,77 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v18, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v0, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v18, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 20, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v8, v8, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v8, v8, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v23, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v8, 8, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v15, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v6, v6, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v9, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v10, v10, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v23, v9, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v6, v6, v13
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v3, 12, v3
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v10, v10, v15
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v3, v3, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16_e64 v11, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v9, v7, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v6, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v10, 8, v10
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v1, v1, v18
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v3, v3, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v9, v7, v14
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v1, v1, v18
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16_e64 v12, v5, v11
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16_e64 v3, 8, v3
; GFX10-DL-NOXNACK-NEXT: v_mul_lo_u16 v12, v5, v11
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v9, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v10, 16, v6
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v2, v1, v2
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v2, v1, v2
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 8, v9
; GFX10-DL-NOXNACK-NEXT: v_or_b32_sdwa v1, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v9, v2, v9
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v9, v2, v9
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v9, v8
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v2
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v5, v11, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NOXNACK-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
; GFX10-DL-LABEL: idot8_acc8_vecMul:

View File

@ -2393,32 +2393,32 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v23, 28, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 16, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v6, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v6, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v9, v9, v10
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v11, v4, v11
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v12
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v12
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v9
; GFX10-DL-NEXT: v_add_nc_u16_e64 v14, v3, v9
; GFX10-DL-NEXT: v_add_nc_u16 v14, v3, v9
; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v10
; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v5
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v6
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v14, v7
; GFX10-DL-NEXT: v_add_nc_u16 v3, v14, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v23, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v1, v3, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v1, v3
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
@ -2782,49 +2782,49 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v9, v10
; GFX10-DL-NEXT: v_mul_lo_u16 v9, v9, v10
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v8, v8, v13
; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v13
; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 24, 4
; GFX10-DL-NEXT: v_and_b32_e32 v11, 15, v1
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 8, v9
; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 4, 4
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v14
; GFX10-DL-NEXT: v_mul_lo_u16 v7, v7, v14
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v23, v2, 16, 4
; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v1, v1, v15
; GFX10-DL-NEXT: v_mul_lo_u16 v1, v1, v15
; GFX10-DL-NEXT: v_or_b32_e32 v8, v8, v9
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v9, v0, v10
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v10, v6, v13
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v2, v11, v2
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-DL-NEXT: v_mul_lo_u16 v9, v0, v10
; GFX10-DL-NEXT: v_mul_lo_u16 v10, v6, v13
; GFX10-DL-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v2
; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-DL-NEXT: v_mul_lo_u16_e64 v11, v5, v23
; GFX10-DL-NEXT: v_mul_lo_u16 v11, v5, v23
; GFX10-DL-NEXT: v_or_b32_e32 v7, v10, v7
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 8, v9
; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX10-DL-NEXT: v_or_b32_sdwa v10, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: v_or_b32_e32 v2, v11, v9
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v9, 16, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v1, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v1, v3
; GFX10-DL-NEXT: v_or_b32_sdwa v1, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-DL-NEXT: v_add_nc_u16_e64 v9, v3, v10
; GFX10-DL-NEXT: v_add_nc_u16 v9, v3, v10
; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1]
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v9, v8
; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v2
; GFX10-DL-NEXT: v_add_nc_u16 v0, v9, v8
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2
; GFX10-DL-NEXT: v_mad_u16 v0, v5, v23, v0
; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v7
; GFX10-DL-NEXT: v_mad_u16 v0, v6, v13, v0
; GFX10-DL-NEXT: v_add_nc_u16_e64 v0, v0, v1
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
; GFX10-DL-NEXT: global_store_byte v19, v0, s[0:1]
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
@ -3121,28 +3121,28 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v7
; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v4, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 12, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v8
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 16, 4
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v6
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v7
; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 20, 4
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v8
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v11, v8
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v6
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16_e64 v3, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16_e64 v2, v3, v4
; GFX10-DL-NEXT: v_add_nc_u16_e64 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm

View File

@ -1115,7 +1115,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, i16
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00]
; GFX10-NEXT: v_add_nc_u16 v0, v0, -1 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x83,0x01,0x00]
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
@ -1176,7 +1176,7 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, i16
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00]
; GFX10-NEXT: v_add_nc_u16 v0, v0, -2 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0x85,0x01,0x00]
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
@ -1237,7 +1237,7 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, i1
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x28,0xe0,0x00,0x00,0x02,0x80]
; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: v_add_nc_u16_e64 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00]
; GFX10-NEXT: v_add_nc_u16 v0, v0, -16 ; encoding: [0x00,0x00,0x03,0xd7,0x00,0xa1,0x01,0x00]
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
@ -1372,7 +1372,7 @@ define void @mul_inline_imm_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
; GFX10-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1407,7 +1407,7 @@ define void @mul_inline_imm_neg_0.5_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
; GFX10-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1442,7 +1442,7 @@ define void @mul_inline_imm_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
; GFX10-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1477,7 +1477,7 @@ define void @mul_inline_imm_neg_1.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
; GFX10-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1512,7 +1512,7 @@ define void @shl_inline_imm_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
; GFX10-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1548,7 +1548,7 @@ define void @shl_inline_imm_neg_2.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_lshlrev_b16_e64 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
; GFX10-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x14,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1584,7 +1584,7 @@ define void @mul_inline_imm_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
; GFX10-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1619,7 +1619,7 @@ define void @mul_inline_imm_neg_4.0_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
; GFX10-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
@ -1654,7 +1654,7 @@ define void @mul_inline_imm_inv2pi_i16(i16 addrspace(1)* %out, i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: v_mul_lo_u16_e64 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
; GFX10-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]

View File

@ -6,7 +6,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOMADMACF32,GFX103 %s
; GCN-LABEL: {{^}}test_mul_legacy_f32:
; GCN: v_mul_legacy_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
store float %result, float addrspace(1)* %out, align 4
@ -14,7 +14,7 @@ define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %
}
; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32:
; GCN: v_mul_legacy_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
store float %result, float addrspace(1)* %out, align 4
@ -22,7 +22,7 @@ define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out,
}
; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32:
; GCN: v_mul_legacy_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
%result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
store float %result, float addrspace(1)* %out, align 4
@ -30,7 +30,7 @@ define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out,
}
; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
; GCN: v_mul_legacy_f32_e{{(32|64)}} v{{[0-9]+}}, |s{{[0-9]+}}|, |{{[sv][0-9]+}}|
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, |s{{[0-9]+}}|, |{{[sv][0-9]+}}|
define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a)
%b.fabs = call float @llvm.fabs.f32(float %b)
@ -41,7 +41,7 @@ define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, fl
; Don't form mad/mac instructions because they don't support denormals.
; GCN-LABEL: {{^}}test_add_mul_legacy_f32:
; GCN: v_mul_legacy_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GCN: v_mul_legacy_f32{{[_e3264]*}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
; GCN: v_add_f32_e{{(32|64)}} v{{[0-9]+}}, s{{[0-9]+}}, {{[sv][0-9]+}}
define amdgpu_kernel void @test_add_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
%mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)

View File

@ -62,12 +62,12 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) {
; GFX10-NEXT: v_mul_hi_u32 v9, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v6, v5
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v6, v5
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v4, v7, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, v3, v1
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v1
; GFX10-NEXT: v_add3_u32 v1, v6, v5, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v4, vcc_lo
; GFX10-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[3:4]
@ -164,21 +164,21 @@ define { i64, i1 } @smulo_i64_s_s(i64 %x, i64 %y) {
; GFX10-NEXT: v_mul_hi_u32 v7, v1, v2
; GFX10-NEXT: v_mul_hi_i32 v9, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v11, v1, v3
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v5, v15
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v5, v15
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v6, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v10, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v6, v7, vcc_lo
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v11, vcc_lo, v6, v11
; GFX10-NEXT: v_add_co_u32 v11, vcc_lo, v6, v11
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT: v_sub_co_u32_e64 v9, vcc_lo, v11, v2
; GFX10-NEXT: v_sub_co_u32 v9, vcc_lo, v11, v2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v1
; GFX10-NEXT: v_add3_u32 v1, v5, v15, v8
; GFX10-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc_lo
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v1
; GFX10-NEXT: v_sub_co_u32_e64 v8, vcc_lo, v6, v0
; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v6, v0
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v9, vcc_lo, 0, v7, vcc_lo
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3
@ -410,7 +410,7 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, s6, v1, vcc_lo
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: s_mov_b32 s5, s4
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, s0
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i32_e64 vcc_lo, s3, 0

View File

@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}mul_legacy
; GFX908: v_mul_legacy_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX90A: v_mul_legacy_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX90A: v_mul_legacy_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @mul_legacy(
float addrspace(1)* %r,
float addrspace(1)* %a,

View File

@ -114,7 +114,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
@ -128,7 +128,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
@ -302,7 +302,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -316,7 +316,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

View File

@ -89,7 +89,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
@ -104,7 +104,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
@ -225,7 +225,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1
; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2]
; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -240,7 +240,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1
; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
; GFX10-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX10-CU-NEXT: flat_load_dword v2, v[1:2]
; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

View File

@ -180,7 +180,7 @@ define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrs
; SI: v_min_i32_e32
; GFX8_9: v_min_i16_e32
; GFX10: v_min_i16_e64
; GFX10: v_min_i16
; EG: MIN_INT
define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
@ -354,7 +354,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs
; GFX8_9_10: {{flat|global}}_load_ubyte
; GFX8_9_10: {{flat|global}}_load_ubyte
; GFX8_9: v_min_u16_e32
; GFX10: v_min_u16_e64
; GFX10: v_min_u16
; EG: MIN_UINT
define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {

View File

@ -17,7 +17,7 @@ define i8 @flat_inst_valu_offset_1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, 1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -39,7 +39,7 @@ define i8 @flat_inst_valu_offset_11bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -61,7 +61,7 @@ define i8 @flat_inst_valu_offset_12bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -85,7 +85,7 @@ define i8 @flat_inst_valu_offset_13bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -109,7 +109,7 @@ define i8 @flat_inst_valu_offset_neg_11bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -133,7 +133,7 @@ define i8 @flat_inst_valu_offset_neg_12bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -157,7 +157,7 @@ define i8 @flat_inst_valu_offset_neg_13bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -179,7 +179,7 @@ define i8 @flat_inst_valu_offset_2x_11bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -203,7 +203,7 @@ define i8 @flat_inst_valu_offset_2x_12bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -227,7 +227,7 @@ define i8 @flat_inst_valu_offset_2x_13bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3fff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3fff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -251,7 +251,7 @@ define i8 @flat_inst_valu_offset_2x_neg_11bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -275,7 +275,7 @@ define i8 @flat_inst_valu_offset_2x_neg_12bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -299,7 +299,7 @@ define i8 @flat_inst_valu_offset_2x_neg_13bit_max(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -324,7 +324,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -349,7 +349,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -374,7 +374,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -399,7 +399,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -424,7 +424,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -449,7 +449,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -475,7 +475,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -501,7 +501,7 @@ define i8 @flat_inst_valu_offset_64bit_11bit_neg_high_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -527,7 +527,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -553,7 +553,7 @@ define i8 @flat_inst_valu_offset_64bit_12bit_neg_high_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -579,7 +579,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split0(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@ -605,7 +605,7 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(i8* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: flat_load_ubyte v0, v[0:1]
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)

View File

@ -57,7 +57,7 @@ define i8 @global_inst_valu_offset_12bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -81,7 +81,7 @@ define i8 @global_inst_valu_offset_13bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -123,7 +123,7 @@ define i8 @global_inst_valu_offset_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -147,7 +147,7 @@ define i8 @global_inst_valu_offset_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -169,7 +169,7 @@ define i8 @global_inst_valu_offset_2x_11bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -193,7 +193,7 @@ define i8 @global_inst_valu_offset_2x_12bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -217,7 +217,7 @@ define i8 @global_inst_valu_offset_2x_13bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x3800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -239,7 +239,7 @@ define i8 @global_inst_valu_offset_2x_neg_11bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -263,7 +263,7 @@ define i8 @global_inst_valu_offset_2x_neg_12bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffe000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -287,7 +287,7 @@ define i8 @global_inst_valu_offset_2x_neg_13bit_max(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffc000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -312,7 +312,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split0(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -337,7 +337,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_split1(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -362,7 +362,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split0(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -387,7 +387,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_split1(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -412,7 +412,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split0(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -437,7 +437,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_split1(i8 addrspace(1)* %p) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 2, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -463,7 +463,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split0(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -489,7 +489,7 @@ define i8 @global_inst_valu_offset_64bit_11bit_neg_high_split1(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -515,7 +515,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split0(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -541,7 +541,7 @@ define i8 @global_inst_valu_offset_64bit_12bit_neg_high_split1(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -567,7 +567,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split0(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -593,7 +593,7 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(i8 addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -748,7 +748,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(i8 addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -778,7 +778,7 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(i8 addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -883,7 +883,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(i8 addrspace
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xfffff000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -913,7 +913,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(i8 addrspace
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffe000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -943,7 +943,7 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(i8 addrspace
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0xffffc000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -973,7 +973,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1003,7 +1003,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0x800, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1033,7 +1033,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x800, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0x800, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1064,7 +1064,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1095,7 +1095,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x1800, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0x1800, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1126,7 +1126,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(i8 addrspa
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, 0x2000, s0
; GFX10-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1159,7 +1159,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1192,7 +1192,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1225,7 +1225,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1258,7 +1258,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1291,7 +1291,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1324,7 +1324,7 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(i
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x2000, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -2,7 +2,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; GCN-LABEL: {{^}}shl_i16:
; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_lshlrev_b16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @shl_i16(i16 %x, i16 %y) {
%res = shl i16 %x, %y
@ -10,7 +10,7 @@ define i16 @shl_i16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}lshr_i16:
; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_lshrrev_b16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @lshr_i16(i16 %x, i16 %y) {
%res = lshr i16 %x, %y
@ -18,7 +18,7 @@ define i16 @lshr_i16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}ashr_i16:
; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_ashrrev_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @ashr_i16(i16 %x, i16 %y) {
%res = ashr i16 %x, %y
@ -26,7 +26,7 @@ define i16 @ashr_i16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}add_u16:
; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_add_{{(nc_)*}}u16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @add_u16(i16 %x, i16 %y) {
%res = add i16 %x, %y
@ -34,7 +34,7 @@ define i16 @add_u16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}sub_u16:
; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_sub_{{(nc_)*}}u16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @sub_u16(i16 %x, i16 %y) {
%res = sub i16 %x, %y
@ -42,7 +42,7 @@ define i16 @sub_u16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}mul_lo_u16:
; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_mul_lo_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @mul_lo_u16(i16 %x, i16 %y) {
%res = mul i16 %x, %y
@ -50,7 +50,7 @@ define i16 @mul_lo_u16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}min_u16:
; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_min_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @min_u16(i16 %x, i16 %y) {
%cmp = icmp ule i16 %x, %y
@ -59,7 +59,7 @@ define i16 @min_u16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}min_i16:
; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_min_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @min_i16(i16 %x, i16 %y) {
%cmp = icmp sle i16 %x, %y
@ -68,7 +68,7 @@ define i16 @min_i16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}max_u16:
; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_max_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @max_u16(i16 %x, i16 %y) {
%cmp = icmp uge i16 %x, %y
@ -77,7 +77,7 @@ define i16 @max_u16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}max_i16:
; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_max_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GCN-NEXT: s_setpc_b64
define i16 @max_i16(i16 %x, i16 %y) {
%cmp = icmp sge i16 %x, %y
@ -86,7 +86,7 @@ define i16 @max_i16(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}shl_i16_zext_i32:
; GCN: v_lshlrev_b16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_lshlrev_b16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
@ -96,7 +96,7 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}lshr_i16_zext_i32:
; GCN: v_lshrrev_b16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_lshrrev_b16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
@ -106,7 +106,7 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}ashr_i16_zext_i32:
; GCN: v_ashrrev_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_ashrrev_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
@ -116,7 +116,7 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}add_u16_zext_i32:
; GCN: v_add_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_add_{{(nc_)*}}u16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
@ -126,7 +126,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}sub_u16_zext_i32:
; GCN: v_sub_{{(nc_)*}}u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_sub_{{(nc_)*}}u16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
@ -136,7 +136,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}mul_lo_u16_zext_i32:
; GCN: v_mul_lo_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_mul_lo_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
@ -146,7 +146,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}min_u16_zext_i32:
; GCN: v_min_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_min_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
@ -157,7 +157,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}min_i16_zext_i32:
; GCN: v_min_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_min_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
@ -168,7 +168,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}max_u16_zext_i32:
; GCN: v_max_u16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_max_u16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
@ -179,7 +179,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) {
}
; GCN-LABEL: {{^}}max_i16_zext_i32:
; GCN: v_max_i16_e{{32|64}} [[OP:v[0-9]+]],
; GCN: v_max_i16{{[_e32]*}} [[OP:v[0-9]+]],
; GFX10-NEXT: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[OP]]
; GCN-NEXT: s_setpc_b64
define i32 @max_i16_zext_i32(i16 %x, i16 %y) {

View File

@ -89,7 +89,7 @@ define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b
; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX10-NEXT: s_xor_b32 s2, s2, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v0
; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
@ -461,7 +461,7 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; GFX10-NEXT: global_load_dwordx2 v[9:10], v6, s[8:9]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_co_u32_e64 v7, vcc_lo, v9, v2
; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v9, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, v10, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[7:8], v[9:10]

View File

@ -36,10 +36,10 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -486,7 +486,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32_e64 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_bfrev_b32_e32 v6, -2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]

View File

@ -61,7 +61,7 @@ define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; GFX89: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u16_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa
define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 {
@ -268,7 +268,7 @@ entry:
; NOSDWA: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; NOSDWA-NOT: v_mul_u32_u24_sdwa
; GFX89: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u16_e64 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10: v_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; SDWA-NOT: v_mul_u32_u24_sdwa
define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 {
@ -296,12 +296,12 @@ entry:
; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v
; GFX10: v_lshlrev_b16 v{{[0-9]+}}, 8, v
; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 {
entry:
@ -331,10 +331,10 @@ entry:
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 {
entry:
@ -370,14 +370,14 @@ entry:
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX9-DAG: v_mul_lo_u16_sdwa
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16_e64
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
; GFX10-DAG: v_mul_lo_u16
define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 {
entry:

View File

@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}shl_base_atomicrmw_global_atomic_csub_ptr:
; GCN-DAG: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, 2, v[4:5]
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 43
; GCN: v_add_co_u32_e64 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4
; GCN: v_add_co_u32 v[[EXTRA_LO:[0-9]+]], vcc_lo, 0x80, v4
; GCN: v_add_co_ci_u32_e32 v[[EXTRA_HI:[0-9]+]], vcc_lo, 0, v5, vcc_lo
; GCN: global_atomic_csub v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]], off offset:512 glc
; GCN: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[EXTRA_LO]]:[[EXTRA_HI]]{{\]}}

View File

@ -716,7 +716,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrs
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -786,7 +786,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(i32 addrspace(1)* %ou
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@ -873,8 +873,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_store_short v0, v2, s[0:1]

View File

@ -36,10 +36,10 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshlrev_b16_e64 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16_e64 v0, 8, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
; GFX10-NEXT: v_ashrrev_i16_e64 v0, 8, v0
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -1100,7 +1100,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32_e64 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_bfrev_b32_e32 v6, -2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]

View File

@ -34,7 +34,7 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) {
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs)
ret i8 %result
@ -67,7 +67,7 @@ define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_nc_u16_e64 v0, v0, v1 clamp
; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs)
ret i16 %result
@ -577,7 +577,7 @@ define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_sub_co_u32_e64 v2, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2
; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo
; GFX10-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[0:1]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo

View File

@ -9,7 +9,7 @@ define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 {
; GCN-NEXT: BB0_1: ; %bb0
; GCN-NEXT: ; =>This Loop Header: Depth=1
; GCN-NEXT: ; Child Loop BB0_2 Depth 2
; GCN-NEXT: v_add_co_u32_e64 v6, vcc_lo, v0, 8
; GCN-NEXT: v_add_co_u32 v6, vcc_lo, v0, 8
; GCN-NEXT: s_mov_b32 s5, exec_lo
; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
; GCN-NEXT: s_clause 0x1

View File

@ -285,9 +285,9 @@ bb8:
}
; GCN-LABEL: {{^}}test_addc_vop2b:
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, s{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
define amdgpu_kernel void @test_addc_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
@ -300,9 +300,9 @@ bb:
}
; GCN-LABEL: {{^}}test_subbrev_vop2b:
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1032: v_sub_co_u32 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1032: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], v{{[0-9]+}}, s{{[0-9]+}}{{$}}
; GFX1064: v_subrev_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, {{[vs][0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subbrev_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
@ -315,9 +315,9 @@ bb:
}
; GCN-LABEL: {{^}}test_subb_vop2b:
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1032: v_sub_co_u32 v{{[0-9]+}}, [[A0:s[0-9]+|vcc_lo]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, [[A0:s\[[0-9:]+\]|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, [[A0]]{{$}}
define amdgpu_kernel void @test_subb_vop2b(i64 addrspace(1)* %arg, i64 %arg1) #0 {
bb:
@ -330,24 +330,24 @@ bb:
}
; GCN-LABEL: {{^}}test_udiv64:
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
; GFX1032: v_sub_co_u32_e64 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_sub_co_u32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
; GFX1032: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
; GFX1064: v_sub_co_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_sub_co_u32 v{{[0-9]+}}, s[{{[0-9:]+}}], s{{[0-9]+}}, v{{[0-9]+}}
; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
; GFX1064: v_sub_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}]
define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {

View File

@ -851,112 +851,112 @@ v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] mul:4
// GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18]
v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, v1, v2
// GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v255, v1, v2
// GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, v255, v2
// GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, s1, v2
// GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, s101, v2
// GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, vcc_lo, v2
// GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, vcc_hi, v2
// GFX90A: v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, m0, v2
// GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, exec_lo, v2
// GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, exec_hi, v2
// GFX90A: v_mul_legacy_f32_e64 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, 0, v2
// GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, -1, v2
// GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, 0.5, v2
// GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
v_mul_legacy_f32_e64 v5, -4.0, v2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
v_mul_legacy_f32_e64 v5, v1, v255
// GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, s2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, s101
// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, vcc_lo
// GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, vcc_hi
// GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, m0
// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, exec_lo
// GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
v_mul_legacy_f32_e64 v5, v1, exec_hi
// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
v_mul_legacy_f32_e64 v5, v1, 0
// GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
v_mul_legacy_f32_e64 v5, v1, -1
// GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
v_mul_legacy_f32_e64 v5, v1, 0.5
// GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
v_mul_legacy_f32_e64 v5, v1, -4.0
// GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
// GFX90A: v_mul_legacy_f32 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
v_mul_legacy_f32_e64 v5, -v1, v2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
// GFX90A: v_mul_legacy_f32 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
v_mul_legacy_f32_e64 v5, v1, -v2
// GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
// GFX90A: v_mul_legacy_f32 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
v_mul_legacy_f32_e64 v5, -v1, -v2
// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, |v1|, v2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, v1, |v2|
// GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, |v1|, |v2|
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
// GFX90A: v_mul_legacy_f32 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
v_mul_legacy_f32_e64 v5, v1, v2 clamp
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
// GFX90A: v_mul_legacy_f32 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
v_mul_legacy_f32_e64 v5, v1, v2 mul:2
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
// GFX90A: v_mul_legacy_f32 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
v_mul_legacy_f32_e64 v5, v1, v2 mul:4
// GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
// GFX90A: v_mul_legacy_f32 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
v_mul_legacy_f32_e64 v5, v1, v2 div:2
// GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff]

View File

@ -283,4 +283,4 @@ v_pk_add_u16 v5, v1, 123456.0
// FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
v_pk_fmac_f16 v5, 0x12345678, v2
// NOGFX9: error: instruction not supported on this GPU
// GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
// GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]

View File

@ -52,15 +52,15 @@ v_bfe_u32 v0, s1, 0x3039, s2
// GFX10-ERR: error: invalid operand (violates constant bus restrictions)
v_bfm_b32_e64 v0, 0x3039, s1
// GFX10: v_bfm_b32_e64 v0, 0x3039, s1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x00,0x00,0x39,0x30,0x00,0x00]
// GFX10: v_bfm_b32 v0, 0x3039, s1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x00,0x00,0x39,0x30,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_bfm_b32_e64 v0, 0x3039, v1
// GFX10: v_bfm_b32_e64 v0, 0x3039, v1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x02,0x00,0x39,0x30,0x00,0x00]
// GFX10: v_bfm_b32 v0, 0x3039, v1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x02,0x00,0x39,0x30,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_bfm_b32_e64 v0, 0x3039, 0x3039
// GFX10: v_bfm_b32_e64 v0, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0xfe,0x01,0x00,0x39,0x30,0x00,0x00]
// GFX10: v_bfm_b32 v0, 0x3039, 0x3039 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0xfe,0x01,0x00,0x39,0x30,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_bfm_b32_e64 v0, 0x3039, 0x3038
@ -160,15 +160,15 @@ v_add_f64 v[0:1], 1.23456, -abs(1.2345)
// GFX9-ERR: error: literal operands are not supported
v_max_i16_e64 v5, 0xfe0b, v2
// GFX10: v_max_i16_e64 v5, 0xfe0b, v2 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0x04,0x02,0x00,0x0b,0xfe,0x00,0x00]
// GFX10: v_max_i16 v5, 0xfe0b, v2 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0x04,0x02,0x00,0x0b,0xfe,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_max_i16_e64 v5, v1, 0x123
// GFX10: v_max_i16_e64 v5, v1, 0x123 ; encoding: [0x05,0x00,0x0a,0xd7,0x01,0xff,0x01,0x00,0x23,0x01,0x00,0x00]
// GFX10: v_max_i16 v5, v1, 0x123 ; encoding: [0x05,0x00,0x0a,0xd7,0x01,0xff,0x01,0x00,0x23,0x01,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_max_i16_e64 v5, 0x1234, 0x1234
// GFX10: v_max_i16_e64 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX10: v_max_i16 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x0a,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_min3_i16 v5, 0xfe0b, v2, v3
@ -196,19 +196,19 @@ v_min3_i16 v5, 0x5678, 0x5678, 0x5679
// GFX9-ERR: error: literal operands are not supported
v_add_nc_u16 v5, 0xfe0b, v2
// GFX10: v_add_nc_u16_e64 v5, 0xfe0b, v2 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0x04,0x02,0x00,0x0b,0xfe,0x00,0x00]
// GFX10: v_add_nc_u16 v5, 0xfe0b, v2 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0x04,0x02,0x00,0x0b,0xfe,0x00,0x00]
// GFX9-ERR: error: instruction not supported on this GPU
v_add_nc_u16 v5, v1, 0x1234
// GFX10: v_add_nc_u16_e64 v5, v1, 0x1234 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0xff,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX10: v_add_nc_u16 v5, v1, 0x1234 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0xff,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX9-ERR: error: instruction not supported on this GPU
v_add_nc_u16 v5, 0x1234, 0x1234
// GFX10: v_add_nc_u16_e64 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX10: v_add_nc_u16 v5, 0x1234, 0x1234 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfe,0x01,0x00,0x34,0x12,0x00,0x00]
// GFX9-ERR: error: instruction not supported on this GPU
v_ashrrev_i16_e64 v5, 0x3456, v2
// GFX10: v_ashrrev_i16_e64 v5, 0x3456, v2 ; encoding: [0x05,0x00,0x08,0xd7,0xff,0x04,0x02,0x00,0x56,0x34,0x00,0x00]
// GFX10: v_ashrrev_i16 v5, 0x3456, v2 ; encoding: [0x05,0x00,0x08,0xd7,0xff,0x04,0x02,0x00,0x56,0x34,0x00,0x00]
// GFX9-ERR: error: literal operands are not supported
v_mad_u16 v5, 0xfe0b, v2, v3

View File

@ -296,11 +296,11 @@ v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 ban
// GFX1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
v_add_co_u32 v0, s0, v0, v2
// GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction
v_add_co_u32_e64 v0, s0, v0, v2
// GFX1032: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
@ -308,11 +308,11 @@ v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
// GFX1064-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction
v_sub_co_u32 v0, s0, v0, v2
// GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:26: error: invalid operand for instruction
v_sub_co_u32_e64 v0, s0, v0, v2
// GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:30: error: invalid operand for instruction
v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
@ -320,11 +320,11 @@ v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
// GFX1064-ERR: :[[@LINE-2]]:25: error: invalid operand for instruction
v_subrev_co_u32 v0, s0, v0, v2
// GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:29: error: invalid operand for instruction
v_subrev_co_u32_e64 v0, s0, v0, v2
// GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1032: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1064-ERR: :[[@LINE-2]]:33: error: invalid operand for instruction
v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
@ -333,11 +333,11 @@ v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
v_add_co_u32 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction
// GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_u32_e64 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:22: error: invalid operand for instruction
// GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX1032-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction
@ -345,11 +345,11 @@ v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
v_sub_co_u32 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:18: error: invalid operand for instruction
// GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_u32_e64 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:22: error: invalid operand for instruction
// GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX1032-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction
@ -357,11 +357,11 @@ v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
v_subrev_co_u32 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:21: error: invalid operand for instruction
// GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_u32_e64 v0, s[0:1], v0, v2
// GFX1032-ERR: :[[@LINE-1]]:25: error: invalid operand for instruction
// GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX1064: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX1032-ERR: :[[@LINE-1]]:28: error: invalid operand for instruction

View File

@ -127,61 +127,61 @@ v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 ban
// GFX10: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
v_add_co_u32 v0, s0, v0, v2
// GFX10: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_u32_e64 v0, s0, v0, v2
// GFX10: v_add_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
// GFX10: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
v_sub_co_u32 v0, s0, v0, v2
// GFX10: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_u32_e64 v0, s0, v0, v2
// GFX10: v_sub_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_sub_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
// GFX10: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
v_subrev_co_u32 v0, s0, v0, v2
// GFX10: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_u32_e64 v0, s0, v0, v2
// GFX10: v_subrev_co_u32_e64 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_subrev_co_u32 v0, s0, v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2
// GFX10: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2 ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]
v_add_co_u32 v0, s[0:1], v0, v2
// GFX10: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_u32 v0, exec, v0, v2
// GFX10: v_add_co_u32_e64 v0, exec, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, exec, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_u32 v0, exec_lo, v0, v2
// GFX10: v_add_co_u32_e64 v0, exec_lo, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, exec_lo, v0, v2 ; encoding: [0x00,0x7e,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_u32_e64 v0, s[0:1], v0, v2
// GFX10: v_add_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_add_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00]
v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX10: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00]
v_sub_co_u32 v0, s[0:1], v0, v2
// GFX10: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_u32_e64 v0, s[0:1], v0, v2
// GFX10: v_sub_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_sub_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00]
v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX10: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00]
v_subrev_co_u32 v0, s[0:1], v0, v2
// GFX10: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_u32_e64 v0, s[0:1], v0, v2
// GFX10: v_subrev_co_u32_e64 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
// GFX10: v_subrev_co_u32 v0, s[0:1], v0, v2 ; encoding: [0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00]
v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
// GFX10: v_subrev_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3] ; encoding: [0x04,0x00,0x2a,0xd5,0x01,0x0b,0x0a,0x00]

File diff suppressed because it is too large Load Diff

View File

@ -644,112 +644,112 @@
# GFX90A: v_fmac_f64_e64 v[4:5], v[2:3], v[8:9] div:2 ; encoding: [0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18]
0x04,0x00,0x04,0xd1,0x02,0x11,0x02,0x18
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v255, v1, v2 ; encoding: [0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00]
0xff,0x00,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, v255, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00]
0x05,0x00,0xa1,0xd2,0xff,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, s1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x01,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, s101, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x65,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, vcc_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x6a,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, vcc_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x6b,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, m0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x7c,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, exec_lo, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x7e,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, exec_hi, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x7f,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, 0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0x80,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, -1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0xc1,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, 0.5, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0xf0,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, -4.0, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00]
0x05,0x00,0xa1,0xd2,0xf7,0x04,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, v255 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xff,0x03,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, s2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, s101 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xcb,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xd5,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xd7,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, m0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xf9,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, exec_lo ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xfd,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, exec_hi ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xff,0x00,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, 0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00]
0x05,0x00,0xa1,0xd2,0x01,0x01,0x01,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, -1 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00]
0x05,0x00,0xa1,0xd2,0x01,0x83,0x01,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, 0.5 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xe1,0x01,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, -4.0 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00]
0x05,0x00,0xa1,0xd2,0x01,0xef,0x01,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
# GFX90A: v_mul_legacy_f32 v5, -v1, v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x20
# GFX90A: v_mul_legacy_f32_e64 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
# GFX90A: v_mul_legacy_f32 v5, v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x40
# GFX90A: v_mul_legacy_f32_e64 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
# GFX90A: v_mul_legacy_f32 v5, -v1, -v2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x60
# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, |v1|, v2 ; encoding: [0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00]
0x05,0x01,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, |v2| ; encoding: [0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00]
0x05,0x02,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, |v1|, |v2| ; encoding: [0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00]
0x05,0x03,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
# GFX90A: v_mul_legacy_f32 v5, v1, v2 clamp ; encoding: [0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00]
0x05,0x80,0xa1,0xd2,0x01,0x05,0x02,0x00
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
# GFX90A: v_mul_legacy_f32 v5, v1, v2 mul:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x08
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
# GFX90A: v_mul_legacy_f32 v5, v1, v2 mul:4 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x10
# GFX90A: v_mul_legacy_f32_e64 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
# GFX90A: v_mul_legacy_f32 v5, v1, v2 div:2 ; encoding: [0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18]
0x05,0x00,0xa1,0xd2,0x01,0x05,0x02,0x18
# GFX90A: v_xor_b32_dpp v6, v29, v27 row_newbcast:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x36,0x0c,0x2a,0x1d,0x50,0x01,0xff]

View File

@ -145,5 +145,5 @@
#===----------------------------------------------------------------------===//
# FIXME: v_pk_fmac_f16 cannot be promoted to VOP3 so '_e32' suffix is not valid
# GFX10: v_pk_fmac_f16_e32 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
# GFX10: v_pk_fmac_f16 v5, 0x12345678, v2 ; encoding: [0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12]
0xff,0x04,0x0a,0x78,0x78,0x56,0x34,0x12

View File

@ -15,10 +15,10 @@
# GFX10: v_bfe_u32 v0, s1, 0x3039, s1 ; encoding: [0x00,0x00,0x48,0xd5,0x01,0xfe,0x05,0x00,0x39,0x30,0x00,0x00]
0x00,0x00,0x48,0xd5,0x01,0xfe,0x05,0x00,0x39,0x30,0x00,0x00
# GFX10: v_bfm_b32_e64 v0, 0x3039, s1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x00,0x00,0x39,0x30,0x00,0x00]
# GFX10: v_bfm_b32 v0, 0x3039, s1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x00,0x00,0x39,0x30,0x00,0x00]
0x00,0x00,0x63,0xd7,0xff,0x02,0x00,0x00,0x39,0x30,0x00,0x00
# GFX10: v_bfm_b32_e64 v0, 0x3039, v1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x02,0x00,0x39,0x30,0x00,0x00]
# GFX10: v_bfm_b32 v0, 0x3039, v1 ; encoding: [0x00,0x00,0x63,0xd7,0xff,0x02,0x02,0x00,0x39,0x30,0x00,0x00]
0x00,0x00,0x63,0xd7,0xff,0x02,0x02,0x00,0x39,0x30,0x00,0x00
# GFX10: v_pk_add_f16 v1, 0x4e40, v2 ; encoding: [0x01,0x40,0x0f,0xcc,0xff,0x04,0x02,0x18,0x40,0x4e,0x00,0x00]
@ -54,5 +54,5 @@
# GFX10: v_ceil_f16_e64 v255, 0xabcd clamp ; encoding: [0xff,0x80,0xdc,0xd5,0xff,0x00,0x00,0x00,0xcd,0xab,0xff,0xff]
0xff,0x80,0xdc,0xd5,0xff,0x00,0x00,0x00,0xcd,0xab,0xff,0xff
# GFX10: v_min_u16_e64 v5, v1, 0xabcd ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0xff,0x01,0x00,0xcd,0xab,0xff,0xff]
# GFX10: v_min_u16 v5, v1, 0xabcd ; encoding: [0x05,0x00,0x0b,0xd7,0x01,0xff,0x01,0x00,0xcd,0xab,0xff,0xff]
0x05,0x00,0x0b,0xd7,0x01,0xff,0x01,0x00,0xcd,0xab,0xff,0xff

View File

@ -45,20 +45,20 @@
# GFX1064: v_cndmask_b32_dpp v5, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x00
# GFX1032: v_add_co_u32_e64 v2, vcc_lo, s0, v2
# GFX1064: v_add_co_u32_e64 v2, vcc, s0, v2
# GFX1032: v_add_co_u32 v2, vcc_lo, s0, v2
# GFX1064: v_add_co_u32 v2, vcc, s0, v2
0x02,0x6a,0x0f,0xd7,0x00,0x04,0x02,0x00
# GFX1032: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
# GFX1064: v_add_co_ci_u32_e32 v3, vcc, v3, v4, vcc ;
0x03,0x09,0x06,0x50
# GFX1032: v_sub_co_u32_e64 v2, vcc_lo, s0, v2
# GFX1064: v_sub_co_u32_e64 v2, vcc, s0, v2
# GFX1032: v_sub_co_u32 v2, vcc_lo, s0, v2
# GFX1064: v_sub_co_u32 v2, vcc, s0, v2
0x02,0x6a,0x10,0xd7,0x00,0x04,0x02,0x00
# GFX1032: v_subrev_co_u32_e64 v2, vcc_lo, s0, v2
# GFX1064: v_subrev_co_u32_e64 v2, vcc, s0, v2
# GFX1032: v_subrev_co_u32 v2, vcc_lo, s0, v2
# GFX1064: v_subrev_co_u32 v2, vcc, s0, v2
0x02,0x6a,0x19,0xd7,0x00,0x04,0x02,0x00
# GFX1032: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v4, vcc_lo
@ -107,24 +107,24 @@
# gfx1064: v_subrev_co_ci_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
# 0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00
# GFX1032: v_add_co_u32_e64 v0, s0, v0, v2
# GFX1064: v_add_co_u32_e64 v0, s[0:1], v0, v2
# GFX1032: v_add_co_u32 v0, s0, v0, v2
# GFX1064: v_add_co_u32 v0, s[0:1], v0, v2
0x00,0x00,0x0f,0xd7,0x00,0x05,0x02,0x00
# GFX1032: v_add_co_ci_u32_e64 v4, s0, v1, v5, s2
# GFX1064: v_add_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
0x04,0x00,0x28,0xd5,0x01,0x0b,0x0a,0x00
# GFX1032: v_sub_co_u32_e64 v0, s0, v0, v2
# GFX1064: v_sub_co_u32_e64 v0, s[0:1], v0, v2
# GFX1032: v_sub_co_u32 v0, s0, v0, v2
# GFX1064: v_sub_co_u32 v0, s[0:1], v0, v2
0x00,0x00,0x10,0xd7,0x00,0x05,0x02,0x00
# GFX1032: v_sub_co_ci_u32_e64 v4, s0, v1, v5, s2
# GFX1064: v_sub_co_ci_u32_e64 v4, s[0:1], v1, v5, s[2:3]
0x04,0x00,0x29,0xd5,0x01,0x0b,0x0a,0x00
# GFX1032: v_subrev_co_u32_e64 v0, s0, v0, v2
# GFX1064: v_subrev_co_u32_e64 v0, s[0:1], v0, v2
# GFX1032: v_subrev_co_u32 v0, s0, v0, v2
# GFX1064: v_subrev_co_u32 v0, s[0:1], v0, v2
0x00,0x00,0x19,0xd7,0x00,0x05,0x02,0x00
# GFX1032: v_subrev_co_ci_u32_e64 v4, s0, v1, v5, s2