AMDGPU/GlobalISel: Pack constant G_BUILD_VECTOR_TRUNCs when selecting

This commit is contained in:
Matt Arsenault 2020-07-18 15:30:59 -04:00
parent 0481e1ae3c
commit 5819159995
12 changed files with 2079 additions and 2069 deletions

View File

@ -616,11 +616,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
/// Returns true when \p Reg is matched by m_ICst as an integer constant and
/// the matched value equals zero; returns false if the match fails.
static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
  int64_t MatchedCst;
  // Bail out first if the register is not matched as an integer constant.
  if (!mi_match(Reg, MRI, m_ICst(MatchedCst)))
    return false;
  return MatchedCst == 0;
}
bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
MachineInstr &MI) const {
if (selectImpl(MI, *CoverageInfo))
@ -645,6 +640,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock *BB = MI.getParent();
auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
if (ConstSrc1) {
auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
if (ConstSrc0) {
uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
.addImm(Lo16 | (Hi16 << 16));
MI.eraseFromParent();
return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
}
}
// TODO: This should probably be a combine somewhere
// (build_vector_trunc $src0, undef) -> copy $src0
MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
@ -686,7 +695,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
} else if (Shift1) {
Opc = AMDGPU::S_PACK_LH_B32_B16;
MI.getOperand(2).setReg(ShiftSrc1);
} else if (Shift0 && isZero(Src1, *MRI)) {
} else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
// build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
.addReg(ShiftSrc0)

View File

@ -91,9 +91,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0xffc0
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
; GFX9-NEXT: v_pk_add_u16 v0, v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
@ -113,8 +112,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0xffffffc0, 4
; GFX9-NEXT: v_pk_add_u16 v0, v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo:
@ -133,8 +132,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, 4, 0xffffffc0
; GFX9-NEXT: v_pk_add_u16 v0, v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi:
@ -152,13 +151,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s1, 0xffc0
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s1, s0, 16
; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0
; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s1, 0xffffffc0, 4
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s1, s0, 16
; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0
; GFX9-NEXT: s_add_i32 s1, s1, 4
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
@ -210,12 +204,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s1, 4, 0xffffffc0
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_add_i32 s0, s0, s1
; GFX9-NEXT: s_add_i32 s2, s2, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s1, s0, 16
; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004
; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:

View File

@ -522,8 +522,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
; GFX9-LABEL: v_ashr_v2i16_15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15
; GFX9-NEXT: v_pk_ashrrev_i16 v0, s4, v0
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = ashr <2 x i16> %value, <i16 15, i16 15>
ret <2 x i16> %result

View File

@ -430,3 +430,273 @@ body: |
%5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4
S_ENDPGM 0, implicit %5
...
---
name: test_build_vector_trunc_s_v2s16_constant_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539
; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
%0:sgpr(s32) = G_CONSTANT i32 123
%1:sgpr(s32) = G_CONSTANT i32 456
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_constant_impdef
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
%0:sgpr(s32) = G_CONSTANT i32 123
%1:sgpr(s32) = G_IMPLICIT_DEF
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_impdef_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant
; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = G_IMPLICIT_DEF
%1:sgpr(s32) = G_CONSTANT i32 123
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_impdef_impdef
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef
; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX9: S_ENDPGM 0, implicit [[DEF]]
%0:sgpr(s32) = G_IMPLICIT_DEF
%1:sgpr(s32) = G_IMPLICIT_DEF
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539
; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
%0:sgpr(s16) = G_CONSTANT i16 123
%1:sgpr(s16) = G_CONSTANT i16 456
%2:sgpr(s32) = G_ZEXT %0
%3:sgpr(s32) = G_ZEXT %1
%4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
S_ENDPGM 0, implicit %4
...
---
name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant
; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc
; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s16) = G_IMPLICIT_DEF
%1:sgpr(s16) = G_CONSTANT i16 123
%2:sgpr(s32) = G_ZEXT %0
%3:sgpr(s32) = G_ZEXT %1
%4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
S_ENDPGM 0, implicit %4
...
---
name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208
; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]]
%0:sgpr(s16) = G_CONSTANT i16 -16
%1:sgpr(s16) = G_CONSTANT i16 -3
%2:sgpr(s32) = G_SEXT %0
%3:sgpr(s32) = G_SEXT %1
%4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
S_ENDPGM 0, implicit %4
...
---
name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s16) = G_CONSTANT i16 123
%1:sgpr(s16) = G_CONSTANT i16 456
%2:sgpr(s32) = G_ANYEXT %0
%3:sgpr(s32) = G_ANYEXT %1
%4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
S_ENDPGM 0, implicit %4
...
---
name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant
; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s16) = G_IMPLICIT_DEF
%1:sgpr(s16) = G_CONSTANT i16 123
%2:sgpr(s32) = G_ANYEXT %0
%3:sgpr(s32) = G_ANYEXT %1
%4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3
S_ENDPGM 0, implicit %4
...
---
name: test_build_vector_trunc_s_v2s16_var_constant
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant
; GFX9: liveins: $sgpr0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = G_CONSTANT i32 456
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_constant_var
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var
; GFX9: liveins: $sgpr0
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = G_CONSTANT i32 456
%1:sgpr(s32) = COPY $sgpr0
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_var_0
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0
; GFX9: liveins: $sgpr0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = COPY $sgpr0
%1:sgpr(s32) = G_CONSTANT i32 0
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...
---
name: test_build_vector_trunc_s_v2s16_0_var
legalized: true
regBankSelected: true
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0
; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var
; GFX9: liveins: $sgpr0
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]]
; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]]
%0:sgpr(s32) = G_CONSTANT i32 0
%1:sgpr(s32) = COPY $sgpr0
%2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1
S_ENDPGM 0, implicit %2
...

View File

@ -82,24 +82,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
@ -109,24 +106,21 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX906-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
@ -136,29 +130,21 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
@ -168,29 +154,21 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
ret i32 %r

View File

@ -82,24 +82,21 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
ret i32 %r
@ -109,24 +106,21 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
@ -136,29 +130,21 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
ret i32 %r
@ -168,29 +154,21 @@ define i32 @v_udot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8
; GFX10-NEXT: s_setpc_b64 s[30:31]
%r = call i32 @llvm.amdgcn.udot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
ret i32 %r

View File

@ -533,8 +533,7 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX9-LABEL: v_lshr_v2i16_15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15
; GFX9-NEXT: v_pk_lshrrev_b16 v0, s4, v0
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = lshr <2 x i16> %value, <i16 15, i16 15>
ret <2 x i16> %result

File diff suppressed because it is too large Load Diff

View File

@ -529,8 +529,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
; GFX9-LABEL: v_shl_v2i16_15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s4, v0
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
%result = shl <2 x i16> %value, <i16 15, i16 15>
ret <2 x i16> %result

File diff suppressed because it is too large Load Diff

View File

@ -2371,8 +2371,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX9-LABEL: v_uaddsat_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v0
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX9-NEXT: v_pk_min_u16 v1, v2, v1
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
@ -2381,9 +2380,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v2, s4, v0
; GFX10-NEXT: v_pk_min_u16 v1, v2, v1
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -2439,8 +2437,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX9-LABEL: s_uaddsat_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s2, -1, -1
; GFX9-NEXT: s_xor_b32 s2, s0, s2
; GFX9-NEXT: s_xor_b32 s2, s0, -1
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
@ -2460,15 +2457,14 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
;
; GFX10-LABEL: s_uaddsat_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s2, -1, -1
; GFX10-NEXT: s_xor_b32 s2, s0, -1
; GFX10-NEXT: s_mov_b32 s3, 0xffff
; GFX10-NEXT: s_xor_b32 s2, s0, s2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_and_b32 s2, s2, s3
; GFX10-NEXT: s_and_b32 s3, s1, s3
; GFX10-NEXT: s_lshr_b32 s1, s1, 16
; GFX10-NEXT: s_cmp_lt_u32 s2, s3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_cselect_b32 s2, s2, s3
; GFX10-NEXT: s_cmp_lt_u32 s4, s1
; GFX10-NEXT: s_cselect_b32 s1, s4, s1
@ -2522,17 +2518,15 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
;
; GFX9-LABEL: uaddsat_v2i16_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1
; GFX9-NEXT: s_xor_b32 s1, s0, s1
; GFX9-NEXT: s_xor_b32 s1, s0, -1
; GFX9-NEXT: v_pk_min_u16 v0, s1, v0
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: uaddsat_v2i16_sv:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1
; GFX10-NEXT: s_xor_b32 s1, s0, -1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_xor_b32 s1, s0, s1
; GFX10-NEXT: v_pk_min_u16 v0, s1, v0
; GFX10-NEXT: v_pk_add_u16 v0, s0, v0
; GFX10-NEXT: ; return to shader part epilog
@ -2578,17 +2572,15 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
;
; GFX9-LABEL: uaddsat_v2i16_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1
; GFX9-NEXT: v_xor_b32_e32 v1, s1, v0
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX9-NEXT: v_pk_min_u16 v1, v1, s0
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: uaddsat_v2i16_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1
; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v0
; GFX10-NEXT: v_pk_min_u16 v1, v1, s0
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
@ -2671,11 +2663,10 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX9-LABEL: v_uaddsat_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0
; GFX9-NEXT: v_pk_min_u16 v2, v4, v2
; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1
; GFX9-NEXT: v_pk_min_u16 v2, v2, v3
; GFX9-NEXT: v_pk_add_u16 v1, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
@ -2684,10 +2675,9 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0
; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v4, s4, v0
; GFX10-NEXT: v_xor_b32_e32 v5, s4, v1
; GFX10-NEXT: v_pk_min_u16 v2, v4, v2
; GFX10-NEXT: v_pk_min_u16 v3, v5, v3
; GFX10-NEXT: v_pk_add_u16 v0, v0, v2
@ -2782,28 +2772,27 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX9-LABEL: s_uaddsat_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX9-NEXT: s_xor_b32 s5, s0, s4
; GFX9-NEXT: s_mov_b32 s7, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s5, 16
; GFX9-NEXT: s_lshr_b32 s8, s2, 16
; GFX9-NEXT: s_and_b32 s5, s5, s7
; GFX9-NEXT: s_and_b32 s2, s2, s7
; GFX9-NEXT: s_cmp_lt_u32 s5, s2
; GFX9-NEXT: s_cselect_b32 s2, s5, s2
; GFX9-NEXT: s_cmp_lt_u32 s6, s8
; GFX9-NEXT: s_cselect_b32 s5, s6, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s0, 16
; GFX9-NEXT: s_lshr_b32 s6, s2, 16
; GFX9-NEXT: s_xor_b32 s4, s0, -1
; GFX9-NEXT: s_mov_b32 s6, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: s_lshr_b32 s7, s2, 16
; GFX9-NEXT: s_and_b32 s4, s4, s6
; GFX9-NEXT: s_and_b32 s2, s2, s6
; GFX9-NEXT: s_cmp_lt_u32 s4, s2
; GFX9-NEXT: s_cselect_b32 s2, s4, s2
; GFX9-NEXT: s_cmp_lt_u32 s5, s7
; GFX9-NEXT: s_cselect_b32 s4, s5, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_xor_b32 s2, s1, s4
; GFX9-NEXT: s_add_i32 s5, s5, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_add_i32 s4, s4, s5
; GFX9-NEXT: s_xor_b32 s2, s1, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_lshr_b32 s5, s3, 16
; GFX9-NEXT: s_and_b32 s2, s2, s7
; GFX9-NEXT: s_and_b32 s3, s3, s7
; GFX9-NEXT: s_and_b32 s2, s2, s6
; GFX9-NEXT: s_and_b32 s3, s3, s6
; GFX9-NEXT: s_cmp_lt_u32 s2, s3
; GFX9-NEXT: s_cselect_b32 s2, s2, s3
; GFX9-NEXT: s_cmp_lt_u32 s4, s5
@ -2818,38 +2807,37 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
;
; GFX10-LABEL: s_uaddsat_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX10-NEXT: s_mov_b32 s6, 0xffff
; GFX10-NEXT: s_xor_b32 s5, s0, s4
; GFX10-NEXT: s_and_b32 s8, s2, s6
; GFX10-NEXT: s_lshr_b32 s7, s5, 16
; GFX10-NEXT: s_and_b32 s5, s5, s6
; GFX10-NEXT: s_xor_b32 s4, s0, -1
; GFX10-NEXT: s_mov_b32 s5, 0xffff
; GFX10-NEXT: s_lshr_b32 s6, s4, 16
; GFX10-NEXT: s_and_b32 s7, s2, s5
; GFX10-NEXT: s_and_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-NEXT: s_cmp_lt_u32 s5, s8
; GFX10-NEXT: s_cmp_lt_u32 s4, s7
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_cselect_b32 s5, s5, s8
; GFX10-NEXT: s_cmp_lt_u32 s7, s2
; GFX10-NEXT: s_cselect_b32 s2, s7, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s2
; GFX10-NEXT: s_lshr_b32 s5, s0, 16
; GFX10-NEXT: s_lshr_b32 s7, s2, 16
; GFX10-NEXT: s_cselect_b32 s4, s4, s7
; GFX10-NEXT: s_cmp_lt_u32 s6, s2
; GFX10-NEXT: s_cselect_b32 s2, s6, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-NEXT: s_add_i32 s0, s0, s2
; GFX10-NEXT: s_xor_b32 s2, s1, s4
; GFX10-NEXT: s_add_i32 s5, s5, s7
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_and_b32 s2, s2, s6
; GFX10-NEXT: s_and_b32 s6, s3, s6
; GFX10-NEXT: s_xor_b32 s2, s1, -1
; GFX10-NEXT: s_add_i32 s4, s4, s6
; GFX10-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-NEXT: s_and_b32 s2, s2, s5
; GFX10-NEXT: s_and_b32 s5, s3, s5
; GFX10-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-NEXT: s_cmp_lt_u32 s2, s6
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX10-NEXT: s_cselect_b32 s2, s2, s6
; GFX10-NEXT: s_cmp_lt_u32 s4, s3
; GFX10-NEXT: s_cselect_b32 s3, s4, s3
; GFX10-NEXT: s_cmp_lt_u32 s2, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_cselect_b32 s2, s2, s5
; GFX10-NEXT: s_cmp_lt_u32 s6, s3
; GFX10-NEXT: s_cselect_b32 s3, s6, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-NEXT: s_add_i32 s1, s1, s2
; GFX10-NEXT: s_add_i32 s3, s3, s4
; GFX10-NEXT: s_add_i32 s3, s3, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
%result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
@ -2955,14 +2943,13 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
; GFX9-LABEL: v_uaddsat_v6i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX9-NEXT: v_xor_b32_e32 v6, s4, v0
; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0
; GFX9-NEXT: v_pk_min_u16 v3, v6, v3
; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
; GFX9-NEXT: v_xor_b32_e32 v3, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1
; GFX9-NEXT: v_pk_min_u16 v3, v3, v4
; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
; GFX9-NEXT: v_xor_b32_e32 v3, s4, v2
; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX9-NEXT: v_pk_min_u16 v3, v3, v5
; GFX9-NEXT: v_pk_add_u16 v2, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
@ -2971,11 +2958,10 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v6, s4, v0
; GFX10-NEXT: v_xor_b32_e32 v7, s4, v1
; GFX10-NEXT: v_xor_b32_e32 v8, s4, v2
; GFX10-NEXT: v_pk_min_u16 v3, v6, v3
; GFX10-NEXT: v_pk_min_u16 v4, v7, v4
; GFX10-NEXT: v_pk_min_u16 v5, v8, v5
@ -3108,43 +3094,42 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX9-LABEL: s_uaddsat_v6i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1
; GFX9-NEXT: s_xor_b32 s7, s0, s6
; GFX9-NEXT: s_mov_b32 s9, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s7, 16
; GFX9-NEXT: s_lshr_b32 s10, s3, 16
; GFX9-NEXT: s_and_b32 s7, s7, s9
; GFX9-NEXT: s_and_b32 s3, s3, s9
; GFX9-NEXT: s_cmp_lt_u32 s7, s3
; GFX9-NEXT: s_cselect_b32 s3, s7, s3
; GFX9-NEXT: s_cmp_lt_u32 s8, s10
; GFX9-NEXT: s_cselect_b32 s7, s8, s10
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s7
; GFX9-NEXT: s_lshr_b32 s7, s0, 16
; GFX9-NEXT: s_lshr_b32 s8, s3, 16
; GFX9-NEXT: s_add_i32 s0, s0, s3
; GFX9-NEXT: s_add_i32 s7, s7, s8
; GFX9-NEXT: s_xor_b32 s3, s1, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7
; GFX9-NEXT: s_xor_b32 s6, s0, -1
; GFX9-NEXT: s_mov_b32 s8, 0xffff
; GFX9-NEXT: s_lshr_b32 s7, s6, 16
; GFX9-NEXT: s_lshr_b32 s9, s3, 16
; GFX9-NEXT: s_and_b32 s6, s6, s8
; GFX9-NEXT: s_and_b32 s3, s3, s8
; GFX9-NEXT: s_cmp_lt_u32 s6, s3
; GFX9-NEXT: s_cselect_b32 s3, s6, s3
; GFX9-NEXT: s_cmp_lt_u32 s7, s9
; GFX9-NEXT: s_cselect_b32 s6, s7, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s3, 16
; GFX9-NEXT: s_lshr_b32 s8, s4, 16
; GFX9-NEXT: s_and_b32 s3, s3, s9
; GFX9-NEXT: s_and_b32 s4, s4, s9
; GFX9-NEXT: s_add_i32 s0, s0, s3
; GFX9-NEXT: s_add_i32 s6, s6, s7
; GFX9-NEXT: s_xor_b32 s3, s1, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_lshr_b32 s7, s4, 16
; GFX9-NEXT: s_and_b32 s3, s3, s8
; GFX9-NEXT: s_and_b32 s4, s4, s8
; GFX9-NEXT: s_cmp_lt_u32 s3, s4
; GFX9-NEXT: s_cselect_b32 s3, s3, s4
; GFX9-NEXT: s_cmp_lt_u32 s7, s8
; GFX9-NEXT: s_cselect_b32 s4, s7, s8
; GFX9-NEXT: s_cmp_lt_u32 s6, s7
; GFX9-NEXT: s_cselect_b32 s4, s6, s7
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s7, s3, 16
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s4, s4, s7
; GFX9-NEXT: s_xor_b32 s3, s2, s6
; GFX9-NEXT: s_add_i32 s4, s4, s6
; GFX9-NEXT: s_xor_b32 s3, s2, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshr_b32 s6, s5, 16
; GFX9-NEXT: s_and_b32 s3, s3, s9
; GFX9-NEXT: s_and_b32 s5, s5, s9
; GFX9-NEXT: s_and_b32 s3, s3, s8
; GFX9-NEXT: s_and_b32 s5, s5, s8
; GFX9-NEXT: s_cmp_lt_u32 s3, s5
; GFX9-NEXT: s_cselect_b32 s3, s3, s5
; GFX9-NEXT: s_cmp_lt_u32 s4, s6
@ -3159,48 +3144,47 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
;
; GFX10-LABEL: s_uaddsat_v6i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1
; GFX10-NEXT: s_mov_b32 s8, 0xffff
; GFX10-NEXT: s_xor_b32 s7, s0, s6
; GFX10-NEXT: s_and_b32 s10, s3, s8
; GFX10-NEXT: s_lshr_b32 s9, s7, 16
; GFX10-NEXT: s_and_b32 s7, s7, s8
; GFX10-NEXT: s_xor_b32 s6, s0, -1
; GFX10-NEXT: s_mov_b32 s7, 0xffff
; GFX10-NEXT: s_lshr_b32 s8, s6, 16
; GFX10-NEXT: s_and_b32 s9, s3, s7
; GFX10-NEXT: s_and_b32 s6, s6, s7
; GFX10-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-NEXT: s_cmp_lt_u32 s7, s10
; GFX10-NEXT: s_cmp_lt_u32 s6, s9
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_cselect_b32 s7, s7, s10
; GFX10-NEXT: s_cmp_lt_u32 s9, s3
; GFX10-NEXT: s_cselect_b32 s3, s9, s3
; GFX10-NEXT: s_and_b32 s10, s4, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s3
; GFX10-NEXT: s_lshr_b32 s7, s0, 16
; GFX10-NEXT: s_lshr_b32 s9, s3, 16
; GFX10-NEXT: s_cselect_b32 s6, s6, s9
; GFX10-NEXT: s_cmp_lt_u32 s8, s3
; GFX10-NEXT: s_cselect_b32 s3, s8, s3
; GFX10-NEXT: s_and_b32 s9, s4, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s3
; GFX10-NEXT: s_lshr_b32 s6, s0, 16
; GFX10-NEXT: s_lshr_b32 s8, s3, 16
; GFX10-NEXT: s_add_i32 s0, s0, s3
; GFX10-NEXT: s_xor_b32 s3, s1, s6
; GFX10-NEXT: s_add_i32 s7, s7, s9
; GFX10-NEXT: s_lshr_b32 s9, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s8
; GFX10-NEXT: s_xor_b32 s3, s1, -1
; GFX10-NEXT: s_add_i32 s6, s6, s8
; GFX10-NEXT: s_lshr_b32 s8, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s7
; GFX10-NEXT: s_lshr_b32 s4, s4, 16
; GFX10-NEXT: s_cmp_lt_u32 s3, s10
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7
; GFX10-NEXT: s_cselect_b32 s3, s3, s10
; GFX10-NEXT: s_cmp_lt_u32 s9, s4
; GFX10-NEXT: s_cselect_b32 s4, s9, s4
; GFX10-NEXT: s_cmp_lt_u32 s3, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX10-NEXT: s_cselect_b32 s3, s3, s9
; GFX10-NEXT: s_cmp_lt_u32 s8, s4
; GFX10-NEXT: s_cselect_b32 s4, s8, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshr_b32 s9, s3, 16
; GFX10-NEXT: s_lshr_b32 s8, s3, 16
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_xor_b32 s3, s2, s6
; GFX10-NEXT: s_add_i32 s4, s4, s9
; GFX10-NEXT: s_lshr_b32 s6, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s8
; GFX10-NEXT: s_and_b32 s8, s5, s8
; GFX10-NEXT: s_xor_b32 s3, s2, -1
; GFX10-NEXT: s_add_i32 s4, s4, s8
; GFX10-NEXT: s_lshr_b32 s8, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s7
; GFX10-NEXT: s_and_b32 s7, s5, s7
; GFX10-NEXT: s_lshr_b32 s5, s5, 16
; GFX10-NEXT: s_cmp_lt_u32 s3, s8
; GFX10-NEXT: s_cmp_lt_u32 s3, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX10-NEXT: s_cselect_b32 s3, s3, s8
; GFX10-NEXT: s_cmp_lt_u32 s6, s5
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
; GFX10-NEXT: s_cselect_b32 s3, s3, s7
; GFX10-NEXT: s_cmp_lt_u32 s8, s5
; GFX10-NEXT: s_cselect_b32 s5, s8, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX10-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-NEXT: s_lshr_b32 s6, s3, 16
@ -3324,17 +3308,16 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; GFX9-LABEL: v_uaddsat_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0
; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0
; GFX9-NEXT: v_pk_min_u16 v4, v8, v4
; GFX9-NEXT: v_pk_add_u16 v0, v0, v4
; GFX9-NEXT: v_xor_b32_e32 v4, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1
; GFX9-NEXT: v_pk_min_u16 v4, v4, v5
; GFX9-NEXT: v_pk_add_u16 v1, v1, v4
; GFX9-NEXT: v_xor_b32_e32 v4, s4, v2
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2
; GFX9-NEXT: v_pk_min_u16 v4, v4, v6
; GFX9-NEXT: v_pk_add_u16 v2, v2, v4
; GFX9-NEXT: v_xor_b32_e32 v4, s4, v3
; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3
; GFX9-NEXT: v_pk_min_u16 v4, v4, v7
; GFX9-NEXT: v_pk_add_u16 v3, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
@ -3343,12 +3326,11 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0
; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1
; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2
; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v15, s4, v0
; GFX10-NEXT: v_xor_b32_e32 v19, s4, v1
; GFX10-NEXT: v_xor_b32_e32 v23, s4, v2
; GFX10-NEXT: v_xor_b32_e32 v10, s4, v3
; GFX10-NEXT: v_pk_min_u16 v11, v15, v4
; GFX10-NEXT: v_pk_min_u16 v15, v19, v5
; GFX10-NEXT: v_pk_min_u16 v19, v23, v6
@ -3519,58 +3501,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX9-LABEL: s_uaddsat_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1
; GFX9-NEXT: s_xor_b32 s9, s0, s8
; GFX9-NEXT: s_mov_b32 s11, 0xffff
; GFX9-NEXT: s_lshr_b32 s10, s9, 16
; GFX9-NEXT: s_lshr_b32 s12, s4, 16
; GFX9-NEXT: s_and_b32 s9, s9, s11
; GFX9-NEXT: s_and_b32 s4, s4, s11
; GFX9-NEXT: s_cmp_lt_u32 s9, s4
; GFX9-NEXT: s_cselect_b32 s4, s9, s4
; GFX9-NEXT: s_cmp_lt_u32 s10, s12
; GFX9-NEXT: s_cselect_b32 s9, s10, s12
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9
; GFX9-NEXT: s_lshr_b32 s9, s0, 16
; GFX9-NEXT: s_lshr_b32 s10, s4, 16
; GFX9-NEXT: s_add_i32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s9, s9, s10
; GFX9-NEXT: s_xor_b32 s4, s1, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX9-NEXT: s_xor_b32 s8, s0, -1
; GFX9-NEXT: s_mov_b32 s10, 0xffff
; GFX9-NEXT: s_lshr_b32 s9, s8, 16
; GFX9-NEXT: s_lshr_b32 s11, s4, 16
; GFX9-NEXT: s_and_b32 s8, s8, s10
; GFX9-NEXT: s_and_b32 s4, s4, s10
; GFX9-NEXT: s_cmp_lt_u32 s8, s4
; GFX9-NEXT: s_cselect_b32 s4, s8, s4
; GFX9-NEXT: s_cmp_lt_u32 s9, s11
; GFX9-NEXT: s_cselect_b32 s8, s9, s11
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
; GFX9-NEXT: s_lshr_b32 s10, s5, 16
; GFX9-NEXT: s_and_b32 s4, s4, s11
; GFX9-NEXT: s_and_b32 s5, s5, s11
; GFX9-NEXT: s_add_i32 s0, s0, s4
; GFX9-NEXT: s_add_i32 s8, s8, s9
; GFX9-NEXT: s_xor_b32 s4, s1, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX9-NEXT: s_lshr_b32 s8, s4, 16
; GFX9-NEXT: s_lshr_b32 s9, s5, 16
; GFX9-NEXT: s_and_b32 s4, s4, s10
; GFX9-NEXT: s_and_b32 s5, s5, s10
; GFX9-NEXT: s_cmp_lt_u32 s4, s5
; GFX9-NEXT: s_cselect_b32 s4, s4, s5
; GFX9-NEXT: s_cmp_lt_u32 s9, s10
; GFX9-NEXT: s_cselect_b32 s5, s9, s10
; GFX9-NEXT: s_cmp_lt_u32 s8, s9
; GFX9-NEXT: s_cselect_b32 s5, s8, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-NEXT: s_lshr_b32 s5, s1, 16
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
; GFX9-NEXT: s_lshr_b32 s8, s4, 16
; GFX9-NEXT: s_add_i32 s1, s1, s4
; GFX9-NEXT: s_add_i32 s5, s5, s9
; GFX9-NEXT: s_xor_b32 s4, s2, s8
; GFX9-NEXT: s_add_i32 s5, s5, s8
; GFX9-NEXT: s_xor_b32 s4, s2, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: s_lshr_b32 s9, s6, 16
; GFX9-NEXT: s_and_b32 s4, s4, s11
; GFX9-NEXT: s_and_b32 s6, s6, s11
; GFX9-NEXT: s_lshr_b32 s8, s6, 16
; GFX9-NEXT: s_and_b32 s4, s4, s10
; GFX9-NEXT: s_and_b32 s6, s6, s10
; GFX9-NEXT: s_cmp_lt_u32 s4, s6
; GFX9-NEXT: s_cselect_b32 s4, s4, s6
; GFX9-NEXT: s_cmp_lt_u32 s5, s9
; GFX9-NEXT: s_cselect_b32 s5, s5, s9
; GFX9-NEXT: s_cmp_lt_u32 s5, s8
; GFX9-NEXT: s_cselect_b32 s5, s5, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshr_b32 s6, s4, 16
; GFX9-NEXT: s_add_i32 s2, s2, s4
; GFX9-NEXT: s_add_i32 s5, s5, s6
; GFX9-NEXT: s_xor_b32 s4, s3, s8
; GFX9-NEXT: s_xor_b32 s4, s3, -1
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s5, s4, 16
; GFX9-NEXT: s_lshr_b32 s6, s7, 16
; GFX9-NEXT: s_and_b32 s4, s4, s11
; GFX9-NEXT: s_and_b32 s7, s7, s11
; GFX9-NEXT: s_and_b32 s4, s4, s10
; GFX9-NEXT: s_and_b32 s7, s7, s10
; GFX9-NEXT: s_cmp_lt_u32 s4, s7
; GFX9-NEXT: s_cselect_b32 s4, s4, s7
; GFX9-NEXT: s_cmp_lt_u32 s5, s6
@ -3585,63 +3566,62 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
;
; GFX10-LABEL: s_uaddsat_v8i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1
; GFX10-NEXT: s_mov_b32 s10, 0xffff
; GFX10-NEXT: s_xor_b32 s9, s0, s8
; GFX10-NEXT: s_and_b32 s12, s4, s10
; GFX10-NEXT: s_lshr_b32 s11, s9, 16
; GFX10-NEXT: s_and_b32 s9, s9, s10
; GFX10-NEXT: s_xor_b32 s8, s0, -1
; GFX10-NEXT: s_mov_b32 s9, 0xffff
; GFX10-NEXT: s_lshr_b32 s10, s8, 16
; GFX10-NEXT: s_and_b32 s11, s4, s9
; GFX10-NEXT: s_and_b32 s8, s8, s9
; GFX10-NEXT: s_lshr_b32 s4, s4, 16
; GFX10-NEXT: s_cmp_lt_u32 s9, s12
; GFX10-NEXT: s_cmp_lt_u32 s8, s11
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_cselect_b32 s9, s9, s12
; GFX10-NEXT: s_cmp_lt_u32 s11, s4
; GFX10-NEXT: s_cselect_b32 s4, s11, s4
; GFX10-NEXT: s_and_b32 s12, s5, s10
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4
; GFX10-NEXT: s_lshr_b32 s9, s0, 16
; GFX10-NEXT: s_lshr_b32 s11, s4, 16
; GFX10-NEXT: s_cselect_b32 s8, s8, s11
; GFX10-NEXT: s_cmp_lt_u32 s10, s4
; GFX10-NEXT: s_cselect_b32 s4, s10, s4
; GFX10-NEXT: s_and_b32 s11, s5, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_add_i32 s0, s0, s4
; GFX10-NEXT: s_xor_b32 s4, s1, s8
; GFX10-NEXT: s_add_i32 s9, s9, s11
; GFX10-NEXT: s_lshr_b32 s11, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s10
; GFX10-NEXT: s_xor_b32 s4, s1, -1
; GFX10-NEXT: s_add_i32 s8, s8, s10
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s9
; GFX10-NEXT: s_lshr_b32 s5, s5, 16
; GFX10-NEXT: s_cmp_lt_u32 s4, s12
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9
; GFX10-NEXT: s_cselect_b32 s4, s4, s12
; GFX10-NEXT: s_cmp_lt_u32 s11, s5
; GFX10-NEXT: s_cselect_b32 s5, s11, s5
; GFX10-NEXT: s_and_b32 s12, s6, s10
; GFX10-NEXT: s_cmp_lt_u32 s4, s11
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8
; GFX10-NEXT: s_cselect_b32 s4, s4, s11
; GFX10-NEXT: s_cmp_lt_u32 s10, s5
; GFX10-NEXT: s_cselect_b32 s5, s10, s5
; GFX10-NEXT: s_and_b32 s11, s6, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshr_b32 s11, s4, 16
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: s_xor_b32 s4, s2, s8
; GFX10-NEXT: s_add_i32 s5, s5, s11
; GFX10-NEXT: s_lshr_b32 s11, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s10
; GFX10-NEXT: s_xor_b32 s4, s2, -1
; GFX10-NEXT: s_add_i32 s5, s5, s10
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s9
; GFX10-NEXT: s_lshr_b32 s6, s6, 16
; GFX10-NEXT: s_cmp_lt_u32 s4, s12
; GFX10-NEXT: s_cmp_lt_u32 s4, s11
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_cselect_b32 s4, s4, s12
; GFX10-NEXT: s_cmp_lt_u32 s11, s6
; GFX10-NEXT: s_cselect_b32 s6, s11, s6
; GFX10-NEXT: s_cselect_b32 s4, s4, s11
; GFX10-NEXT: s_cmp_lt_u32 s10, s6
; GFX10-NEXT: s_cselect_b32 s6, s10, s6
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6
; GFX10-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-NEXT: s_lshr_b32 s11, s4, 16
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_add_i32 s2, s2, s4
; GFX10-NEXT: s_xor_b32 s4, s3, s8
; GFX10-NEXT: s_add_i32 s6, s6, s11
; GFX10-NEXT: s_lshr_b32 s8, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s10
; GFX10-NEXT: s_and_b32 s10, s7, s10
; GFX10-NEXT: s_xor_b32 s4, s3, -1
; GFX10-NEXT: s_add_i32 s6, s6, s10
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_and_b32 s4, s4, s9
; GFX10-NEXT: s_and_b32 s9, s7, s9
; GFX10-NEXT: s_lshr_b32 s7, s7, 16
; GFX10-NEXT: s_cmp_lt_u32 s4, s10
; GFX10-NEXT: s_cmp_lt_u32 s4, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6
; GFX10-NEXT: s_cselect_b32 s4, s4, s10
; GFX10-NEXT: s_cmp_lt_u32 s8, s7
; GFX10-NEXT: s_cselect_b32 s7, s8, s7
; GFX10-NEXT: s_cselect_b32 s4, s4, s9
; GFX10-NEXT: s_cmp_lt_u32 s10, s7
; GFX10-NEXT: s_cselect_b32 s7, s10, s7
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7
; GFX10-NEXT: s_lshr_b32 s7, s4, 16

View File

@ -50,16 +50,14 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
;
; GFX900-LABEL: scalar_xnor_v2i16_one_use:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1
; GFX900-NEXT: s_xor_b32 s0, s0, s1
; GFX900-NEXT: s_xor_b32 s0, s0, s2
; GFX900-NEXT: s_xor_b32 s0, s0, -1
; GFX900-NEXT: ; return to shader part epilog
;
; GFX906-LABEL: scalar_xnor_v2i16_one_use:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1
; GFX906-NEXT: s_xor_b32 s0, s0, s1
; GFX906-NEXT: s_xor_b32 s0, s0, s2
; GFX906-NEXT: s_xor_b32 s0, s0, -1
; GFX906-NEXT: ; return to shader part epilog
entry:
%xor = xor <2 x i16> %a, %b
@ -150,7 +148,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
;
; GFX900-LABEL: scalar_xnor_v4i16_one_use:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX900-NEXT: s_mov_b32 s4, -1
; GFX900-NEXT: s_mov_b32 s5, s4
; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
@ -158,7 +156,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
;
; GFX906-LABEL: scalar_xnor_v4i16_one_use:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1
; GFX906-NEXT: s_mov_b32 s4, -1
; GFX906-NEXT: s_mov_b32 s5, s4
; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]