From 5819159995657091e4e21e538509b2af210fd48d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 18 Jul 2020 15:30:59 -0400 Subject: [PATCH] AMDGPU/GlobalISel: Pack constant G_BUILD_VECTOR_TRUNCs when selecting --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 21 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 44 +- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 3 +- .../inst-select-build-vector-trunc.v2s16.mir | 270 +++ .../AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll | 46 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll | 46 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 3 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1708 ++++++++--------- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 3 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1568 +++++++-------- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 426 ++-- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 10 +- 12 files changed, 2079 insertions(+), 2069 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a126ed1daf17..8bc597664634 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -616,11 +616,6 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const { return true; } -static bool isZero(Register Reg, const MachineRegisterInfo &MRI) { - int64_t Val; - return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0; -} - bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( MachineInstr &MI) const { if (selectImpl(MI, *CoverageInfo)) @@ -645,6 +640,20 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock *BB = MI.getParent(); + auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true); + if (ConstSrc1) { + auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true); + if (ConstSrc0) { + uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 
0xffff; + uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff; + + BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst) + .addImm(Lo16 | (Hi16 << 16)); + MI.eraseFromParent(); + return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI); + } + } + // TODO: This should probably be a combine somewhere // (build_vector_trunc $src0, undef -> copy $src0 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI); @@ -686,7 +695,7 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC( } else if (Shift1) { Opc = AMDGPU::S_PACK_LH_B32_B16; MI.getOperand(2).setReg(ShiftSrc1); - } else if (Shift0 && isZero(Src1, *MRI)) { + } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) { // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst) .addReg(ShiftSrc0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index c6c0eb7c4a93..2205bfe3c71d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -91,9 +91,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0xffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc0ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: @@ -113,8 +112,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0xffffffc0, 4 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4ffc0 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_lo: @@ -133,8 +132,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 4, 0xffffffc0 -; GFX9-NEXT: v_pk_add_u16 v0, v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xffc00004 +; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_hi: @@ -152,13 +151,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s1, 0xffc0 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc0ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: @@ -182,12 +178,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 0xffffffc0, 4 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0x4ffc0 +; GFX9-NEXT: s_add_i32 s1, s1, 4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: @@ -210,12 +204,10 @@ define 
amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX9-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, 4, 0xffffffc0 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s1 -; GFX9-NEXT: s_add_i32 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: s_add_i32 s0, s0, 0xffc00004 +; GFX9-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 71ee562f0ecc..c1896f81ef29 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -522,8 +522,7 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_ashr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_ashrrev_i16 v0, s4, v0 +; GFX9-NEXT: v_pk_ashrrev_i16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = ashr <2 x i16> %value, ret <2 x i16> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir index c380d3c77def..056ea79a9898 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-build-vector-trunc.v2s16.mir @@ -430,3 +430,273 @@ body: | %5:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %3, %4 S_ENDPGM 0, implicit %5 ... 
+ +--- +name: test_build_vector_trunc_s_v2s16_constant_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_impdef + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_CONSTANT i32 123 + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_impdef_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_CONSTANT i32 123 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_impdef_impdef +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_impdef_impdef + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: S_ENDPGM 0, implicit [[DEF]] + %0:sgpr(s32) = G_IMPLICIT_DEF + %1:sgpr(s32) = G_IMPLICIT_DEF + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_constant_zext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 29884539 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_zext_impdef_zext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_BFE_U32_:%[0-9]+]]:sreg_32 = S_BFE_U32 [[DEF]], 1048576, implicit-def $scc + ; GFX9: [[S_BFE_U32_1:%[0-9]+]]:sreg_32 = S_BFE_U32 [[S_MOV_B32_]], 1048576, implicit-def $scc + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_BFE_U32_]], [[S_BFE_U32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ZEXT %0 + %3:sgpr(s32) = G_ZEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_sext_constant_sext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294836208 + ; GFX9: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s16) = G_CONSTANT i16 -16 + %1:sgpr(s16) = G_CONSTANT i16 -3 + %2:sgpr(s32) = G_SEXT %0 + %3:sgpr(s32) = G_SEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_constant_anyext_constant + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[S_MOV_B32_1]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_CONSTANT i16 123 + %1:sgpr(s16) = G_CONSTANT i16 456 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- +name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_anyext_impdef_anyext_constant + ; GFX9: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 123 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[DEF]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s16) = G_IMPLICIT_DEF + %1:sgpr(s16) = G_CONSTANT i16 123 + %2:sgpr(s32) = G_ANYEXT %0 + %3:sgpr(s32) = G_ANYEXT %1 + %4:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %2, %3 + S_ENDPGM 0, implicit %4 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_var_constant +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_constant + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 456 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_constant_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_constant_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 456 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 456 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... + +--- +name: test_build_vector_trunc_s_v2s16_var_0 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_var_0 + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[COPY]], [[S_MOV_B32_]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... 
+ +--- +name: test_build_vector_trunc_s_v2s16_0_var +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX9-LABEL: name: test_build_vector_trunc_s_v2s16_0_var + ; GFX9: liveins: $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_PACK_LL_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LL_B32_B16 [[S_MOV_B32_]], [[COPY]] + ; GFX9: S_ENDPGM 0, implicit [[S_PACK_LL_B32_B16_]] + %0:sgpr(s32) = G_CONSTANT i32 0 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %0, %1 + S_ENDPGM 0, implicit %2 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index fdcf0f1515f9..172656f08aef 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -82,24 +82,21 @@ define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 
s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_sdot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 
v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define i32 @v_sdot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sdot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_i32_i16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_i32_i16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> , <2 x i16> , i32 8, i1 false) ret i32 %r diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index d285ee132cc2..976536c72883 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -82,24 +82,21 @@ define i32 @v_udot2_inline_literal_a(<2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 4, v0, v1 op_sel_hi:[0,1,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> %b, i32 %c, i1 false) ret i32 %r @@ -109,24 +106,21 @@ define i32 @v_udot2_inline_literal_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4 -; 
GFX908-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, 4, v1 op_sel_hi:[1,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, s4, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -136,29 +130,21 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) { ; GFX906-LABEL: v_udot2_inline_literal_a_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, v1 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> , i32 %c, i1 false) ret i32 %r @@ -168,29 +154,21 @@ define 
i32 @v_udot2_inline_literal_a_b_c() { ; GFX906-LABEL: v_udot2_inline_literal_a_b_c: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX906-NEXT: v_mov_b32_e32 v0, s5 -; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX906-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_inline_literal_a_b_c: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4 -; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX908-NEXT: v_mov_b32_e32 v0, s5 -; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8 +; GFX908-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_inline_literal_a_b_c: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 8, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 4, 4 +; GFX10-NEXT: v_dot2_u32_u16 v0, 8, 4, 8 op_sel_hi:[0,0,1] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_dot2_u32_u16 v0, s4, s5, 8 ; GFX10-NEXT: s_setpc_b64 s[30:31] %r = call i32 @llvm.amdgcn.udot2(<2 x i16> , <2 x i16> , i32 8, i1 false) ret i32 %r diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 9d82396bbc36..ea2631cbcb29 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -533,8 +533,7 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_lshr_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = lshr <2 x i16> %value, ret <2 x i16> %result diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index b2e7f1ea326f..ba672883fa56 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4527,15 +4527,12 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, s5, v3 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 +; GFX9-NEXT: v_pk_min_i16 v3, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_max_i16 v1, v3, v1 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 @@ -4545,16 +4542,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v2, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v3, v0, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v2, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v3, v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, s5, v2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 -; GFX10-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX10-NEXT: v_pk_sub_i16 v2, 0x80008000, v2 +; GFX10-NEXT: v_pk_sub_i16 v3, 0x7fff7fff, v3 ; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX10-NEXT: 
v_pk_add_u16 v0, v0, v1 @@ -4650,53 +4642,45 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_saddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 -; GFX9-NEXT: s_ashr_i32 s6, s0, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s8, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s9, s2, 16 -; GFX9-NEXT: s_lshr_b32 s10, s8, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 -; GFX9-NEXT: s_sub_i32 s8, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s3, 0x8000 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_lshr_b32 s6, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s3, s1 -; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: s_ashr_i32 s3, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s4, 0 +; GFX9-NEXT: s_cmp_gt_i32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s5, s2, s4 +; GFX9-NEXT: s_cmp_gt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s6, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, 
s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s5, 16 +; GFX9-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 +; GFX9-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX9-NEXT: s_cmp_lt_i32 s2, s4 +; GFX9-NEXT: s_cselect_b32 s2, s2, s4 +; GFX9-NEXT: s_cmp_lt_i32 s3, 0 +; GFX9-NEXT: s_cselect_b32 s3, s3, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_sub_i32 s2, 0x80008000, s2 +; GFX9-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_sext_i32_i16 s3, s2 +; GFX9-NEXT: s_sext_i32_i16 s4, s1 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_lt_i32 s3, s4 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s2 -; GFX9-NEXT: s_cselect_b32 s1, s1, s2 +; GFX9-NEXT: s_cmp_gt_i32 s2, s1 +; GFX9-NEXT: s_cselect_b32 s1, s2, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX9-NEXT: s_sext_i32_i16 s2, s1 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, s5 +; GFX9-NEXT: s_ashr_i32 s4, s5, 16 +; GFX9-NEXT: s_cmp_lt_i32 s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: s_cmp_lt_i32 s1, s4 +; GFX9-NEXT: s_cselect_b32 s1, s1, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -4706,55 +4690,47 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX10-LABEL: s_saddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-NEXT: s_sext_i32_i16 s2, s0 +; GFX10-NEXT: s_sext_i32_i16 s3, 0 ; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s5 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s6, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s7 
-; GFX10-NEXT: s_cselect_b32 s8, s4, s2 +; GFX10-NEXT: s_cmp_gt_i32 s2, s3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 -; GFX10-NEXT: s_lshr_b32 s8, s7, 16 -; GFX10-NEXT: s_lshr_b32 s9, s6, 16 -; GFX10-NEXT: s_sub_i32 s6, s7, s6 -; GFX10-NEXT: s_sub_i32 s7, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX10-NEXT: s_lshr_b32 s3, s4, 16 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s4, s2 -; GFX10-NEXT: s_sub_i32 s3, s3, s5 +; GFX10-NEXT: s_cselect_b32 s5, s2, s3 +; GFX10-NEXT: s_cmp_gt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s6, s4, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX10-NEXT: s_lshr_b32 s6, s5, 16 +; GFX10-NEXT: s_sub_i32 s5, 0x7fff7fff, s5 +; GFX10-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX10-NEXT: s_cmp_lt_i32 s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: s_cmp_lt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s3, s4, 0 ; GFX10-NEXT: s_sext_i32_i16 s4, s1 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s3, s2, 16 +; GFX10-NEXT: s_sub_i32 s2, 0x80008000, s2 +; GFX10-NEXT: s_sub_i32 s3, 0x8000, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_sext_i32_i16 s3, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 ; GFX10-NEXT: s_cmp_gt_i32 s3, s4 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4 ; GFX10-NEXT: s_cmp_gt_i32 s2, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s6 ; GFX10-NEXT: s_cselect_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s6, s7 +; GFX10-NEXT: s_sext_i32_i16 s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-NEXT: s_ashr_i32 s3, s4, 16 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 
-; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_cmp_lt_i32 s1, s2 -; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lt_i32 s4, s2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_cmp_lt_i32 s1, s3 +; GFX10-NEXT: s_cselect_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 ; GFX10-NEXT: s_add_i32 s2, s2, s3 @@ -4834,73 +4810,57 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_ashr_i32 s5, s0, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s7, s4, s6 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_cselect_b32 s8, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s8, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s7, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s7 -; GFX9-NEXT: s_sub_i32 s7, s8, s9 -; GFX9-NEXT: s_cmp_lt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: v_pk_max_i16 v0, s2, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s1 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s2, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, 0 +; 
GFX9-NEXT: s_cmp_gt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s4, s1, s3 +; GFX9-NEXT: s_cmp_gt_i32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s5, s2, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 +; GFX9-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX9-NEXT: s_cmp_lt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s1, s1, s3 +; GFX9-NEXT: s_cmp_lt_i32 s2, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_sub_i32 s1, 0x80008000, s1 +; GFX9-NEXT: s_sub_i32 s2, 0x8000, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: v_pk_max_i16 v0, s1, v0 +; GFX9-NEXT: v_pk_min_i16 v0, v0, s4 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: saddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_sext_i32_i16 s1, s0 +; GFX10-NEXT: s_sext_i32_i16 s2, 0 ; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff -; GFX10-NEXT: s_cselect_b32 s5, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: s_cselect_b32 s7, s3, s1 +; GFX10-NEXT: s_cmp_gt_i32 s1, s2 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s7 -; GFX10-NEXT: s_lshr_b32 s7, s6, 16 -; GFX10-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, s6, s5 -; GFX10-NEXT: s_sub_i32 s6, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_movk_i32 s4, 0x8000 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s3, 16 -; GFX10-NEXT: s_lshr_b32 
s4, s1, 16 -; GFX10-NEXT: s_sub_i32 s1, s3, s1 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s4, s1, s2 +; GFX10-NEXT: s_cmp_gt_i32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s5, s3, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, 0x7fff7fff, s4 +; GFX10-NEXT: s_sub_i32 s5, 0x7fff, s5 +; GFX10-NEXT: s_cmp_lt_i32 s1, s2 +; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lt_i32 s3, 0 +; GFX10-NEXT: s_cselect_b32 s2, s3, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 +; GFX10-NEXT: s_sub_i32 s1, 0x80008000, s1 +; GFX10-NEXT: s_sub_i32 s2, 0x8000, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s5, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s4, s5 ; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 ; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -4966,15 +4926,12 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: saddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, 0, 0 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: v_pk_min_i16 v2, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v2, s2, v2 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v1, s1, v1 +; GFX9-NEXT: v_pk_max_i16 v1, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v1, v2, v1 +; GFX9-NEXT: v_pk_min_i16 v2, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v2, v3, v2 ; GFX9-NEXT: v_pk_max_i16 v2, v2, s0 ; GFX9-NEXT: v_pk_min_i16 v1, v2, v1 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 @@ -4982,16 +4939,11 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX10-LABEL: saddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_pack_ll_b32_b16 s1, 0, 0 -; GFX10-NEXT: s_movk_i32 s2, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v1, v0, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX10-NEXT: v_pk_max_i16 v2, v0, s1 -; GFX10-NEXT: s_movk_i32 s3, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v1, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v2, v0, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, s2, v1 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 -; GFX10-NEXT: v_pk_sub_i16 v2, s1, v2 +; GFX10-NEXT: v_pk_sub_i16 v1, 0x80008000, v1 +; GFX10-NEXT: v_pk_sub_i16 v2, 0x7fff7fff, v2 ; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 @@ -5113,22 +5065,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v5, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 -; GFX9-NEXT: v_pk_max_i16 v2, v5, v2 +; GFX9-NEXT: v_pk_min_i16 v6, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 +; GFX9-NEXT: v_pk_max_i16 v4, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v4, v5, v4 +; GFX9-NEXT: v_pk_max_i16 v2, v6, v2 ; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, s6 +; GFX9-NEXT: v_pk_min_i16 v4, v1, 0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4 -; GFX9-NEXT: v_pk_sub_i16 v2, s4, v2 +; GFX9-NEXT: v_pk_max_i16 v2, v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v7, v4 +; GFX9-NEXT: v_pk_sub_i16 v2, v5, v2 ; GFX9-NEXT: v_pk_max_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_min_i16 v2, v3, v2 ; GFX9-NEXT: 
v_pk_add_u16 v1, v1, v2 @@ -5138,24 +5087,19 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v4, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v5, v1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v7, v1, s5 -; GFX10-NEXT: v_pk_sub_i16 v4, s6, v4 -; GFX10-NEXT: v_pk_sub_i16 v5, s6, v5 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_pk_min_i16 v4, v0, 0 +; GFX10-NEXT: v_pk_min_i16 v5, v1, 0 +; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v7, v1, 0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX10-NEXT: v_pk_sub_i16 v4, 0x80008000, v4 +; GFX10-NEXT: v_pk_sub_i16 v5, 0x80008000, v5 +; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 +; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v7 ; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6 -; GFX10-NEXT: v_pk_sub_i16 v4, s4, v7 -; GFX10-NEXT: v_pk_max_i16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_i16 v10, v5, v3 ; GFX10-NEXT: v_pk_min_i16 v2, v11, v6 -; GFX10-NEXT: v_pk_min_i16 v3, v3, v4 +; GFX10-NEXT: v_pk_min_i16 v3, v10, v7 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 ; GFX10-NEXT: v_pk_add_u16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5321,76 +5265,72 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_saddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s9, s6 -; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s6, s0 +; GFX9-NEXT: s_ashr_i32 s7, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, 0 +; GFX9-NEXT: s_cmp_gt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s9, s6, s8 +; GFX9-NEXT: s_cmp_gt_i32 s7, 0 +; 
GFX9-NEXT: s_cselect_b32 s10, s7, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_movk_i32 s10, 0x7fff +; GFX9-NEXT: s_sub_i32 s9, s4, s9 +; GFX9-NEXT: s_sub_i32 s11, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lt_i32 s7, 0 +; GFX9-NEXT: s_cselect_b32 s7, s7, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_mov_b32 s5, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_mov_b32 s7, 0x8000 +; GFX9-NEXT: s_sub_i32 s6, s5, s6 +; GFX9-NEXT: s_sub_i32 s11, s7, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 +; GFX9-NEXT: s_sext_i32_i16 s11, s6 +; GFX9-NEXT: s_sext_i32_i16 s12, s2 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s10, s7, s9 -; GFX9-NEXT: s_cmp_gt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_cselect_b32 s11, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_sub_i32 s10, s4, s10 -; GFX9-NEXT: s_sub_i32 s12, s11, s12 -; GFX9-NEXT: s_cmp_lt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_cselect_b32 s8, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s7, 16 -; GFX9-NEXT: s_lshr_b32 s8, s5, 16 -; GFX9-NEXT: s_sub_i32 s7, s5, s7 -; GFX9-NEXT: s_sub_i32 s12, s8, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 -; GFX9-NEXT: s_sext_i32_i16 s12, s7 -; GFX9-NEXT: s_sext_i32_i16 s13, s2 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s12, s13 -; GFX9-NEXT: s_cselect_b32 s12, s12, s13 -; GFX9-NEXT: 
s_cmp_gt_i32 s7, s2 -; GFX9-NEXT: s_cselect_b32 s2, s7, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-NEXT: s_sext_i32_i16 s7, s2 -; GFX9-NEXT: s_sext_i32_i16 s12, s10 +; GFX9-NEXT: s_cmp_gt_i32 s11, s12 +; GFX9-NEXT: s_cselect_b32 s11, s11, s12 +; GFX9-NEXT: s_cmp_gt_i32 s6, s2 +; GFX9-NEXT: s_cselect_b32 s2, s6, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: s_sext_i32_i16 s6, s2 +; GFX9-NEXT: s_sext_i32_i16 s11, s9 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_lt_i32 s7, s12 -; GFX9-NEXT: s_cselect_b32 s7, s7, s12 -; GFX9-NEXT: s_cmp_lt_i32 s2, s10 -; GFX9-NEXT: s_cselect_b32 s2, s2, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_add_i32 s7, s7, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s7, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s10, s2, s9 -; GFX9-NEXT: s_cmp_gt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s12, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_sub_i32 s4, s4, s10 -; GFX9-NEXT: s_sub_i32 s10, s11, s12 +; GFX9-NEXT: s_ashr_i32 s9, s9, 16 +; GFX9-NEXT: s_cmp_lt_i32 s6, s11 +; GFX9-NEXT: s_cselect_b32 s6, s6, s11 ; GFX9-NEXT: s_cmp_lt_i32 s2, s9 ; GFX9-NEXT: s_cselect_b32 s2, s2, s9 -; GFX9-NEXT: s_cmp_lt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s6, s2 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s2, 16 +; GFX9-NEXT: s_add_i32 s0, s0, s2 +; GFX9-NEXT: s_add_i32 s6, s6, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_sext_i32_i16 s2, s1 +; GFX9-NEXT: s_ashr_i32 s6, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s9, s2, s8 +; GFX9-NEXT: s_cmp_gt_i32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s11, s6, 0 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_sub_i32 s9, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, s2, s8 +; GFX9-NEXT: s_cmp_lt_i32 s6, 0 +; GFX9-NEXT: s_cselect_b32 s6, s6, 0 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s8, s6 +; GFX9-NEXT: s_sub_i32 s5, s7, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_sext_i32_i16 s5, s2 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -5401,7 +5341,7 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX9-NEXT: s_cmp_gt_i32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 ; GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16 @@ -5420,94 +5360,90 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX10-LABEL: s_saddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s5, s0 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: s_sext_i32_i16 s5, 0 ; GFX10-NEXT: s_ashr_i32 s6, s0, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 -; GFX10-NEXT: s_movk_i32 s9, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX10-NEXT: s_cselect_b32 s10, s6, s4 -; GFX10-NEXT: s_movk_i32 s12, 0x8000 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 -; GFX10-NEXT: s_lshr_b32 s10, s9, 16 -; GFX10-NEXT: s_lshr_b32 s11, s8, 16 -; GFX10-NEXT: s_sub_i32 s8, s9, s8 -; GFX10-NEXT: s_sub_i32 s11, s10, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; 
GFX10-NEXT: s_cmp_lt_i32 s6, s4 -; GFX10-NEXT: s_sext_i32_i16 s14, s2 -; GFX10-NEXT: s_cselect_b32 s6, s6, s4 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_lshr_b32 s6, s12, 16 -; GFX10-NEXT: s_lshr_b32 s13, s5, 16 -; GFX10-NEXT: s_sub_i32 s5, s12, s5 -; GFX10-NEXT: s_sub_i32 s13, s6, s13 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_mov_b32 s9, 0x7fff7fff +; GFX10-NEXT: s_cselect_b32 s7, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s6, 0 +; GFX10-NEXT: s_mov_b32 s11, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s8, s6, 0 +; GFX10-NEXT: s_sext_i32_i16 s13, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX10-NEXT: s_movk_i32 s8, 0x7fff +; GFX10-NEXT: s_lshr_b32 s10, s7, 16 +; GFX10-NEXT: s_sub_i32 s7, s9, s7 +; GFX10-NEXT: s_sub_i32 s10, s8, s10 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s13 -; GFX10-NEXT: s_sext_i32_i16 s13, s5 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_gt_i32 s13, s14 -; GFX10-NEXT: s_cselect_b32 s13, s13, s14 -; GFX10-NEXT: s_cmp_gt_i32 s5, s2 -; GFX10-NEXT: s_cselect_b32 s2, s5, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s8, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s13, s2 -; GFX10-NEXT: s_sext_i32_i16 s11, s5 -; GFX10-NEXT: s_sext_i32_i16 s8, s2 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s6, 0 +; GFX10-NEXT: s_cselect_b32 s6, s6, 0 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s11 -; GFX10-NEXT: s_cselect_b32 s8, s8, s11 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX10-NEXT: s_mov_b32 s6, 0x8000 +; GFX10-NEXT: s_lshr_b32 s12, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s11, s4 +; GFX10-NEXT: s_sub_i32 s12, s6, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 
s4, s4, s12 +; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_gt_i32 s12, s13 +; GFX10-NEXT: s_cselect_b32 s12, s12, s13 +; GFX10-NEXT: s_cmp_gt_i32 s4, s2 +; GFX10-NEXT: s_cselect_b32 s2, s4, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX10-NEXT: s_sext_i32_i16 s10, s4 +; GFX10-NEXT: s_sext_i32_i16 s7, s2 +; GFX10-NEXT: s_ashr_i32 s2, s2, 16 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_lt_i32 s7, s10 +; GFX10-NEXT: s_cselect_b32 s7, s7, s10 +; GFX10-NEXT: s_cmp_lt_i32 s2, s4 +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s7, s2 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s2, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_sext_i32_i16 s2, s1 -; GFX10-NEXT: s_add_i32 s5, s5, s8 -; GFX10-NEXT: s_ashr_i32 s8, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX10-NEXT: s_cselect_b32 s11, s2, s7 -; GFX10-NEXT: s_cmp_gt_i32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s13, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s13 -; GFX10-NEXT: s_lshr_b32 s13, s11, 16 -; GFX10-NEXT: s_sub_i32 s9, s9, s11 -; GFX10-NEXT: s_sub_i32 s10, s10, s13 -; GFX10-NEXT: s_cmp_lt_i32 s2, s7 -; GFX10-NEXT: s_cselect_b32 s2, s2, s7 -; GFX10-NEXT: s_cmp_lt_i32 s8, s4 -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_sub_i32 s2, s12, s2 -; GFX10-NEXT: s_sub_i32 s4, s6, s4 -; GFX10-NEXT: s_sext_i32_i16 s6, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_add_i32 s7, s7, s10 +; GFX10-NEXT: s_ashr_i32 s2, s1, 16 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX10-NEXT: s_cselect_b32 s10, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s12, s2, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; 
GFX10-NEXT: s_lshr_b32 s12, s10, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s12 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s2, 0 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_cselect_b32 s2, s2, 0 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_sub_i32 s2, s11, s2 +; GFX10-NEXT: s_sub_i32 s4, s6, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX10-NEXT: s_sext_i32_i16 s4, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 ; GFX10-NEXT: s_cmp_gt_i32 s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s9, s8 ; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s6 +; GFX10-NEXT: s_sext_i32_i16 s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 -; GFX10-NEXT: s_ashr_i32 s4, s6, 16 -; GFX10-NEXT: s_sext_i32_i16 s6, s2 +; GFX10-NEXT: s_ashr_i32 s4, s5, 16 +; GFX10-NEXT: s_sext_i32_i16 s5, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s6, s3 -; GFX10-NEXT: s_cselect_b32 s3, s6, s3 +; GFX10-NEXT: s_cmp_lt_i32 s5, s3 +; GFX10-NEXT: s_cselect_b32 s3, s5, s3 ; GFX10-NEXT: s_cmp_lt_i32 s2, s4 ; GFX10-NEXT: s_cselect_b32 s2, s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 @@ -5676,29 +5612,26 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v7, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v7, s5, v7 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v6, v0, s6 -; 
GFX9-NEXT: v_pk_sub_i16 v6, s4, v6 -; GFX9-NEXT: v_pk_max_i16 v3, v7, v3 +; GFX9-NEXT: v_pk_min_i16 v8, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v8, v9, v8 +; GFX9-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v6, v7, v6 +; GFX9-NEXT: v_pk_max_i16 v3, v8, v3 ; GFX9-NEXT: v_pk_min_i16 v3, v3, v6 -; GFX9-NEXT: v_pk_min_i16 v6, v1, s6 +; GFX9-NEXT: v_pk_min_i16 v6, v1, 0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v6, s5, v6 -; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX9-NEXT: v_pk_max_i16 v3, v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v6, v9, v6 +; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 ; GFX9-NEXT: v_pk_max_i16 v4, v6, v4 ; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 -; GFX9-NEXT: v_pk_min_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s5, v4 +; GFX9-NEXT: v_pk_min_i16 v4, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, s4, v3 +; GFX9-NEXT: v_pk_max_i16 v3, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v3, v7, v3 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_min_i16 v3, v4, v3 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 @@ -5708,28 +5641,23 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 0, 0 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v8, v1, s5 -; GFX10-NEXT: v_pk_min_i16 v9, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_sub_i16 v14, s6, v7 -; GFX10-NEXT: v_pk_sub_i16 v15, s6, v8 -; GFX10-NEXT: v_pk_sub_i16 v19, s6, v9 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s5 -; GFX10-NEXT: v_pk_max_i16 v11, v2, s5 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff +; GFX10-NEXT: v_pk_min_i16 
v7, v0, 0 +; GFX10-NEXT: v_pk_min_i16 v8, v1, 0 +; GFX10-NEXT: v_pk_min_i16 v9, v2, 0 +; GFX10-NEXT: v_pk_max_i16 v6, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v10, v1, 0 +; GFX10-NEXT: v_pk_sub_i16 v14, 0x80008000, v7 +; GFX10-NEXT: v_pk_sub_i16 v15, 0x80008000, v8 +; GFX10-NEXT: v_pk_max_i16 v11, v2, 0 +; GFX10-NEXT: v_pk_sub_i16 v19, 0x80008000, v9 +; GFX10-NEXT: v_pk_sub_i16 v6, 0x7fff7fff, v6 ; GFX10-NEXT: v_pk_max_i16 v3, v14, v3 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 +; GFX10-NEXT: v_pk_sub_i16 v7, 0x7fff7fff, v10 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v6, s4, v6 -; GFX10-NEXT: v_pk_sub_i16 v7, s4, v10 -; GFX10-NEXT: v_pk_sub_i16 v8, s4, v11 +; GFX10-NEXT: v_pk_sub_i16 v8, 0x7fff7fff, v11 ; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v3, v3, v6 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v4, v4, v7 ; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v3 @@ -5968,119 +5896,115 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_saddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s11, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, s0 +; GFX9-NEXT: s_ashr_i32 s9, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s10, 0 +; GFX9-NEXT: s_cmp_gt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s11, s8, s10 +; GFX9-NEXT: s_cmp_gt_i32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s12, s9, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 +; GFX9-NEXT: s_movk_i32 s12, 0x7fff +; GFX9-NEXT: s_sub_i32 s11, s6, s11 +; GFX9-NEXT: s_sub_i32 s13, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-NEXT: s_cmp_lt_i32 s9, 0 +; GFX9-NEXT: s_cselect_b32 s9, s9, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_mov_b32 s7, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s13, s8, 16 +; GFX9-NEXT: s_mov_b32 s9, 0x8000 +; GFX9-NEXT: s_sub_i32 s8, s7, s8 +; GFX9-NEXT: s_sub_i32 s13, s9, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s13 +; GFX9-NEXT: s_sext_i32_i16 s13, s8 +; GFX9-NEXT: s_sext_i32_i16 s14, s3 ; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s12, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_lshr_b32 s13, s6, 16 -; GFX9-NEXT: s_sub_i32 s12, s6, s12 -; GFX9-NEXT: s_sub_i32 s14, s13, s14 -; GFX9-NEXT: s_cmp_lt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s9, s9, s11 -; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s7, 0x8000 -; GFX9-NEXT: s_cselect_b32 s10, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s9, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_sub_i32 s9, s7, s9 -; GFX9-NEXT: s_sub_i32 s14, s10, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s14 -; GFX9-NEXT: s_sext_i32_i16 s14, s9 -; GFX9-NEXT: s_sext_i32_i16 s15, s3 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s14, s15 -; GFX9-NEXT: s_cselect_b32 s14, s14, s15 -; GFX9-NEXT: s_cmp_gt_i32 s9, s3 -; GFX9-NEXT: s_cselect_b32 s3, s9, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s14, s3 -; GFX9-NEXT: s_sext_i32_i16 s9, s3 -; GFX9-NEXT: s_sext_i32_i16 s14, s12 +; GFX9-NEXT: s_cmp_gt_i32 s13, s14 +; GFX9-NEXT: s_cselect_b32 s13, s13, s14 +; GFX9-NEXT: s_cmp_gt_i32 s8, s3 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s3 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: s_sext_i32_i16 s13, s11 
; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_cmp_lt_i32 s9, s14 -; GFX9-NEXT: s_cselect_b32 s9, s9, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s12 -; GFX9-NEXT: s_cselect_b32 s3, s3, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s12, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s9, s9, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s9, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s12, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s14, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_sub_i32 s12, s6, s12 -; GFX9-NEXT: s_sub_i32 s14, s13, s14 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_cmp_lt_i32 s8, s13 +; GFX9-NEXT: s_cselect_b32 s8, s8, s13 ; GFX9-NEXT: s_cmp_lt_i32 s3, s11 ; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s9, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s11, s3, 16 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_add_i32 s8, s8, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 +; GFX9-NEXT: s_ashr_i32 s8, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s11, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s13, s8, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 +; GFX9-NEXT: s_sub_i32 s11, s6, s11 +; GFX9-NEXT: s_sub_i32 s13, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, 0 +; GFX9-NEXT: s_cselect_b32 s8, s8, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s8, 
s3, 16 ; GFX9-NEXT: s_sub_i32 s3, s7, s3 -; GFX9-NEXT: s_sub_i32 s9, s10, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_sext_i32_i16 s9, s3 -; GFX9-NEXT: s_sext_i32_i16 s14, s4 +; GFX9-NEXT: s_sub_i32 s8, s9, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_sext_i32_i16 s8, s3 +; GFX9-NEXT: s_sext_i32_i16 s13, s4 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s14 -; GFX9-NEXT: s_cselect_b32 s9, s9, s14 +; GFX9-NEXT: s_cmp_gt_i32 s8, s13 +; GFX9-NEXT: s_cselect_b32 s8, s8, s13 ; GFX9-NEXT: s_cmp_gt_i32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_sext_i32_i16 s9, s12 +; GFX9-NEXT: s_sext_i32_i16 s8, s11 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s9 -; GFX9-NEXT: s_cselect_b32 s4, s4, s9 -; GFX9-NEXT: s_cmp_lt_i32 s3, s12 -; GFX9-NEXT: s_cselect_b32 s3, s3, s12 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_cmp_lt_i32 s4, s8 +; GFX9-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-NEXT: s_cmp_lt_i32 s3, s11 +; GFX9-NEXT: s_cselect_b32 s3, s3, s11 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 ; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s9 +; GFX9-NEXT: s_add_i32 s4, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GFX9-NEXT: s_ashr_i32 s4, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s9, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s12, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX9-NEXT: s_lshr_b32 s12, s9, 16 -; GFX9-NEXT: s_sub_i32 s6, s6, s9 -; GFX9-NEXT: s_sub_i32 s9, s13, s12 -; 
GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s8, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s11, s4, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX9-NEXT: s_lshr_b32 s11, s8, 16 +; GFX9-NEXT: s_sub_i32 s6, s6, s8 +; GFX9-NEXT: s_sub_i32 s8, s12, s11 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s4, s4, 0 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_sub_i32 s3, s7, s3 -; GFX9-NEXT: s_sub_i32 s4, s10, s4 +; GFX9-NEXT: s_sub_i32 s4, s9, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_sext_i32_i16 s7, s5 @@ -6091,7 +6015,7 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX9-NEXT: s_cmp_gt_i32 s3, s5 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_sext_i32_i16 s5, s6 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 @@ -6110,141 +6034,137 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX10-LABEL: s_saddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s7, s0 -; GFX10-NEXT: s_sext_i32_i16 s9, s6 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: s_sext_i32_i16 s7, 0 ; GFX10-NEXT: s_ashr_i32 s8, s0, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 -; GFX10-NEXT: s_movk_i32 s11, 0x7fff -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s11 -; GFX10-NEXT: s_cselect_b32 s12, s8, s6 -; GFX10-NEXT: s_movk_i32 s14, 0x8000 -; 
GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX10-NEXT: s_lshr_b32 s12, s11, 16 -; GFX10-NEXT: s_lshr_b32 s13, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s11, s10 -; GFX10-NEXT: s_sub_i32 s13, s12, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_sext_i32_i16 s16, s3 -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_lshr_b32 s8, s14, 16 -; GFX10-NEXT: s_lshr_b32 s15, s7, 16 -; GFX10-NEXT: s_sub_i32 s7, s14, s7 -; GFX10-NEXT: s_sub_i32 s15, s8, s15 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 +; GFX10-NEXT: s_mov_b32 s11, 0x7fff7fff +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, 0 +; GFX10-NEXT: s_mov_b32 s13, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s10, s8, 0 +; GFX10-NEXT: s_sext_i32_i16 s15, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX10-NEXT: s_movk_i32 s10, 0x7fff +; GFX10-NEXT: s_lshr_b32 s12, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s11, s9 +; GFX10-NEXT: s_sub_i32 s12, s10, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s15 -; GFX10-NEXT: s_sext_i32_i16 s15, s7 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s3 -; GFX10-NEXT: s_sext_i32_i16 s16, s4 -; GFX10-NEXT: s_cselect_b32 s3, s7, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s10, s13 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s15, s3 -; GFX10-NEXT: s_sext_i32_i16 s13, s7 -; GFX10-NEXT: s_sext_i32_i16 s10, s3 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, 0 +; GFX10-NEXT: s_cselect_b32 s8, s8, 0 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s13 -; GFX10-NEXT: s_cselect_b32 s10, s10, s13 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; 
GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX10-NEXT: s_lshr_b32 s10, s3, 16 -; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_add_i32 s7, s7, s10 -; GFX10-NEXT: s_ashr_i32 s10, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_cselect_b32 s13, s3, s9 -; GFX10-NEXT: s_cmp_gt_i32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s15, s10, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX10-NEXT: s_lshr_b32 s15, s13, 16 -; GFX10-NEXT: s_sub_i32 s13, s11, s13 -; GFX10-NEXT: s_sub_i32 s15, s12, s15 -; GFX10-NEXT: s_cmp_lt_i32 s3, s9 -; GFX10-NEXT: s_cselect_b32 s3, s3, s9 -; GFX10-NEXT: s_cmp_lt_i32 s10, s6 -; GFX10-NEXT: s_cselect_b32 s10, s10, s6 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10 -; GFX10-NEXT: s_lshr_b32 s10, s3, 16 -; GFX10-NEXT: s_sub_i32 s3, s14, s3 -; GFX10-NEXT: s_sub_i32 s10, s8, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10 -; GFX10-NEXT: s_sext_i32_i16 s10, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_mov_b32 s8, 0x8000 +; GFX10-NEXT: s_lshr_b32 s14, s6, 16 +; GFX10-NEXT: s_sub_i32 s6, s13, s6 +; GFX10-NEXT: s_sub_i32 s14, s8, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s14 +; GFX10-NEXT: s_sext_i32_i16 s14, s6 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s15 +; GFX10-NEXT: s_cselect_b32 s14, s14, s15 +; GFX10-NEXT: s_cmp_gt_i32 s6, s3 +; GFX10-NEXT: s_sext_i32_i16 s15, s4 +; GFX10-NEXT: s_cselect_b32 s3, s6, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s14, s3 +; GFX10-NEXT: s_sext_i32_i16 s12, s6 +; GFX10-NEXT: s_sext_i32_i16 s9, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s10, s16 -; GFX10-NEXT: s_cselect_b32 s10, s10, s16 -; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 
s13, s15 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX10-NEXT: s_sext_i32_i16 s13, s4 -; GFX10-NEXT: s_sext_i32_i16 s10, s3 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s13 -; GFX10-NEXT: s_cselect_b32 s10, s10, s13 -; GFX10-NEXT: s_cmp_lt_i32 s3, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX10-NEXT: s_lshr_b32 s10, s1, 16 -; GFX10-NEXT: s_lshr_b32 s13, s3, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_add_i32 s10, s10, s13 -; GFX10-NEXT: s_ashr_i32 s3, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s10 -; GFX10-NEXT: s_cselect_b32 s13, s4, s9 -; GFX10-NEXT: s_cmp_gt_i32 s3, s6 -; GFX10-NEXT: s_cselect_b32 s15, s3, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s15 -; GFX10-NEXT: s_lshr_b32 s15, s13, 16 -; GFX10-NEXT: s_sub_i32 s11, s11, s13 -; GFX10-NEXT: s_sub_i32 s12, s12, s15 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_cmp_lt_i32 s9, s12 +; GFX10-NEXT: s_cselect_b32 s9, s9, s12 ; GFX10-NEXT: s_cmp_lt_i32 s3, s6 ; GFX10-NEXT: s_cselect_b32 s3, s3, s6 -; GFX10-NEXT: s_sext_i32_i16 s6, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 -; GFX10-NEXT: s_sub_i32 s3, s14, s3 -; GFX10-NEXT: s_sub_i32 s4, s8, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX10-NEXT: s_ashr_i32 s4, s5, 16 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_add_i32 s0, s0, s3 +; GFX10-NEXT: s_sext_i32_i16 s3, s1 +; GFX10-NEXT: s_add_i32 s6, s6, s9 +; GFX10-NEXT: s_ashr_i32 s9, s1, 16 +; GFX10-NEXT: s_cmp_gt_i32 s3, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_cselect_b32 s12, s3, s7 +; GFX10-NEXT: s_cmp_gt_i32 s9, 0 +; GFX10-NEXT: 
s_cselect_b32 s14, s9, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX10-NEXT: s_lshr_b32 s14, s12, 16 +; GFX10-NEXT: s_sub_i32 s12, s11, s12 +; GFX10-NEXT: s_sub_i32 s14, s10, s14 +; GFX10-NEXT: s_cmp_lt_i32 s3, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: s_cmp_lt_i32 s9, 0 +; GFX10-NEXT: s_cselect_b32 s9, s9, 0 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_sub_i32 s3, s13, s3 +; GFX10-NEXT: s_sub_i32 s9, s8, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s9 +; GFX10-NEXT: s_sext_i32_i16 s9, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-NEXT: s_cmp_gt_i32 s9, s15 +; GFX10-NEXT: s_cselect_b32 s9, s9, s15 ; GFX10-NEXT: s_cmp_gt_i32 s3, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s11, s12 ; GFX10-NEXT: s_cselect_b32 s3, s3, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 -; GFX10-NEXT: s_ashr_i32 s5, s6, 16 -; GFX10-NEXT: s_sext_i32_i16 s6, s3 +; GFX10-NEXT: s_sext_i32_i16 s4, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX10-NEXT: s_ashr_i32 s9, s12, 16 +; GFX10-NEXT: s_sext_i32_i16 s12, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 +; GFX10-NEXT: s_cmp_lt_i32 s12, s4 +; GFX10-NEXT: s_cselect_b32 s4, s12, s4 +; GFX10-NEXT: s_cmp_lt_i32 s3, s9 +; GFX10-NEXT: s_sext_i32_i16 s12, s2 +; GFX10-NEXT: s_cselect_b32 s3, s3, s9 +; GFX10-NEXT: s_lshr_b32 s9, s1, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s3 +; GFX10-NEXT: s_ashr_i32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s14, s3, 16 +; GFX10-NEXT: s_add_i32 s1, s1, s3 +; GFX10-NEXT: s_add_i32 s9, s9, s14 +; GFX10-NEXT: s_cmp_gt_i32 s12, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s9 +; GFX10-NEXT: s_cselect_b32 s3, s12, 
s7 +; GFX10-NEXT: s_cmp_gt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s14, s4, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s14 +; GFX10-NEXT: s_lshr_b32 s14, s3, 16 +; GFX10-NEXT: s_sub_i32 s3, s11, s3 +; GFX10-NEXT: s_sub_i32 s10, s10, s14 +; GFX10-NEXT: s_cmp_lt_i32 s12, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s10 +; GFX10-NEXT: s_cselect_b32 s7, s12, s7 +; GFX10-NEXT: s_cmp_lt_i32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s4, s4, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s13, s4 +; GFX10-NEXT: s_sub_i32 s7, s8, s7 +; GFX10-NEXT: s_sext_i32_i16 s8, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX10-NEXT: s_ashr_i32 s5, s5, 16 +; GFX10-NEXT: s_sext_i32_i16 s7, s4 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_gt_i32 s7, s8 +; GFX10-NEXT: s_cselect_b32 s7, s7, s8 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX10-NEXT: s_ashr_i32 s3, s3, 16 +; GFX10-NEXT: s_sext_i32_i16 s7, s4 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_cmp_lt_i32 s7, s5 +; GFX10-NEXT: s_cselect_b32 s5, s7, s5 +; GFX10-NEXT: s_cmp_lt_i32 s4, s3 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s5, s3 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_add_i32 s2, s2, s3 ; GFX10-NEXT: s_add_i32 s4, s4, s5 @@ -6438,36 +6358,33 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_saddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, 0, 0 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v9, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v9, s5, v9 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v8, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v8, s4, 
v8 -; GFX9-NEXT: v_pk_max_i16 v4, v9, v4 +; GFX9-NEXT: v_pk_min_i16 v10, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v11, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v10, v11, v10 +; GFX9-NEXT: v_pk_max_i16 v8, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v8, v9, v8 +; GFX9-NEXT: v_pk_max_i16 v4, v10, v4 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v8 -; GFX9-NEXT: v_pk_min_i16 v8, v1, s6 +; GFX9-NEXT: v_pk_min_i16 v8, v1, 0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v8, s5, v8 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 +; GFX9-NEXT: v_pk_max_i16 v4, v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v8, v11, v8 +; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 ; GFX9-NEXT: v_pk_max_i16 v5, v8, v5 ; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_min_i16 v5, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5 +; GFX9-NEXT: v_pk_min_i16 v5, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v5, v11, v5 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 +; GFX9-NEXT: v_pk_max_i16 v4, v2, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 ; GFX9-NEXT: v_pk_max_i16 v5, v5, v6 ; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 -; GFX9-NEXT: v_pk_min_i16 v5, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, s5, v5 +; GFX9-NEXT: v_pk_min_i16 v5, v3, 0 +; GFX9-NEXT: v_pk_sub_i16 v5, v11, v5 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, s4, v4 +; GFX9-NEXT: v_pk_max_i16 v4, v3, 0 +; GFX9-NEXT: v_pk_sub_i16 v4, v9, v4 ; GFX9-NEXT: v_pk_max_i16 v5, v5, v7 ; GFX9-NEXT: v_pk_min_i16 v4, v5, v4 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 @@ -6477,30 +6394,25 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, 0, 0 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: v_pk_min_i16 v8, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, 
s5, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v1, s4 -; GFX10-NEXT: v_pk_min_i16 v12, v3, s4 -; GFX10-NEXT: v_pk_max_i16 v9, v0, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, s5, v8 -; GFX10-NEXT: v_pk_min_i16 v8, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v11, s5, v11 -; GFX10-NEXT: v_pk_sub_i16 v12, s5, v12 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s4 -; GFX10-NEXT: v_pk_max_i16 v13, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v8, s5, v8 -; GFX10-NEXT: v_pk_max_i16 v14, v3, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_min_i16 v8, v0, 0 +; GFX10-NEXT: v_pk_min_i16 v11, v1, 0 +; GFX10-NEXT: v_pk_min_i16 v12, v3, 0 +; GFX10-NEXT: v_pk_max_i16 v9, v0, 0 +; GFX10-NEXT: v_pk_max_i16 v10, v1, 0 +; GFX10-NEXT: v_pk_sub_i16 v15, 0x80008000, v8 +; GFX10-NEXT: v_pk_min_i16 v8, v2, 0 +; GFX10-NEXT: v_pk_sub_i16 v11, 0x80008000, v11 +; GFX10-NEXT: v_pk_sub_i16 v12, 0x80008000, v12 +; GFX10-NEXT: v_pk_max_i16 v13, v2, 0 +; GFX10-NEXT: v_pk_max_i16 v14, v3, 0 +; GFX10-NEXT: v_pk_sub_i16 v8, 0x80008000, v8 ; GFX10-NEXT: v_pk_max_i16 v5, v11, v5 -; GFX10-NEXT: v_pk_sub_i16 v9, s6, v9 -; GFX10-NEXT: v_pk_sub_i16 v10, s6, v10 +; GFX10-NEXT: v_pk_sub_i16 v10, 0x7fff7fff, v10 +; GFX10-NEXT: v_pk_sub_i16 v9, 0x7fff7fff, v9 +; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 ; GFX10-NEXT: v_pk_max_i16 v6, v8, v6 -; GFX10-NEXT: v_pk_sub_i16 v11, s6, v13 -; GFX10-NEXT: v_pk_sub_i16 v8, s6, v14 +; GFX10-NEXT: v_pk_sub_i16 v11, 0x7fff7fff, v13 +; GFX10-NEXT: v_pk_sub_i16 v8, 0x7fff7fff, v14 ; GFX10-NEXT: v_pk_max_i16 v7, v12, v7 ; GFX10-NEXT: v_pk_min_i16 v15, v4, v9 ; GFX10-NEXT: v_pk_min_i16 v19, v5, v10 @@ -6814,138 +6726,134 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_saddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s10, 0, 0 -; GFX9-NEXT: s_sext_i32_i16 s13, s10 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_ashr_i32 s12, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 
s10, s0 +; GFX9-NEXT: s_ashr_i32 s11, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s12, 0 +; GFX9-NEXT: s_cmp_gt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s13, s10, s12 +; GFX9-NEXT: s_cmp_gt_i32 s11, 0 +; GFX9-NEXT: s_cselect_b32 s14, s11, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s15, s13, 16 +; GFX9-NEXT: s_movk_i32 s14, 0x7fff +; GFX9-NEXT: s_sub_i32 s13, s8, s13 +; GFX9-NEXT: s_sub_i32 s15, s14, s15 +; GFX9-NEXT: s_cmp_lt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s10, s10, s12 +; GFX9-NEXT: s_cmp_lt_i32 s11, 0 +; GFX9-NEXT: s_cselect_b32 s11, s11, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX9-NEXT: s_mov_b32 s9, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s15, s10, 16 +; GFX9-NEXT: s_mov_b32 s11, 0x8000 +; GFX9-NEXT: s_sub_i32 s10, s9, s10 +; GFX9-NEXT: s_sub_i32 s15, s11, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s15 +; GFX9-NEXT: s_sext_i32_i16 s15, s10 +; GFX9-NEXT: s_sext_i32_i16 s16, s4 ; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s14, s11, s13 -; GFX9-NEXT: s_cmp_gt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_cselect_b32 s15, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s8 -; GFX9-NEXT: s_lshr_b32 s16, s14, 16 -; GFX9-NEXT: s_lshr_b32 s15, s8, 16 -; GFX9-NEXT: s_sub_i32 s14, s8, s14 -; GFX9-NEXT: s_sub_i32 s16, s15, s16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s11, s11, s13 -; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s9, 0x8000 -; GFX9-NEXT: s_cselect_b32 s12, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16 -; GFX9-NEXT: s_lshr_b32 s16, s11, 16 -; GFX9-NEXT: s_lshr_b32 s12, s9, 16 -; GFX9-NEXT: s_sub_i32 s11, s9, s11 -; GFX9-NEXT: s_sub_i32 s16, s12, s16 -; GFX9-NEXT: s_pack_ll_b32_b16 
s11, s11, s16 -; GFX9-NEXT: s_sext_i32_i16 s16, s11 -; GFX9-NEXT: s_sext_i32_i16 s17, s4 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s16, s17 -; GFX9-NEXT: s_cselect_b32 s16, s16, s17 -; GFX9-NEXT: s_cmp_gt_i32 s11, s4 -; GFX9-NEXT: s_cselect_b32 s4, s11, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s4 -; GFX9-NEXT: s_sext_i32_i16 s11, s4 -; GFX9-NEXT: s_sext_i32_i16 s16, s14 +; GFX9-NEXT: s_cmp_gt_i32 s15, s16 +; GFX9-NEXT: s_cselect_b32 s15, s15, s16 +; GFX9-NEXT: s_cmp_gt_i32 s10, s4 +; GFX9-NEXT: s_cselect_b32 s4, s10, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s15, s4 +; GFX9-NEXT: s_sext_i32_i16 s10, s4 +; GFX9-NEXT: s_sext_i32_i16 s15, s13 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s14, s14, 16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s16 -; GFX9-NEXT: s_cselect_b32 s11, s11, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s14 -; GFX9-NEXT: s_cselect_b32 s4, s4, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 -; GFX9-NEXT: s_lshr_b32 s11, s0, 16 -; GFX9-NEXT: s_lshr_b32 s14, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s11, s11, s14 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s11 -; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_ashr_i32 s11, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s14, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s16, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16 -; GFX9-NEXT: s_lshr_b32 s16, s14, 16 -; GFX9-NEXT: s_sub_i32 s14, s8, s14 -; GFX9-NEXT: s_sub_i32 s16, s15, s16 +; GFX9-NEXT: s_ashr_i32 s13, s13, 16 +; GFX9-NEXT: s_cmp_lt_i32 s10, s15 +; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_lt_i32 s4, s13 ; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s11, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX9-NEXT: s_lshr_b32 s10, s0, 16 +; GFX9-NEXT: 
s_lshr_b32 s13, s4, 16 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_add_i32 s10, s10, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s10 +; GFX9-NEXT: s_sext_i32_i16 s4, s1 +; GFX9-NEXT: s_ashr_i32 s10, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s13, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s15, s10, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX9-NEXT: s_lshr_b32 s15, s13, 16 +; GFX9-NEXT: s_sub_i32 s13, s8, s13 +; GFX9-NEXT: s_sub_i32 s15, s14, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s10, 0 +; GFX9-NEXT: s_cselect_b32 s10, s10, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_sub_i32 s4, s9, s4 -; GFX9-NEXT: s_sub_i32 s11, s12, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s16 -; GFX9-NEXT: s_sext_i32_i16 s11, s4 -; GFX9-NEXT: s_sext_i32_i16 s16, s5 +; GFX9-NEXT: s_sub_i32 s10, s11, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s15 +; GFX9-NEXT: s_sext_i32_i16 s10, s4 +; GFX9-NEXT: s_sext_i32_i16 s15, s5 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s16 -; GFX9-NEXT: s_cselect_b32 s11, s11, s16 +; GFX9-NEXT: s_cmp_gt_i32 s10, s15 +; GFX9-NEXT: s_cselect_b32 s10, s10, s15 ; GFX9-NEXT: s_cmp_gt_i32 s4, s5 ; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s11, s14 +; GFX9-NEXT: s_sext_i32_i16 s10, s13 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s14, s14, 16 -; GFX9-NEXT: s_cmp_lt_i32 s5, s11 -; GFX9-NEXT: s_cselect_b32 s5, s5, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s14 -; GFX9-NEXT: s_cselect_b32 s4, s4, s14 +; GFX9-NEXT: s_ashr_i32 s13, s13, 16 +; GFX9-NEXT: s_cmp_lt_i32 s5, s10 +; GFX9-NEXT: 
s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_cmp_lt_i32 s4, s13 +; GFX9-NEXT: s_cselect_b32 s4, s4, s13 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 ; GFX9-NEXT: s_add_i32 s1, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s5, s5, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_sext_i32_i16 s4, s2 ; GFX9-NEXT: s_ashr_i32 s5, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s11, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s14, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s14 -; GFX9-NEXT: s_lshr_b32 s14, s11, 16 -; GFX9-NEXT: s_sub_i32 s11, s8, s11 -; GFX9-NEXT: s_sub_i32 s14, s15, s14 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s10, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s13, s5, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s10, 16 +; GFX9-NEXT: s_sub_i32 s10, s8, s10 +; GFX9-NEXT: s_sub_i32 s13, s14, s13 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_sub_i32 s4, s9, s4 -; GFX9-NEXT: s_sub_i32 s5, s12, s5 +; GFX9-NEXT: s_sub_i32 s5, s11, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s13 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s14, s6 +; GFX9-NEXT: s_sext_i32_i16 s13, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s14 -; GFX9-NEXT: s_cselect_b32 s5, s5, s14 +; GFX9-NEXT: s_cmp_gt_i32 s5, s13 +; 
GFX9-NEXT: s_cselect_b32 s5, s5, s13 ; GFX9-NEXT: s_cmp_gt_i32 s4, s6 ; GFX9-NEXT: s_cselect_b32 s4, s4, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 -; GFX9-NEXT: s_sext_i32_i16 s6, s11 +; GFX9-NEXT: s_sext_i32_i16 s6, s10 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_ashr_i32 s10, s10, 16 ; GFX9-NEXT: s_cmp_lt_i32 s5, s6 ; GFX9-NEXT: s_cselect_b32 s5, s5, s6 -; GFX9-NEXT: s_cmp_lt_i32 s4, s11 -; GFX9-NEXT: s_cselect_b32 s4, s4, s11 +; GFX9-NEXT: s_cmp_lt_i32 s4, s10 +; GFX9-NEXT: s_cselect_b32 s4, s4, s10 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16 @@ -6954,22 +6862,22 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_ashr_i32 s5, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s6, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s11, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s6, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s10, s5, 0 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s10 +; GFX9-NEXT: s_lshr_b32 s10, s6, 16 ; GFX9-NEXT: s_sub_i32 s6, s8, s6 -; GFX9-NEXT: s_sub_i32 s8, s15, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_sub_i32 s8, s14, s10 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, 0 +; GFX9-NEXT: s_cselect_b32 s5, s5, 0 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_sub_i32 s4, s9, s4 -; GFX9-NEXT: s_sub_i32 s5, s12, s5 +; GFX9-NEXT: s_sub_i32 s5, s11, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 
s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 @@ -6999,188 +6907,184 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX10-LABEL: s_saddsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, 0, 0 -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_sext_i32_i16 s11, s8 +; GFX10-NEXT: s_sext_i32_i16 s8, s0 +; GFX10-NEXT: s_sext_i32_i16 s9, 0 ; GFX10-NEXT: s_ashr_i32 s10, s0, 16 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 -; GFX10-NEXT: s_movk_i32 s13, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s13, s13 -; GFX10-NEXT: s_cselect_b32 s14, s10, s8 -; GFX10-NEXT: s_movk_i32 s16, 0x8000 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX10-NEXT: s_lshr_b32 s14, s13, 16 -; GFX10-NEXT: s_lshr_b32 s15, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s13, s12 -; GFX10-NEXT: s_sub_i32 s15, s14, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_sext_i32_i16 s18, s4 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_lshr_b32 s10, s16, 16 -; GFX10-NEXT: s_lshr_b32 s17, s9, 16 -; GFX10-NEXT: s_sub_i32 s9, s16, s9 -; GFX10-NEXT: s_sub_i32 s17, s10, s17 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 +; GFX10-NEXT: s_mov_b32 s13, 0x7fff7fff +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, 0 +; GFX10-NEXT: s_mov_b32 s15, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s12, s10, 0 +; GFX10-NEXT: s_sext_i32_i16 s17, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX10-NEXT: s_movk_i32 s12, 0x7fff +; GFX10-NEXT: s_lshr_b32 s14, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s13, s11 +; GFX10-NEXT: s_sub_i32 s14, s12, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 ; GFX10-NEXT: ; 
implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s17 -; GFX10-NEXT: s_sext_i32_i16 s17, s9 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 -; GFX10-NEXT: s_cmp_gt_i32 s9, s4 -; GFX10-NEXT: s_sext_i32_i16 s18, s5 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s12, s15 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s17, s4 -; GFX10-NEXT: s_sext_i32_i16 s15, s9 -; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, 0 +; GFX10-NEXT: s_cselect_b32 s10, s10, 0 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_cmp_lt_i32 s12, s15 -; GFX10-NEXT: s_cselect_b32 s12, s12, s15 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX10-NEXT: s_lshr_b32 s12, s4, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_mov_b32 s10, 0x8000 +; GFX10-NEXT: s_lshr_b32 s16, s8, 16 +; GFX10-NEXT: s_sub_i32 s8, s15, s8 +; GFX10-NEXT: s_sub_i32 s16, s10, s16 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s16 +; GFX10-NEXT: s_sext_i32_i16 s16, s8 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 +; GFX10-NEXT: s_cmp_gt_i32 s8, s4 +; GFX10-NEXT: s_sext_i32_i16 s17, s5 +; GFX10-NEXT: s_cselect_b32 s4, s8, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s11, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX10-NEXT: s_sext_i32_i16 s14, s8 +; GFX10-NEXT: s_sext_i32_i16 s11, s4 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_cmp_lt_i32 s11, s14 +; GFX10-NEXT: s_cselect_b32 s11, s11, s14 +; GFX10-NEXT: s_cmp_lt_i32 s4, s8 +; GFX10-NEXT: s_cselect_b32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 ; 
GFX10-NEXT: s_add_i32 s0, s0, s4 ; GFX10-NEXT: s_sext_i32_i16 s4, s1 -; GFX10-NEXT: s_add_i32 s9, s9, s12 -; GFX10-NEXT: s_ashr_i32 s12, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX10-NEXT: s_cselect_b32 s15, s4, s11 -; GFX10-NEXT: s_cmp_gt_i32 s12, s8 -; GFX10-NEXT: s_cselect_b32 s17, s12, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17 -; GFX10-NEXT: s_lshr_b32 s17, s15, 16 -; GFX10-NEXT: s_sub_i32 s15, s13, s15 -; GFX10-NEXT: s_sub_i32 s17, s14, s17 -; GFX10-NEXT: s_cmp_lt_i32 s4, s11 -; GFX10-NEXT: s_cselect_b32 s4, s4, s11 -; GFX10-NEXT: s_cmp_lt_i32 s12, s8 -; GFX10-NEXT: s_cselect_b32 s12, s12, s8 +; GFX10-NEXT: s_add_i32 s8, s8, s11 +; GFX10-NEXT: s_ashr_i32 s11, s1, 16 +; GFX10-NEXT: s_cmp_gt_i32 s4, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_cselect_b32 s14, s4, s9 +; GFX10-NEXT: s_cmp_gt_i32 s11, 0 +; GFX10-NEXT: s_cselect_b32 s16, s11, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s16 +; GFX10-NEXT: s_lshr_b32 s16, s14, 16 +; GFX10-NEXT: s_sub_i32 s14, s13, s14 +; GFX10-NEXT: s_sub_i32 s16, s12, s16 +; GFX10-NEXT: s_cmp_lt_i32 s4, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s16 +; GFX10-NEXT: s_cselect_b32 s4, s4, s9 +; GFX10-NEXT: s_cmp_lt_i32 s11, 0 +; GFX10-NEXT: s_cselect_b32 s11, s11, 0 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX10-NEXT: s_lshr_b32 s12, s4, 16 -; GFX10-NEXT: s_sub_i32 s4, s16, s4 -; GFX10-NEXT: s_sub_i32 s12, s10, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s15, s4 +; GFX10-NEXT: s_sub_i32 s11, s10, s11 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s11 +; GFX10-NEXT: s_sext_i32_i16 s11, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s12, s18 -; GFX10-NEXT: s_cselect_b32 s12, s12, s18 +; GFX10-NEXT: s_cmp_gt_i32 s11, s17 +; GFX10-NEXT: s_cselect_b32 
s11, s11, s17 ; GFX10-NEXT: s_cmp_gt_i32 s4, s5 -; GFX10-NEXT: s_sext_i32_i16 s18, s6 +; GFX10-NEXT: s_sext_i32_i16 s17, s6 ; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s15, s17 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX10-NEXT: s_sext_i32_i16 s15, s5 -; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_sext_i32_i16 s5, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX10-NEXT: s_ashr_i32 s11, s14, 16 +; GFX10-NEXT: s_sext_i32_i16 s14, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s12, s15 -; GFX10-NEXT: s_cselect_b32 s12, s12, s15 -; GFX10-NEXT: s_cmp_lt_i32 s4, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s5 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX10-NEXT: s_lshr_b32 s12, s4, 16 -; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_add_i32 s5, s5, s12 -; GFX10-NEXT: s_ashr_i32 s12, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s4, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_cselect_b32 s15, s4, s11 -; GFX10-NEXT: s_cmp_gt_i32 s12, s8 -; GFX10-NEXT: s_cselect_b32 s17, s12, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17 -; GFX10-NEXT: s_lshr_b32 s17, s15, 16 -; GFX10-NEXT: s_sub_i32 s15, s13, s15 -; GFX10-NEXT: s_sub_i32 s17, s14, s17 +; GFX10-NEXT: s_cmp_lt_i32 s14, s5 +; GFX10-NEXT: s_cselect_b32 s5, s14, s5 ; GFX10-NEXT: s_cmp_lt_i32 s4, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s15, s15, s17 +; GFX10-NEXT: s_sext_i32_i16 s14, s2 ; GFX10-NEXT: s_cselect_b32 s4, s4, s11 -; GFX10-NEXT: s_cmp_lt_i32 s12, s8 -; GFX10-NEXT: s_cselect_b32 s12, s12, s8 +; GFX10-NEXT: s_lshr_b32 s11, s1, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s4 +; GFX10-NEXT: s_ashr_i32 s5, s2, 16 +; GFX10-NEXT: s_lshr_b32 s16, s4, 16 +; GFX10-NEXT: s_add_i32 s1, s1, s4 +; GFX10-NEXT: s_add_i32 s11, s11, s16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s11 +; GFX10-NEXT: 
s_cselect_b32 s4, s14, s9 +; GFX10-NEXT: s_cmp_gt_i32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s16, s5, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16 +; GFX10-NEXT: s_lshr_b32 s16, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s13, s4 +; GFX10-NEXT: s_sub_i32 s16, s12, s16 +; GFX10-NEXT: s_cmp_lt_i32 s14, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16 +; GFX10-NEXT: s_cselect_b32 s14, s14, s9 +; GFX10-NEXT: s_cmp_lt_i32 s5, 0 +; GFX10-NEXT: s_cselect_b32 s5, s5, 0 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX10-NEXT: s_lshr_b32 s12, s4, 16 -; GFX10-NEXT: s_sub_i32 s4, s16, s4 -; GFX10-NEXT: s_sub_i32 s12, s10, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 -; GFX10-NEXT: s_sext_i32_i16 s12, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s14, s5 +; GFX10-NEXT: s_lshr_b32 s14, s5, 16 +; GFX10-NEXT: s_sub_i32 s5, s15, s5 +; GFX10-NEXT: s_sub_i32 s14, s10, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s14 +; GFX10-NEXT: s_sext_i32_i16 s14, s5 +; GFX10-NEXT: s_ashr_i32 s5, s5, 16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s17 +; GFX10-NEXT: s_cselect_b32 s14, s14, s17 +; GFX10-NEXT: s_cmp_gt_i32 s5, s6 +; GFX10-NEXT: s_cselect_b32 s5, s5, s6 +; GFX10-NEXT: s_sext_i32_i16 s6, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s14, s5 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s12, s18 -; GFX10-NEXT: s_cselect_b32 s12, s12, s18 -; GFX10-NEXT: s_cmp_gt_i32 s4, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s6 -; GFX10-NEXT: s_sext_i32_i16 s6, s15 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX10-NEXT: s_ashr_i32 s12, s15, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s4 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s15, s6 -; GFX10-NEXT: s_cselect_b32 s6, s15, s6 -; GFX10-NEXT: s_cmp_lt_i32 s4, s12 -; GFX10-NEXT: s_sext_i32_i16 s15, s3 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_lshr_b32 s12, s2, 16 +; GFX10-NEXT: s_sext_i32_i16 s14, s5 +; GFX10-NEXT: s_ashr_i32 s5, s5, 16 +; GFX10-NEXT: s_cmp_lt_i32 s14, s6 +; GFX10-NEXT: 
s_cselect_b32 s6, s14, s6 +; GFX10-NEXT: s_cmp_lt_i32 s5, s4 +; GFX10-NEXT: s_sext_i32_i16 s14, s3 +; GFX10-NEXT: s_cselect_b32 s4, s5, s4 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s4 ; GFX10-NEXT: s_ashr_i32 s6, s3, 16 -; GFX10-NEXT: s_lshr_b32 s17, s4, 16 +; GFX10-NEXT: s_lshr_b32 s16, s4, 16 ; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_add_i32 s12, s12, s17 -; GFX10-NEXT: s_cmp_gt_i32 s15, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s12 -; GFX10-NEXT: s_cselect_b32 s4, s15, s11 -; GFX10-NEXT: s_cmp_gt_i32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s17, s6, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s17 -; GFX10-NEXT: s_lshr_b32 s17, s4, 16 +; GFX10-NEXT: s_add_i32 s5, s5, s16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX10-NEXT: s_cselect_b32 s4, s14, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, 0 +; GFX10-NEXT: s_cselect_b32 s16, s6, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s16 +; GFX10-NEXT: s_lshr_b32 s16, s4, 16 ; GFX10-NEXT: s_sub_i32 s4, s13, s4 -; GFX10-NEXT: s_sub_i32 s13, s14, s17 -; GFX10-NEXT: s_cmp_lt_i32 s15, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s13 -; GFX10-NEXT: s_cselect_b32 s11, s15, s11 -; GFX10-NEXT: s_cmp_lt_i32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s11, s6 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_sub_i32 s6, s16, s6 -; GFX10-NEXT: s_sub_i32 s8, s10, s8 +; GFX10-NEXT: s_sub_i32 s12, s12, s16 +; GFX10-NEXT: s_cmp_lt_i32 s14, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s12 +; GFX10-NEXT: s_cselect_b32 s9, s14, s9 +; GFX10-NEXT: s_cmp_lt_i32 s6, 0 +; GFX10-NEXT: s_cselect_b32 s6, s6, 0 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s6 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: s_sub_i32 s6, s15, s6 +; GFX10-NEXT: s_sub_i32 s9, s10, s9 ; GFX10-NEXT: s_sext_i32_i16 s10, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: 
s_sext_i32_i16 s8, s6 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 +; GFX10-NEXT: s_cmp_gt_i32 s9, s10 +; GFX10-NEXT: s_cselect_b32 s9, s9, s10 ; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_cselect_b32 s6, s6, s7 ; GFX10-NEXT: s_sext_i32_i16 s7, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s8, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s9, s6 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_sext_i32_i16 s8, s6 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s7 -; GFX10-NEXT: s_cselect_b32 s7, s8, s7 +; GFX10-NEXT: s_cmp_lt_i32 s9, s7 +; GFX10-NEXT: s_cselect_b32 s7, s9, s7 ; GFX10-NEXT: s_cmp_lt_i32 s6, s4 ; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshr_b32 s6, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s7, s4 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_add_i32 s3, s3, s4 -; GFX10-NEXT: s_add_i32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 +; GFX10-NEXT: s_add_i32 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s6 ; GFX10-NEXT: ; return to shader part epilog %result = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) %cast = bitcast <8 x i16> %result to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index ed1fe7af5f36..015f6b5de8b0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -529,8 +529,7 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) { ; GFX9-LABEL: v_shl_v2i16_15: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, 15, 15 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, 15, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = shl <2 x i16> %value, ret <2 
x i16> %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 3e1778bcb881..ac2a75383cba 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4512,15 +4512,12 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v2, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s5 +; GFX9-NEXT: v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 +; GFX9-NEXT: v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v4, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 ; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 @@ -4530,16 +4527,11 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v2, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_min_i16 v3, v0, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v2, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v3, v0, -1 op_sel_hi:[1,0] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v2, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s6 -; GFX10-NEXT: v_pk_sub_i16 v3, v3, s4 +; GFX10-NEXT: v_pk_sub_i16 v2, v2, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v3, v3, 
0x80008000 ; GFX10-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v3 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 @@ -4635,53 +4627,45 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_ssubsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s7, s4 -; GFX9-NEXT: s_sext_i32_i16 s5, s0 -; GFX9-NEXT: s_ashr_i32 s6, s0, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s8, s5, s7 -; GFX9-NEXT: s_cmp_gt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s2, 0x7fff -; GFX9-NEXT: s_cselect_b32 s9, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s10, s2, 16 -; GFX9-NEXT: s_lshr_b32 s9, s8, 16 -; GFX9-NEXT: s_sub_i32 s2, s8, s2 -; GFX9-NEXT: s_sub_i32 s8, s9, s10 -; GFX9-NEXT: s_cmp_lt_i32 s5, s7 -; GFX9-NEXT: s_cselect_b32 s5, s5, s7 -; GFX9-NEXT: s_cmp_lt_i32 s6, s4 -; GFX9-NEXT: s_movk_i32 s3, 0x8000 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s3 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_sub_i32 s3, s4, s3 -; GFX9-NEXT: s_sub_i32 s4, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_sext_i32_i16 s5, s1 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s5 -; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_gt_i32 s2, s1 -; GFX9-NEXT: s_cselect_b32 s1, s2, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s1 -; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_sext_i32_i16 s4, s3 -; GFX9-NEXT: s_ashr_i32 s1, s1, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: s_sext_i32_i16 s2, s0 +; GFX9-NEXT: s_ashr_i32 s3, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s4, -1 +; GFX9-NEXT: s_cmp_gt_i32 s2, s4 +; 
GFX9-NEXT: s_cselect_b32 s5, s2, s4 +; GFX9-NEXT: s_cmp_gt_i32 s3, -1 +; GFX9-NEXT: s_cselect_b32 s6, s3, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_lshr_b32 s6, s5, 16 +; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff7fff +; GFX9-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX9-NEXT: s_cmp_lt_i32 s2, s4 ; GFX9-NEXT: s_cselect_b32 s2, s2, s4 -; GFX9-NEXT: s_cmp_lt_i32 s1, s3 -; GFX9-NEXT: s_cselect_b32 s1, s1, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s1 +; GFX9-NEXT: s_cmp_lt_i32 s3, -1 +; GFX9-NEXT: s_cselect_b32 s3, s3, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_sub_i32 s2, s2, 0x80008000 +; GFX9-NEXT: s_sub_i32 s3, s3, 0x8000 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 +; GFX9-NEXT: s_sext_i32_i16 s3, s5 +; GFX9-NEXT: s_ashr_i32 s4, s5, 16 +; GFX9-NEXT: s_sext_i32_i16 s5, s1 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s5 +; GFX9-NEXT: s_cselect_b32 s3, s3, s5 +; GFX9-NEXT: s_cmp_gt_i32 s4, s1 +; GFX9-NEXT: s_cselect_b32 s1, s4, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 +; GFX9-NEXT: s_sext_i32_i16 s3, s1 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_ashr_i32 s1, s1, 16 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: s_cmp_lt_i32 s3, s4 +; GFX9-NEXT: s_cselect_b32 s3, s3, s4 +; GFX9-NEXT: s_cmp_lt_i32 s1, s2 +; GFX9-NEXT: s_cselect_b32 s1, s1, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s1 @@ -4691,47 +4675,39 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX10-LABEL: s_ssubsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s2, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s3, s0 -; GFX10-NEXT: s_sext_i32_i16 s5, s2 +; GFX10-NEXT: s_sext_i32_i16 s2, s0 +; GFX10-NEXT: s_sext_i32_i16 s3, -1 ; GFX10-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 
s3, s5 -; GFX10-NEXT: s_movk_i32 s8, 0x7fff -; GFX10-NEXT: s_cselect_b32 s6, s3, s5 -; GFX10-NEXT: s_cmp_gt_i32 s4, s2 +; GFX10-NEXT: s_cmp_gt_i32 s2, s3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s8, s8 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_lshr_b32 s9, s7, 16 -; GFX10-NEXT: s_sub_i32 s6, s6, s7 -; GFX10-NEXT: s_sub_i32 s7, s8, s9 -; GFX10-NEXT: s_cmp_lt_i32 s3, s5 -; GFX10-NEXT: s_cselect_b32 s3, s3, s5 -; GFX10-NEXT: s_cmp_lt_i32 s4, s2 -; GFX10-NEXT: s_movk_i32 s5, 0x8000 -; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 -; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_sub_i32 s2, s2, s4 -; GFX10-NEXT: s_sub_i32 s4, s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s3 +; GFX10-NEXT: s_cselect_b32 s5, s2, s3 +; GFX10-NEXT: s_cmp_gt_i32 s4, -1 +; GFX10-NEXT: s_cselect_b32 s6, s4, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 ; GFX10-NEXT: s_sext_i32_i16 s6, s1 -; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s6 +; GFX10-NEXT: s_lshr_b32 s7, s5, 16 +; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff7fff +; GFX10-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX10-NEXT: s_cmp_lt_i32 s2, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s7 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: s_cmp_lt_i32 s4, -1 +; GFX10-NEXT: s_sext_i32_i16 s3, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, -1 +; GFX10-NEXT: s_ashr_i32 s5, s5, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_cselect_b32 s5, s5, s6 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 +; GFX10-NEXT: s_ashr_i32 s1, s1, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_sub_i32 s2, s2, 0x80008000 +; GFX10-NEXT: s_sub_i32 s4, s4, 0x8000 +; GFX10-NEXT: s_cmp_gt_i32 s3, s6 +; GFX10-NEXT: 
s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_cselect_b32 s3, s3, s6 +; GFX10-NEXT: s_cmp_gt_i32 s5, s1 ; GFX10-NEXT: s_sext_i32_i16 s4, s2 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_cselect_b32 s1, s5, s1 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s5, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s1 ; GFX10-NEXT: s_sext_i32_i16 s3, s1 ; GFX10-NEXT: s_ashr_i32 s1, s1, 16 ; GFX10-NEXT: s_cmp_lt_i32 s3, s4 @@ -4819,72 +4795,56 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: ssubsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s3, -1, -1 -; GFX9-NEXT: s_sext_i32_i16 s6, s3 -; GFX9-NEXT: s_sext_i32_i16 s4, s0 -; GFX9-NEXT: s_ashr_i32 s5, s0, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s7, s4, s6 -; GFX9-NEXT: s_cmp_gt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_cselect_b32 s8, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: s_lshr_b32 s9, s1, 16 -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_sub_i32 s1, s7, s1 -; GFX9-NEXT: s_sub_i32 s7, s8, s9 -; GFX9-NEXT: s_cmp_lt_i32 s4, s6 -; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_i32 s5, s3 -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_cselect_b32 s3, s5, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: s_lshr_b32 s5, s2, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_sub_i32 s2, s3, s2 -; GFX9-NEXT: s_sub_i32 s3, s4, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_pk_max_i16 v0, s1, v0 -; GFX9-NEXT: v_pk_min_i16 v0, v0, s2 +; GFX9-NEXT: s_sext_i32_i16 s1, s0 +; GFX9-NEXT: s_ashr_i32 s2, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s3, -1 +; GFX9-NEXT: s_cmp_gt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s4, s1, s3 +; GFX9-NEXT: s_cmp_gt_i32 s2, -1 +; 
GFX9-NEXT: s_cselect_b32 s5, s2, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_sub_i32 s4, s4, 0x7fff7fff +; GFX9-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX9-NEXT: s_cmp_lt_i32 s1, s3 +; GFX9-NEXT: s_cselect_b32 s1, s1, s3 +; GFX9-NEXT: s_cmp_lt_i32 s2, -1 +; GFX9-NEXT: s_cselect_b32 s2, s2, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_sub_i32 s1, s1, 0x80008000 +; GFX9-NEXT: s_sub_i32 s2, s2, 0x8000 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX9-NEXT: v_pk_max_i16 v0, s4, v0 +; GFX9-NEXT: v_pk_min_i16 v0, v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: ssubsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s2, s0 -; GFX10-NEXT: s_sext_i32_i16 s4, s1 +; GFX10-NEXT: s_sext_i32_i16 s1, s0 +; GFX10-NEXT: s_sext_i32_i16 s2, -1 ; GFX10-NEXT: s_ashr_i32 s3, s0, 16 -; GFX10-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s2, s4 -; GFX10-NEXT: s_movk_i32 s7, 0x7fff -; GFX10-NEXT: s_cselect_b32 s5, s2, s4 -; GFX10-NEXT: s_cmp_gt_i32 s3, s1 +; GFX10-NEXT: s_cmp_gt_i32 s1, s2 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s7 -; GFX10-NEXT: s_lshr_b32 s7, s5, 16 -; GFX10-NEXT: s_lshr_b32 s8, s6, 16 -; GFX10-NEXT: s_sub_i32 s5, s5, s6 -; GFX10-NEXT: s_sub_i32 s6, s7, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s4 -; GFX10-NEXT: s_cselect_b32 s2, s2, s4 -; GFX10-NEXT: s_cmp_lt_i32 s3, s1 -; GFX10-NEXT: s_movk_i32 s4, 0x8000 -; GFX10-NEXT: s_cselect_b32 s1, s3, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s4, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s6 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_cselect_b32 s4, s1, s2 +; 
GFX10-NEXT: s_cmp_gt_i32 s3, -1 +; GFX10-NEXT: s_cselect_b32 s5, s3, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_sub_i32 s4, s4, 0x7fff7fff +; GFX10-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX10-NEXT: s_cmp_lt_i32 s1, s2 +; GFX10-NEXT: s_cselect_b32 s1, s1, s2 +; GFX10-NEXT: s_cmp_lt_i32 s3, -1 +; GFX10-NEXT: s_cselect_b32 s2, s3, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s5 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: v_pk_max_i16 v0, s2, v0 -; GFX10-NEXT: s_sub_i32 s1, s1, s3 -; GFX10-NEXT: s_sub_i32 s2, s4, s5 +; GFX10-NEXT: s_sub_i32 s1, s1, 0x80008000 +; GFX10-NEXT: s_sub_i32 s2, s3, 0x8000 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_min_i16 v0, v0, s1 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, v0 @@ -4951,15 +4911,12 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: ssubsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s1, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s3, -1, -1 -; GFX9-NEXT: s_movk_i32 s2, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s1 -; GFX9-NEXT: v_pk_max_i16 v1, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v1, v1, s1 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX9-NEXT: v_pk_min_i16 v2, v0, s3 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s2 +; GFX9-NEXT: v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v2 +; GFX9-NEXT: v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x80008000 +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 ; GFX9-NEXT: v_pk_max_i16 v1, v1, s0 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 @@ -4967,16 +4924,11 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX10-LABEL: ssubsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX10-NEXT: s_movk_i32 s2, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v1, v0, s1 -; 
GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s2 -; GFX10-NEXT: v_pk_min_i16 v2, v0, s1 -; GFX10-NEXT: s_movk_i32 s3, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v1, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v2, v0, -1 op_sel_hi:[1,0] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_pk_sub_i16 v1, v1, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s3, s3 -; GFX10-NEXT: v_pk_sub_i16 v2, v2, s1 +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v2, v2, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v1, v1, s0 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 @@ -5098,22 +5050,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v4, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v5, v0, s6 +; GFX9-NEXT: v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v5 +; GFX9-NEXT: v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x80008000 ; GFX9-NEXT: v_pk_max_i16 v2, v4, v2 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 -; GFX9-NEXT: v_pk_min_i16 v2, v2, v5 +; GFX9-NEXT: v_pk_sub_i16 v6, v6, v7 +; GFX9-NEXT: v_pk_min_i16 v2, v2, v6 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2 -; GFX9-NEXT: v_pk_max_i16 v2, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v2, v2, s4 -; GFX9-NEXT: v_pk_min_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s5 +; GFX9-NEXT: v_pk_max_i16 v2, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v5 +; GFX9-NEXT: v_pk_min_i16 v4, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v7 ; GFX9-NEXT: v_pk_max_i16 v2, v2, v3 ; GFX9-NEXT: v_pk_min_i16 v2, v2, v4 ; GFX9-NEXT: v_pk_sub_i16 
v1, v1, v2 @@ -5123,24 +5072,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, -1, -1 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v4, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v5, v1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_min_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_min_i16 v7, v1, s5 -; GFX10-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX10-NEXT: v_pk_sub_i16 v5, v5, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v4, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v5, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v6, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v7, v1, -1 op_sel_hi:[1,0] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_sub_i16 v4, v4, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v5, v5, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v6, v6, 0x80008000 +; GFX10-NEXT: v_pk_sub_i16 v7, v7, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v11, v4, v2 -; GFX10-NEXT: v_pk_sub_i16 v6, v6, s6 -; GFX10-NEXT: v_pk_sub_i16 v4, v7, s6 -; GFX10-NEXT: v_pk_max_i16 v3, v5, v3 +; GFX10-NEXT: v_pk_max_i16 v10, v5, v3 ; GFX10-NEXT: v_pk_min_i16 v2, v11, v6 -; GFX10-NEXT: v_pk_min_i16 v3, v3, v4 +; GFX10-NEXT: v_pk_min_i16 v3, v10, v7 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v2 ; GFX10-NEXT: v_pk_sub_i16 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5306,77 +5250,73 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 +; GFX9-NEXT: s_sext_i32_i16 s6, s0 +; GFX9-NEXT: s_ashr_i32 s7, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s8, -1 +; GFX9-NEXT: s_cmp_gt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s9, s6, s8 +; GFX9-NEXT: s_cmp_gt_i32 s7, -1 +; GFX9-NEXT: s_cselect_b32 s10, s7, -1 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s10, s9, 16 +; GFX9-NEXT: s_movk_i32 s11, 0x7fff +; GFX9-NEXT: s_sub_i32 s9, s9, s4 +; GFX9-NEXT: s_sub_i32 s10, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s6, s8 +; GFX9-NEXT: s_cselect_b32 s6, s6, s8 +; GFX9-NEXT: s_cmp_lt_i32 s7, -1 +; GFX9-NEXT: s_cselect_b32 s7, s7, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX9-NEXT: s_mov_b32 s5, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_mov_b32 s10, 0x8000 +; GFX9-NEXT: s_sub_i32 s6, s6, s5 +; GFX9-NEXT: s_sub_i32 s7, s7, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX9-NEXT: s_sext_i32_i16 s7, s9 +; GFX9-NEXT: s_sext_i32_i16 s12, s2 +; GFX9-NEXT: s_ashr_i32 s9, s9, 16 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 +; GFX9-NEXT: s_cmp_gt_i32 s7, s12 +; GFX9-NEXT: s_cselect_b32 s7, s7, s12 +; GFX9-NEXT: s_cmp_gt_i32 s9, s2 +; GFX9-NEXT: s_cselect_b32 s2, s9, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2 +; GFX9-NEXT: s_sext_i32_i16 s7, s2 ; GFX9-NEXT: s_sext_i32_i16 s9, s6 -; GFX9-NEXT: s_sext_i32_i16 s7, s0 -; GFX9-NEXT: s_ashr_i32 s8, s0, 16 +; GFX9-NEXT: s_ashr_i32 s2, s2, 16 ; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s7, s9 -; GFX9-NEXT: s_cselect_b32 s10, s7, s9 -; GFX9-NEXT: s_cmp_gt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_cselect_b32 s11, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: s_lshr_b32 s11, s10, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_sub_i32 s10, s10, s4 -; GFX9-NEXT: s_sub_i32 s11, s11, s12 ; GFX9-NEXT: s_cmp_lt_i32 s7, s9 ; GFX9-NEXT: s_cselect_b32 s7, s7, s9 -; GFX9-NEXT: s_cmp_lt_i32 s8, s6 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_cselect_b32 s8, s8, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; 
GFX9-NEXT: s_lshr_b32 s11, s5, 16 -; GFX9-NEXT: s_sub_i32 s7, s7, s5 -; GFX9-NEXT: s_sub_i32 s8, s8, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX9-NEXT: s_sext_i32_i16 s8, s10 -; GFX9-NEXT: s_sext_i32_i16 s13, s2 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s8, s13 -; GFX9-NEXT: s_cselect_b32 s8, s8, s13 -; GFX9-NEXT: s_cmp_gt_i32 s10, s2 -; GFX9-NEXT: s_cselect_b32 s2, s10, s2 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX9-NEXT: s_sext_i32_i16 s8, s2 -; GFX9-NEXT: s_sext_i32_i16 s10, s7 -; GFX9-NEXT: s_ashr_i32 s2, s2, 16 -; GFX9-NEXT: s_ashr_i32 s7, s7, 16 -; GFX9-NEXT: s_cmp_lt_i32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s8, s8, s10 -; GFX9-NEXT: s_cmp_lt_i32 s2, s7 -; GFX9-NEXT: s_cselect_b32 s2, s2, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s2 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 16 +; GFX9-NEXT: s_cmp_lt_i32 s2, s6 +; GFX9-NEXT: s_cselect_b32 s2, s2, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s2 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s2 -; GFX9-NEXT: s_sub_i32 s2, s7, s8 +; GFX9-NEXT: s_sub_i32 s2, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_sext_i32_i16 s2, s1 -; GFX9-NEXT: s_ashr_i32 s7, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s8, s2, s9 -; GFX9-NEXT: s_cmp_gt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s10, s7, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s10 -; GFX9-NEXT: s_lshr_b32 s10, s8, 16 -; GFX9-NEXT: s_sub_i32 s4, s8, s4 -; GFX9-NEXT: s_sub_i32 s8, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s2, s9 -; GFX9-NEXT: s_cselect_b32 s2, s2, s9 -; GFX9-NEXT: s_cmp_lt_i32 s7, s6 -; GFX9-NEXT: s_cselect_b32 s6, s7, s6 +; GFX9-NEXT: s_ashr_i32 s6, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s7, s2, s8 +; GFX9-NEXT: s_cmp_gt_i32 s6, -1 +; GFX9-NEXT: s_cselect_b32 s9, s6, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s9 +; GFX9-NEXT: s_lshr_b32 
s9, s7, 16 +; GFX9-NEXT: s_sub_i32 s4, s7, s4 +; GFX9-NEXT: s_sub_i32 s7, s9, s11 +; GFX9-NEXT: s_cmp_lt_i32 s2, s8 +; GFX9-NEXT: s_cselect_b32 s2, s2, s8 +; GFX9-NEXT: s_cmp_lt_i32 s6, -1 +; GFX9-NEXT: s_cselect_b32 s6, s6, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s6, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX9-NEXT: s_sub_i32 s5, s6, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_sext_i32_i16 s5, s4 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -5405,80 +5345,76 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX10-LABEL: s_ssubsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s5, s0 -; GFX10-NEXT: s_sext_i32_i16 s7, s4 +; GFX10-NEXT: s_sext_i32_i16 s4, s0 +; GFX10-NEXT: s_sext_i32_i16 s5, -1 ; GFX10-NEXT: s_ashr_i32 s6, s0, 16 -; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 ; GFX10-NEXT: s_movk_i32 s10, 0x7fff -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_movk_i32 s12, 0x8000 -; GFX10-NEXT: s_cselect_b32 s9, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s10, s10 -; GFX10-NEXT: s_lshr_b32 s10, s8, 16 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 -; GFX10-NEXT: s_sub_i32 s8, s8, s9 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_sext_i32_i16 s14, s2 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 +; GFX10-NEXT: s_cselect_b32 s7, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s6, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s8, s6, -1 +; GFX10-NEXT: s_sext_i32_i16 s13, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 +; GFX10-NEXT: s_mov_b32 s8, 
0x7fff7fff +; GFX10-NEXT: s_lshr_b32 s9, s7, 16 +; GFX10-NEXT: s_sub_i32 s7, s7, s8 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s6, s6, s4 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s6, -1 +; GFX10-NEXT: s_cselect_b32 s6, s6, -1 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s8, s10 -; GFX10-NEXT: s_lshr_b32 s8, s5, 16 -; GFX10-NEXT: s_lshr_b32 s10, s12, 16 -; GFX10-NEXT: s_sext_i32_i16 s13, s6 -; GFX10-NEXT: s_sub_i32 s5, s5, s12 -; GFX10-NEXT: s_sub_i32 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s9 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 +; GFX10-NEXT: s_mov_b32 s9, 0x8000 +; GFX10-NEXT: s_sext_i32_i16 s12, s6 +; GFX10-NEXT: s_sub_i32 s4, s4, s11 +; GFX10-NEXT: s_sub_i32 s7, s7, s9 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s13, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s8 -; GFX10-NEXT: s_cselect_b32 s13, s13, s14 +; GFX10-NEXT: s_cmp_gt_i32 s12, s13 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 +; GFX10-NEXT: s_cselect_b32 s12, s12, s13 ; GFX10-NEXT: s_cmp_gt_i32 s6, s2 -; GFX10-NEXT: s_sext_i32_i16 s8, s5 +; GFX10-NEXT: s_sext_i32_i16 s7, s4 ; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s13, s2 +; GFX10-NEXT: s_ashr_i32 s4, s4, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 ; GFX10-NEXT: s_sext_i32_i16 s6, s2 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_i32 s6, s8 -; GFX10-NEXT: s_cselect_b32 s6, s6, s8 -; GFX10-NEXT: s_cmp_lt_i32 s2, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s5 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s2, s4 +; GFX10-NEXT: s_cselect_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: 
s_pack_ll_b32_b16 s2, s6, s2 ; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-NEXT: s_sub_i32 s0, s0, s2 -; GFX10-NEXT: s_sub_i32 s2, s5, s6 -; GFX10-NEXT: s_sext_i32_i16 s5, s1 +; GFX10-NEXT: s_sub_i32 s2, s4, s6 +; GFX10-NEXT: s_sext_i32_i16 s4, s1 ; GFX10-NEXT: s_ashr_i32 s6, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s5, s7 +; GFX10-NEXT: s_cmp_gt_i32 s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX10-NEXT: s_cselect_b32 s8, s5, s7 -; GFX10-NEXT: s_cmp_gt_i32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s13, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s13 -; GFX10-NEXT: s_lshr_b32 s13, s8, 16 -; GFX10-NEXT: s_sub_i32 s8, s8, s9 -; GFX10-NEXT: s_sub_i32 s9, s13, s11 -; GFX10-NEXT: s_cmp_lt_i32 s5, s7 -; GFX10-NEXT: s_cselect_b32 s5, s5, s7 -; GFX10-NEXT: s_cmp_lt_i32 s6, s4 -; GFX10-NEXT: s_cselect_b32 s4, s6, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s5, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s8, s9 +; GFX10-NEXT: s_cselect_b32 s7, s4, s5 +; GFX10-NEXT: s_cmp_gt_i32 s6, -1 +; GFX10-NEXT: s_cselect_b32 s12, s6, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX10-NEXT: s_lshr_b32 s12, s7, 16 +; GFX10-NEXT: s_sub_i32 s7, s7, s8 +; GFX10-NEXT: s_sub_i32 s8, s12, s10 +; GFX10-NEXT: s_cmp_lt_i32 s4, s5 +; GFX10-NEXT: s_cselect_b32 s4, s4, s5 +; GFX10-NEXT: s_cmp_lt_i32 s6, -1 +; GFX10-NEXT: s_cselect_b32 s5, s6, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s7, s8 ; GFX10-NEXT: s_lshr_b32 s6, s4, 16 ; GFX10-NEXT: s_sext_i32_i16 s7, s5 ; GFX10-NEXT: s_sext_i32_i16 s8, s3 -; GFX10-NEXT: s_sub_i32 s4, s4, s12 -; GFX10-NEXT: s_sub_i32 s6, s6, s10 +; GFX10-NEXT: s_sub_i32 s4, s4, s11 +; GFX10-NEXT: s_sub_i32 s6, s6, s9 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 ; GFX10-NEXT: s_cmp_gt_i32 s7, s8 @@ -5661,29 +5597,26 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 
s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v6, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 v6, v6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v7, v0, s6 +; GFX9-NEXT: v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v6, v6, v7 +; GFX9-NEXT: v_pk_min_i16 v8, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v9, 0x80008000 ; GFX9-NEXT: v_pk_max_i16 v3, v6, v3 -; GFX9-NEXT: v_pk_sub_i16 v7, v7, s5 -; GFX9-NEXT: v_pk_min_i16 v3, v3, v7 +; GFX9-NEXT: v_pk_sub_i16 v8, v8, v9 +; GFX9-NEXT: v_pk_min_i16 v3, v3, v8 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s4 -; GFX9-NEXT: v_pk_min_i16 v6, v1, s6 +; GFX9-NEXT: v_pk_max_i16 v3, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 +; GFX9-NEXT: v_pk_min_i16 v6, v1, -1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_i16 v3, v3, v4 -; GFX9-NEXT: v_pk_sub_i16 v6, v6, s5 +; GFX9-NEXT: v_pk_sub_i16 v6, v6, v9 ; GFX9-NEXT: v_pk_min_i16 v3, v3, v6 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3 -; GFX9-NEXT: v_pk_max_i16 v3, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v3, v3, s4 -; GFX9-NEXT: v_pk_min_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s5 +; GFX9-NEXT: v_pk_max_i16 v3, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v3, v3, v7 +; GFX9-NEXT: v_pk_min_i16 v4, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 ; GFX9-NEXT: v_pk_max_i16 v3, v3, v5 ; GFX9-NEXT: v_pk_min_i16 v3, v3, v4 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v3 @@ -5693,28 +5626,23 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, -1, -1 -; GFX10-NEXT: s_movk_i32 s4, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v6, v0, s5 -; GFX10-NEXT: v_pk_max_i16 v8, 
v1, s5 -; GFX10-NEXT: v_pk_max_i16 v9, v2, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX10-NEXT: v_pk_min_i16 v7, v0, s5 -; GFX10-NEXT: v_pk_sub_i16 v6, v6, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s4 -; GFX10-NEXT: v_pk_sub_i16 v19, v9, s4 -; GFX10-NEXT: v_pk_min_i16 v10, v1, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v2, s5 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v6, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v8, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v9, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v7, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v10, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v6, v6, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff +; GFX10-NEXT: v_pk_min_i16 v11, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v19, v9, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v7, v7, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v14, v6, v3 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_sub_i16 v6, v10, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: v_pk_sub_i16 v7, v7, s6 -; GFX10-NEXT: v_pk_sub_i16 v6, v10, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v11, s6 +; GFX10-NEXT: v_pk_sub_i16 v8, v11, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v5, v19, v5 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v3, v14, v7 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_pk_min_i16 v4, v4, v6 ; GFX10-NEXT: v_pk_min_i16 v5, v5, v8 ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v3 @@ -5953,120 +5881,116 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 +; GFX9-NEXT: s_sext_i32_i16 s8, s0 +; GFX9-NEXT: s_ashr_i32 s9, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s10, -1 +; GFX9-NEXT: s_cmp_gt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s11, s8, s10 +; GFX9-NEXT: s_cmp_gt_i32 s9, -1 +; GFX9-NEXT: s_cselect_b32 s12, s9, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 
+; GFX9-NEXT: s_mov_b32 s6, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_movk_i32 s13, 0x7fff +; GFX9-NEXT: s_sub_i32 s11, s11, s6 +; GFX9-NEXT: s_sub_i32 s12, s12, s13 +; GFX9-NEXT: s_cmp_lt_i32 s8, s10 +; GFX9-NEXT: s_cselect_b32 s8, s8, s10 +; GFX9-NEXT: s_cmp_lt_i32 s9, -1 +; GFX9-NEXT: s_cselect_b32 s9, s9, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_mov_b32 s7, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_mov_b32 s12, 0x8000 +; GFX9-NEXT: s_sub_i32 s8, s8, s7 +; GFX9-NEXT: s_sub_i32 s9, s9, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_sext_i32_i16 s9, s11 +; GFX9-NEXT: s_sext_i32_i16 s14, s3 +; GFX9-NEXT: s_ashr_i32 s11, s11, 16 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 +; GFX9-NEXT: s_cmp_gt_i32 s9, s14 +; GFX9-NEXT: s_cselect_b32 s9, s9, s14 +; GFX9-NEXT: s_cmp_gt_i32 s11, s3 +; GFX9-NEXT: s_cselect_b32 s3, s11, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_sext_i32_i16 s9, s3 ; GFX9-NEXT: s_sext_i32_i16 s11, s8 -; GFX9-NEXT: s_sext_i32_i16 s9, s0 -; GFX9-NEXT: s_ashr_i32 s10, s0, 16 +; GFX9-NEXT: s_ashr_i32 s3, s3, 16 ; GFX9-NEXT: s_ashr_i32 s8, s8, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s11 -; GFX9-NEXT: s_cselect_b32 s12, s9, s11 -; GFX9-NEXT: s_cmp_gt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s6, 0x7fff -; GFX9-NEXT: s_cselect_b32 s13, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s6 -; GFX9-NEXT: s_lshr_b32 s13, s12, 16 -; GFX9-NEXT: s_lshr_b32 s14, s6, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s6 -; GFX9-NEXT: s_sub_i32 s13, s13, s14 ; GFX9-NEXT: s_cmp_lt_i32 s9, s11 ; GFX9-NEXT: s_cselect_b32 s9, s9, s11 -; GFX9-NEXT: s_cmp_lt_i32 s10, s8 -; GFX9-NEXT: s_movk_i32 s7, 0x8000 -; GFX9-NEXT: s_cselect_b32 s10, s10, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s7 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; 
GFX9-NEXT: s_lshr_b32 s13, s7, 16 -; GFX9-NEXT: s_sub_i32 s9, s9, s7 -; GFX9-NEXT: s_sub_i32 s10, s10, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s10, s12 -; GFX9-NEXT: s_sext_i32_i16 s15, s3 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s10, s15 -; GFX9-NEXT: s_cselect_b32 s10, s10, s15 -; GFX9-NEXT: s_cmp_gt_i32 s12, s3 -; GFX9-NEXT: s_cselect_b32 s3, s12, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 -; GFX9-NEXT: s_sext_i32_i16 s12, s9 -; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_ashr_i32 s9, s9, 16 -; GFX9-NEXT: s_cmp_lt_i32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s10, s10, s12 -; GFX9-NEXT: s_cmp_lt_i32 s3, s9 -; GFX9-NEXT: s_cselect_b32 s3, s3, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s10, s3 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 +; GFX9-NEXT: s_cmp_lt_i32 s3, s8 +; GFX9-NEXT: s_cselect_b32 s3, s3, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 +; GFX9-NEXT: s_lshr_b32 s9, s3, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s3 -; GFX9-NEXT: s_sub_i32 s3, s9, s10 +; GFX9-NEXT: s_sub_i32 s3, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_sext_i32_i16 s3, s1 -; GFX9-NEXT: s_ashr_i32 s9, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s10, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s12, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 -; GFX9-NEXT: s_lshr_b32 s12, s10, 16 -; GFX9-NEXT: s_sub_i32 s10, s10, s6 -; GFX9-NEXT: s_sub_i32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s9, s8 -; GFX9-NEXT: s_cselect_b32 s9, s9, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s12 +; GFX9-NEXT: s_ashr_i32 s8, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 
s9, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s11, s8, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 +; GFX9-NEXT: s_lshr_b32 s11, s9, 16 +; GFX9-NEXT: s_sub_i32 s9, s9, s6 +; GFX9-NEXT: s_sub_i32 s11, s11, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, -1 +; GFX9-NEXT: s_cselect_b32 s8, s8, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s11 ; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s9, s9, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s9 -; GFX9-NEXT: s_sext_i32_i16 s9, s10 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_ashr_i32 s10, s10, 16 +; GFX9-NEXT: s_sub_i32 s8, s8, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s8 +; GFX9-NEXT: s_sext_i32_i16 s8, s9 +; GFX9-NEXT: s_sext_i32_i16 s11, s4 +; GFX9-NEXT: s_ashr_i32 s9, s9, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s9, s12 -; GFX9-NEXT: s_cselect_b32 s9, s9, s12 -; GFX9-NEXT: s_cmp_gt_i32 s10, s4 -; GFX9-NEXT: s_cselect_b32 s4, s10, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s9, s4 -; GFX9-NEXT: s_sext_i32_i16 s10, s3 +; GFX9-NEXT: s_cmp_gt_i32 s8, s11 +; GFX9-NEXT: s_cselect_b32 s8, s8, s11 +; GFX9-NEXT: s_cmp_gt_i32 s9, s4 +; GFX9-NEXT: s_cselect_b32 s4, s9, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX9-NEXT: s_sext_i32_i16 s8, s4 +; GFX9-NEXT: s_sext_i32_i16 s9, s3 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s3, s3, 16 -; GFX9-NEXT: s_cmp_lt_i32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s9, s9, s10 +; GFX9-NEXT: s_cmp_lt_i32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s8, s8, s9 ; GFX9-NEXT: s_cmp_lt_i32 s4, s3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s9, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_lshr_b32 s8, s3, 16 ; GFX9-NEXT: s_sub_i32 s1, 
s1, s3 -; GFX9-NEXT: s_sub_i32 s3, s4, s9 +; GFX9-NEXT: s_sub_i32 s3, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_sext_i32_i16 s3, s2 ; GFX9-NEXT: s_ashr_i32 s4, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s9, s3, s11 -; GFX9-NEXT: s_cmp_gt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s10, s4, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_sub_i32 s6, s9, s6 -; GFX9-NEXT: s_sub_i32 s9, s10, s14 -; GFX9-NEXT: s_cmp_lt_i32 s3, s11 -; GFX9-NEXT: s_cselect_b32 s3, s3, s11 -; GFX9-NEXT: s_cmp_lt_i32 s4, s8 -; GFX9-NEXT: s_cselect_b32 s4, s4, s8 +; GFX9-NEXT: s_cmp_gt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s8, s3, s10 +; GFX9-NEXT: s_cmp_gt_i32 s4, -1 +; GFX9-NEXT: s_cselect_b32 s9, s4, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_sub_i32 s6, s8, s6 +; GFX9-NEXT: s_sub_i32 s8, s9, s13 +; GFX9-NEXT: s_cmp_lt_i32 s3, s10 +; GFX9-NEXT: s_cselect_b32 s3, s3, s10 +; GFX9-NEXT: s_cmp_lt_i32 s4, -1 +; GFX9-NEXT: s_cselect_b32 s4, s4, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_sub_i32 s3, s3, s7 -; GFX9-NEXT: s_sub_i32 s4, s4, s13 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX9-NEXT: s_sub_i32 s4, s4, s12 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s6 ; GFX9-NEXT: s_sext_i32_i16 s7, s5 @@ -6095,123 +6019,119 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX10-LABEL: s_ssubsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s7, s0 -; GFX10-NEXT: s_sext_i32_i16 s9, s6 +; GFX10-NEXT: s_sext_i32_i16 s6, s0 +; GFX10-NEXT: s_sext_i32_i16 s7, -1 ; GFX10-NEXT: s_ashr_i32 s8, s0, 16 -; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_movk_i32 s12, 0x7fff -; 
GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_movk_i32 s14, 0x8000 -; GFX10-NEXT: s_cselect_b32 s11, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s14, s14, s14 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s11 -; GFX10-NEXT: s_pack_ll_b32_b16 s11, s12, s12 -; GFX10-NEXT: s_lshr_b32 s12, s10, 16 -; GFX10-NEXT: s_lshr_b32 s13, s11, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_sext_i32_i16 s16, s3 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_mov_b32 s13, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s10, s8, -1 +; GFX10-NEXT: s_sext_i32_i16 s15, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 +; GFX10-NEXT: s_mov_b32 s10, 0x7fff7fff +; GFX10-NEXT: s_lshr_b32 s11, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s8, s8, -1 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s12 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_lshr_b32 s12, s14, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s11 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: s_mov_b32 s11, 0x8000 +; GFX10-NEXT: s_sext_i32_i16 s14, s8 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s9, s9, s11 ; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 +; GFX10-NEXT: 
s_cmp_gt_i32 s14, s15 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX10-NEXT: s_cselect_b32 s14, s14, s15 ; GFX10-NEXT: s_cmp_gt_i32 s8, s3 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_cselect_b32 s3, s8, s3 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s15, s3 -; GFX10-NEXT: s_sext_i32_i16 s16, s4 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s14, s3 +; GFX10-NEXT: s_sext_i32_i16 s15, s4 ; GFX10-NEXT: s_sext_i32_i16 s8, s3 ; GFX10-NEXT: s_ashr_i32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s3, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s3, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_sub_i32 s0, s0, s3 -; GFX10-NEXT: s_sub_i32 s3, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s1 +; GFX10-NEXT: s_sub_i32 s3, s6, s8 +; GFX10-NEXT: s_sext_i32_i16 s6, s1 ; GFX10-NEXT: s_ashr_i32 s8, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s15, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s8, s8, s6 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s14, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX10-NEXT: s_lshr_b32 s14, s9, 16 +; GFX10-NEXT: 
s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s14, s14, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s8, s8, -1 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s10, s15 -; GFX10-NEXT: s_lshr_b32 s10, s7, 16 -; GFX10-NEXT: s_sext_i32_i16 s15, s8 -; GFX10-NEXT: s_sub_i32 s7, s7, s14 -; GFX10-NEXT: s_sub_i32 s10, s10, s12 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s8 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s14 +; GFX10-NEXT: s_lshr_b32 s9, s6, 16 +; GFX10-NEXT: s_sext_i32_i16 s14, s8 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s9, s9, s11 ; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s15, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s7, s10 -; GFX10-NEXT: s_cselect_b32 s15, s15, s16 +; GFX10-NEXT: s_cmp_gt_i32 s14, s15 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s9 +; GFX10-NEXT: s_cselect_b32 s14, s14, s15 ; GFX10-NEXT: s_cmp_gt_i32 s8, s4 -; GFX10-NEXT: s_sext_i32_i16 s10, s7 +; GFX10-NEXT: s_sext_i32_i16 s9, s6 ; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_ashr_i32 s7, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s15, s4 +; GFX10-NEXT: s_ashr_i32 s6, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s14, s4 ; GFX10-NEXT: s_sext_i32_i16 s8, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s8, s10 -; GFX10-NEXT: s_cselect_b32 s8, s8, s10 -; GFX10-NEXT: s_cmp_lt_i32 s4, s7 -; GFX10-NEXT: s_cselect_b32 s4, s4, s7 -; GFX10-NEXT: s_lshr_b32 s7, s1, 16 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s4, s6 +; GFX10-NEXT: s_cselect_b32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 ; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_sub_i32 s1, s1, s4 -; GFX10-NEXT: s_sub_i32 s4, s7, s8 -; GFX10-NEXT: s_sext_i32_i16 s7, s2 +; GFX10-NEXT: s_sub_i32 s4, s6, s8 +; 
GFX10-NEXT: s_sext_i32_i16 s6, s2 ; GFX10-NEXT: s_ashr_i32 s8, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s7, s9 +; GFX10-NEXT: s_cmp_gt_i32 s6, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s10, s7, s9 -; GFX10-NEXT: s_cmp_gt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s15, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s10, s15 -; GFX10-NEXT: s_lshr_b32 s15, s10, 16 -; GFX10-NEXT: s_sub_i32 s10, s10, s11 -; GFX10-NEXT: s_sub_i32 s11, s15, s13 -; GFX10-NEXT: s_cmp_lt_i32 s7, s9 -; GFX10-NEXT: s_cselect_b32 s7, s7, s9 -; GFX10-NEXT: s_cmp_lt_i32 s8, s6 -; GFX10-NEXT: s_cselect_b32 s6, s8, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s7, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s7, s10, s11 +; GFX10-NEXT: s_cselect_b32 s9, s6, s7 +; GFX10-NEXT: s_cmp_gt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s14, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s14 +; GFX10-NEXT: s_lshr_b32 s14, s9, 16 +; GFX10-NEXT: s_sub_i32 s9, s9, s10 +; GFX10-NEXT: s_sub_i32 s10, s14, s12 +; GFX10-NEXT: s_cmp_lt_i32 s6, s7 +; GFX10-NEXT: s_cselect_b32 s6, s6, s7 +; GFX10-NEXT: s_cmp_lt_i32 s8, -1 +; GFX10-NEXT: s_cselect_b32 s7, s8, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s7, s9, s10 ; GFX10-NEXT: s_lshr_b32 s8, s6, 16 ; GFX10-NEXT: s_sext_i32_i16 s9, s7 ; GFX10-NEXT: s_sext_i32_i16 s10, s5 -; GFX10-NEXT: s_sub_i32 s6, s6, s14 -; GFX10-NEXT: s_sub_i32 s8, s8, s12 +; GFX10-NEXT: s_sub_i32 s6, s6, s13 +; GFX10-NEXT: s_sub_i32 s8, s8, s11 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 ; GFX10-NEXT: s_cmp_gt_i32 s9, s10 @@ -6423,36 +6343,33 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_ssubsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_movk_i32 s5, 0x8000 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_pk_max_i16 v8, v0, s6 -; GFX9-NEXT: v_pk_sub_i16 
v8, v8, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX9-NEXT: v_pk_min_i16 v9, v0, s6 +; GFX9-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fff7fff +; GFX9-NEXT: v_pk_sub_i16 v8, v8, v9 +; GFX9-NEXT: v_pk_min_i16 v10, v0, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_mov_b32_e32 v11, 0x80008000 ; GFX9-NEXT: v_pk_max_i16 v4, v8, v4 -; GFX9-NEXT: v_pk_sub_i16 v9, v9, s5 -; GFX9-NEXT: v_pk_min_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_sub_i16 v10, v10, v11 +; GFX9-NEXT: v_pk_min_i16 v4, v4, v10 ; GFX9-NEXT: v_pk_sub_i16 v0, v0, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v1, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v8, v1, s6 +; GFX9-NEXT: v_pk_max_i16 v4, v1, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v8, v1, -1 op_sel_hi:[1,0] ; GFX9-NEXT: v_pk_max_i16 v4, v4, v5 -; GFX9-NEXT: v_pk_sub_i16 v8, v8, s5 +; GFX9-NEXT: v_pk_sub_i16 v8, v8, v11 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v8 ; GFX9-NEXT: v_pk_sub_i16 v1, v1, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v2, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 +; GFX9-NEXT: v_pk_max_i16 v4, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v5, v2, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v6 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_sub_i16 v2, v2, v4 -; GFX9-NEXT: v_pk_max_i16 v4, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v4, v4, s4 -; GFX9-NEXT: v_pk_min_i16 v5, v3, s6 -; GFX9-NEXT: v_pk_sub_i16 v5, v5, s5 +; GFX9-NEXT: v_pk_max_i16 v4, v3, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v4, v4, v9 +; GFX9-NEXT: v_pk_min_i16 v5, v3, -1 op_sel_hi:[1,0] +; GFX9-NEXT: v_pk_sub_i16 v5, v5, v11 ; GFX9-NEXT: v_pk_max_i16 v4, v4, v7 ; GFX9-NEXT: v_pk_min_i16 v4, v4, v5 ; GFX9-NEXT: v_pk_sub_i16 v3, v3, v4 @@ -6462,30 +6379,25 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_movk_i32 s5, 0x7fff -; GFX10-NEXT: v_pk_max_i16 v8, v0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s5, s5 -; GFX10-NEXT: v_pk_max_i16 v10, v1, s4 -; GFX10-NEXT: v_pk_max_i16 v12, v3, s4 -; GFX10-NEXT: v_pk_min_i16 v9, v0, s4 -; GFX10-NEXT: v_pk_sub_i16 v15, v8, s5 -; GFX10-NEXT: v_pk_max_i16 v8, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v10, v10, s5 -; GFX10-NEXT: v_pk_sub_i16 v12, v12, s5 -; GFX10-NEXT: v_pk_min_i16 v11, v1, s4 -; GFX10-NEXT: v_pk_min_i16 v13, v2, s4 -; GFX10-NEXT: v_pk_sub_i16 v8, v8, s5 -; GFX10-NEXT: v_pk_min_i16 v14, v3, s4 -; GFX10-NEXT: s_movk_i32 s6, 0x8000 +; GFX10-NEXT: v_pk_max_i16 v8, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v10, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_max_i16 v12, v3, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v9, v0, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v11, v1, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v15, v8, 0x7fff7fff +; GFX10-NEXT: v_pk_max_i16 v8, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v10, v10, 0x7fff7fff +; GFX10-NEXT: v_pk_sub_i16 v12, v12, 0x7fff7fff +; GFX10-NEXT: v_pk_min_i16 v13, v2, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_min_i16 v14, v3, -1 op_sel_hi:[1,0] +; GFX10-NEXT: v_pk_sub_i16 v8, v8, 0x7fff7fff ; GFX10-NEXT: v_pk_max_i16 v4, v15, v4 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s6, s6 +; GFX10-NEXT: v_pk_sub_i16 v9, v9, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v5, v10, v5 -; GFX10-NEXT: v_pk_sub_i16 v11, v11, s6 -; GFX10-NEXT: v_pk_sub_i16 v9, v9, s6 +; GFX10-NEXT: v_pk_sub_i16 v11, v11, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v15, v8, v6 -; GFX10-NEXT: v_pk_sub_i16 v10, v13, s6 -; GFX10-NEXT: v_pk_sub_i16 v8, v14, s6 +; GFX10-NEXT: v_pk_sub_i16 v10, v13, 0x80008000 +; GFX10-NEXT: v_pk_sub_i16 v8, v14, 0x80008000 ; GFX10-NEXT: v_pk_max_i16 v7, v12, v7 ; GFX10-NEXT: v_pk_min_i16 v19, v4, v9 ; GFX10-NEXT: 
v_pk_min_i16 v11, v5, v11 @@ -6799,136 +6711,132 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX9-LABEL: s_ssubsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s10, -1, -1 +; GFX9-NEXT: s_sext_i32_i16 s10, s0 +; GFX9-NEXT: s_ashr_i32 s11, s0, 16 +; GFX9-NEXT: s_sext_i32_i16 s12, -1 +; GFX9-NEXT: s_cmp_gt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s13, s10, s12 +; GFX9-NEXT: s_cmp_gt_i32 s11, -1 +; GFX9-NEXT: s_cselect_b32 s14, s11, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff +; GFX9-NEXT: s_lshr_b32 s14, s13, 16 +; GFX9-NEXT: s_movk_i32 s15, 0x7fff +; GFX9-NEXT: s_sub_i32 s13, s13, s8 +; GFX9-NEXT: s_sub_i32 s14, s14, s15 +; GFX9-NEXT: s_cmp_lt_i32 s10, s12 +; GFX9-NEXT: s_cselect_b32 s10, s10, s12 +; GFX9-NEXT: s_cmp_lt_i32 s11, -1 +; GFX9-NEXT: s_cselect_b32 s11, s11, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s13, s13, s14 +; GFX9-NEXT: s_mov_b32 s9, 0x80008000 +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: s_mov_b32 s14, 0x8000 +; GFX9-NEXT: s_sub_i32 s10, s10, s9 +; GFX9-NEXT: s_sub_i32 s11, s11, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_sext_i32_i16 s11, s13 +; GFX9-NEXT: s_sext_i32_i16 s16, s4 +; GFX9-NEXT: s_ashr_i32 s13, s13, 16 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 +; GFX9-NEXT: s_cmp_gt_i32 s11, s16 +; GFX9-NEXT: s_cselect_b32 s11, s11, s16 +; GFX9-NEXT: s_cmp_gt_i32 s13, s4 +; GFX9-NEXT: s_cselect_b32 s4, s13, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_sext_i32_i16 s11, s4 ; GFX9-NEXT: s_sext_i32_i16 s13, s10 -; GFX9-NEXT: s_sext_i32_i16 s11, s0 -; GFX9-NEXT: s_ashr_i32 s12, s0, 16 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 ; GFX9-NEXT: s_ashr_i32 s10, s10, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s13 -; GFX9-NEXT: s_cselect_b32 s14, s11, s13 -; GFX9-NEXT: s_cmp_gt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s8, 0x7fff -; GFX9-NEXT: s_cselect_b32 s15, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 
s14, s14, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s8, s8, s8 -; GFX9-NEXT: s_lshr_b32 s15, s14, 16 -; GFX9-NEXT: s_lshr_b32 s16, s8, 16 -; GFX9-NEXT: s_sub_i32 s14, s14, s8 -; GFX9-NEXT: s_sub_i32 s15, s15, s16 ; GFX9-NEXT: s_cmp_lt_i32 s11, s13 ; GFX9-NEXT: s_cselect_b32 s11, s11, s13 -; GFX9-NEXT: s_cmp_lt_i32 s12, s10 -; GFX9-NEXT: s_movk_i32 s9, 0x8000 -; GFX9-NEXT: s_cselect_b32 s12, s12, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s9, s9, s9 -; GFX9-NEXT: s_pack_ll_b32_b16 s14, s14, s15 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 -; GFX9-NEXT: s_lshr_b32 s15, s9, 16 -; GFX9-NEXT: s_sub_i32 s11, s11, s9 -; GFX9-NEXT: s_sub_i32 s12, s12, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_sext_i32_i16 s12, s14 -; GFX9-NEXT: s_sext_i32_i16 s17, s4 -; GFX9-NEXT: s_ashr_i32 s14, s14, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_gt_i32 s12, s17 -; GFX9-NEXT: s_cselect_b32 s12, s12, s17 -; GFX9-NEXT: s_cmp_gt_i32 s14, s4 -; GFX9-NEXT: s_cselect_b32 s4, s14, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_sext_i32_i16 s14, s11 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_cmp_lt_i32 s12, s14 -; GFX9-NEXT: s_cselect_b32 s12, s12, s14 -; GFX9-NEXT: s_cmp_lt_i32 s4, s11 -; GFX9-NEXT: s_cselect_b32 s4, s4, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s12, s4 -; GFX9-NEXT: s_lshr_b32 s11, s0, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 +; GFX9-NEXT: s_cmp_lt_i32 s4, s10 +; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 +; GFX9-NEXT: s_lshr_b32 s10, s0, 16 +; GFX9-NEXT: s_lshr_b32 s11, s4, 16 ; GFX9-NEXT: s_sub_i32 s0, s0, s4 -; GFX9-NEXT: s_sub_i32 s4, s11, s12 +; GFX9-NEXT: s_sub_i32 s4, s10, s11 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s1 -; GFX9-NEXT: s_ashr_i32 s11, s1, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s12, s4, s13 -; GFX9-NEXT: 
s_cmp_gt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s14, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_lshr_b32 s14, s12, 16 -; GFX9-NEXT: s_sub_i32 s12, s12, s8 -; GFX9-NEXT: s_sub_i32 s14, s14, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s11, s10 -; GFX9-NEXT: s_cselect_b32 s11, s11, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s12, s12, s14 -; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s11, s11, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s11 -; GFX9-NEXT: s_sext_i32_i16 s11, s12 -; GFX9-NEXT: s_sext_i32_i16 s14, s5 -; GFX9-NEXT: s_ashr_i32 s12, s12, 16 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_cmp_gt_i32 s11, s14 -; GFX9-NEXT: s_cselect_b32 s11, s11, s14 -; GFX9-NEXT: s_cmp_gt_i32 s12, s5 -; GFX9-NEXT: s_cselect_b32 s5, s12, s5 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s11, s5 -; GFX9-NEXT: s_sext_i32_i16 s12, s4 -; GFX9-NEXT: s_ashr_i32 s5, s5, 16 -; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s11, s12 -; GFX9-NEXT: s_cselect_b32 s11, s11, s12 -; GFX9-NEXT: s_cmp_lt_i32 s5, s4 -; GFX9-NEXT: s_cselect_b32 s4, s5, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s11, s4 -; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_sub_i32 s1, s1, s4 -; GFX9-NEXT: s_sub_i32 s4, s5, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX9-NEXT: s_sext_i32_i16 s4, s2 -; GFX9-NEXT: s_ashr_i32 s5, s2, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s11, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s12, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 -; GFX9-NEXT: s_lshr_b32 s12, s11, 16 +; GFX9-NEXT: s_ashr_i32 s10, s1, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s11, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s10, -1 +; GFX9-NEXT: s_cselect_b32 s13, s10, -1 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 +; GFX9-NEXT: s_lshr_b32 s13, s11, 16 ; GFX9-NEXT: s_sub_i32 s11, s11, s8 -; GFX9-NEXT: s_sub_i32 s12, s12, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX9-NEXT: s_sub_i32 s13, s13, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s10, -1 +; GFX9-NEXT: s_cselect_b32 s10, s10, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s11, s11, s13 ; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX9-NEXT: s_sext_i32_i16 s5, s11 -; GFX9-NEXT: s_sext_i32_i16 s12, s6 +; GFX9-NEXT: s_sub_i32 s10, s10, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s10 +; GFX9-NEXT: s_sext_i32_i16 s10, s11 +; GFX9-NEXT: s_sext_i32_i16 s13, s5 ; GFX9-NEXT: s_ashr_i32 s11, s11, 16 -; GFX9-NEXT: s_ashr_i32 s6, s6, 16 -; GFX9-NEXT: s_cmp_gt_i32 s5, s12 -; GFX9-NEXT: s_cselect_b32 s5, s5, s12 -; GFX9-NEXT: s_cmp_gt_i32 s11, s6 -; GFX9-NEXT: s_cselect_b32 s6, s11, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX9-NEXT: s_sext_i32_i16 s6, s5 +; GFX9-NEXT: s_ashr_i32 s5, s5, 16 +; GFX9-NEXT: s_cmp_gt_i32 s10, s13 +; GFX9-NEXT: s_cselect_b32 s10, s10, s13 +; GFX9-NEXT: s_cmp_gt_i32 s11, s5 +; GFX9-NEXT: s_cselect_b32 s5, s11, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s5 +; GFX9-NEXT: s_sext_i32_i16 s10, s5 ; GFX9-NEXT: s_sext_i32_i16 s11, s4 ; GFX9-NEXT: s_ashr_i32 s5, s5, 16 ; GFX9-NEXT: s_ashr_i32 s4, s4, 16 -; GFX9-NEXT: s_cmp_lt_i32 s6, s11 -; GFX9-NEXT: s_cselect_b32 s6, s6, s11 +; GFX9-NEXT: s_cmp_lt_i32 s10, s11 +; GFX9-NEXT: s_cselect_b32 s10, s10, s11 +; GFX9-NEXT: s_cmp_lt_i32 s5, s4 +; GFX9-NEXT: s_cselect_b32 s4, s5, s4 +; 
GFX9-NEXT: s_pack_ll_b32_b16 s4, s10, s4 +; GFX9-NEXT: s_lshr_b32 s5, s1, 16 +; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_sub_i32 s1, s1, s4 +; GFX9-NEXT: s_sub_i32 s4, s5, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX9-NEXT: s_sext_i32_i16 s4, s2 +; GFX9-NEXT: s_ashr_i32 s5, s2, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s10, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s11, s5, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_lshr_b32 s11, s10, 16 +; GFX9-NEXT: s_sub_i32 s10, s10, s8 +; GFX9-NEXT: s_sub_i32 s11, s11, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s5, s5, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s10, s10, s11 +; GFX9-NEXT: s_sub_i32 s4, s4, s9 +; GFX9-NEXT: s_sub_i32 s5, s5, s14 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_sext_i32_i16 s5, s10 +; GFX9-NEXT: s_sext_i32_i16 s11, s6 +; GFX9-NEXT: s_ashr_i32 s10, s10, 16 +; GFX9-NEXT: s_ashr_i32 s6, s6, 16 +; GFX9-NEXT: s_cmp_gt_i32 s5, s11 +; GFX9-NEXT: s_cselect_b32 s5, s5, s11 +; GFX9-NEXT: s_cmp_gt_i32 s10, s6 +; GFX9-NEXT: s_cselect_b32 s6, s10, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s5, s5, s6 +; GFX9-NEXT: s_sext_i32_i16 s6, s5 +; GFX9-NEXT: s_sext_i32_i16 s10, s4 +; GFX9-NEXT: s_ashr_i32 s5, s5, 16 +; GFX9-NEXT: s_ashr_i32 s4, s4, 16 +; GFX9-NEXT: s_cmp_lt_i32 s6, s10 +; GFX9-NEXT: s_cselect_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lt_i32 s5, s4 ; GFX9-NEXT: s_cselect_b32 s4, s5, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s6, s4 @@ -6939,23 +6847,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 ; GFX9-NEXT: s_ashr_i32 s5, s3, 16 -; GFX9-NEXT: s_cmp_gt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s6, s4, s13 -; GFX9-NEXT: s_cmp_gt_i32 s5, s10 -; 
GFX9-NEXT: s_cselect_b32 s11, s5, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s11 -; GFX9-NEXT: s_lshr_b32 s11, s6, 16 +; GFX9-NEXT: s_cmp_gt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s6, s4, s12 +; GFX9-NEXT: s_cmp_gt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s10, s5, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s10 +; GFX9-NEXT: s_lshr_b32 s10, s6, 16 ; GFX9-NEXT: s_sub_i32 s6, s6, s8 -; GFX9-NEXT: s_sub_i32 s8, s11, s16 -; GFX9-NEXT: s_cmp_lt_i32 s4, s13 -; GFX9-NEXT: s_cselect_b32 s4, s4, s13 -; GFX9-NEXT: s_cmp_lt_i32 s5, s10 -; GFX9-NEXT: s_cselect_b32 s5, s5, s10 +; GFX9-NEXT: s_sub_i32 s8, s10, s15 +; GFX9-NEXT: s_cmp_lt_i32 s4, s12 +; GFX9-NEXT: s_cselect_b32 s4, s4, s12 +; GFX9-NEXT: s_cmp_lt_i32 s5, -1 +; GFX9-NEXT: s_cselect_b32 s5, s5, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s6, s6, s8 ; GFX9-NEXT: s_sub_i32 s4, s4, s9 -; GFX9-NEXT: s_sub_i32 s5, s5, s15 +; GFX9-NEXT: s_sub_i32 s5, s5, s14 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_sext_i32_i16 s5, s6 ; GFX9-NEXT: s_sext_i32_i16 s8, s7 @@ -6984,166 +6892,162 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX10-LABEL: s_ssubsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_sext_i32_i16 s9, s0 -; GFX10-NEXT: s_sext_i32_i16 s11, s8 +; GFX10-NEXT: s_sext_i32_i16 s8, s0 +; GFX10-NEXT: s_sext_i32_i16 s9, -1 ; GFX10-NEXT: s_ashr_i32 s10, s0, 16 -; GFX10-NEXT: s_ashr_i32 s8, s8, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_movk_i32 s14, 0x7fff -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_movk_i32 s16, 0x8000 -; GFX10-NEXT: s_cselect_b32 s13, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s16, s16, s16 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s13 -; GFX10-NEXT: s_pack_ll_b32_b16 s13, s14, s14 -; GFX10-NEXT: s_lshr_b32 s14, s12, 16 -; GFX10-NEXT: s_lshr_b32 s15, s13, 
16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s14, s14, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_sext_i32_i16 s18, s4 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_mov_b32 s15, 0x80008000 +; GFX10-NEXT: s_cselect_b32 s12, s10, -1 +; GFX10-NEXT: s_sext_i32_i16 s17, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s12 +; GFX10-NEXT: s_mov_b32 s12, 0x7fff7fff +; GFX10-NEXT: s_lshr_b32 s13, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s13, s13, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s14 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_lshr_b32 s14, s16, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s13 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_mov_b32 s13, 0x8000 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s4 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s4, s10, s4 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s17, s4 -; 
GFX10-NEXT: s_sext_i32_i16 s18, s5 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s16, s4 +; GFX10-NEXT: s_sext_i32_i16 s17, s5 ; GFX10-NEXT: s_sext_i32_i16 s10, s4 ; GFX10-NEXT: s_ashr_i32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s4, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s4, s8 +; GFX10-NEXT: s_cselect_b32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s10, s4 ; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_sub_i32 s0, s0, s4 -; GFX10-NEXT: s_sub_i32 s4, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s1 +; GFX10-NEXT: s_sub_i32 s4, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s1 ; GFX10-NEXT: s_ashr_i32 s10, s1, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s16, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: 
s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s5 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s5, s10, s5 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s5, s17, s5 -; GFX10-NEXT: s_sext_i32_i16 s18, s6 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s5, s16, s5 +; GFX10-NEXT: s_sext_i32_i16 s17, s6 ; GFX10-NEXT: s_sext_i32_i16 s10, s5 ; GFX10-NEXT: s_ashr_i32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s5, s9 -; GFX10-NEXT: s_cselect_b32 s5, s5, s9 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s5, s8 +; GFX10-NEXT: s_cselect_b32 s5, s5, s8 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s5, s10, s5 ; GFX10-NEXT: s_lshr_b32 s10, s5, 16 ; GFX10-NEXT: s_sub_i32 s1, s1, s5 -; GFX10-NEXT: s_sub_i32 s5, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s2 +; GFX10-NEXT: s_sub_i32 s5, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s2 ; GFX10-NEXT: s_ashr_i32 s10, s2, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: 
s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s17, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s10, s10, s8 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s16, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s10, s10, -1 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s10, s12, s17 -; GFX10-NEXT: s_lshr_b32 s12, s9, 16 -; GFX10-NEXT: s_sext_i32_i16 s17, s10 -; GFX10-NEXT: s_sub_i32 s9, s9, s16 -; GFX10-NEXT: s_sub_i32 s12, s12, s14 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s10 +; GFX10-NEXT: s_pack_ll_b32_b16 s10, s11, s16 +; GFX10-NEXT: s_lshr_b32 s11, s8, 16 +; GFX10-NEXT: s_sext_i32_i16 s16, s10 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s11, s11, s13 ; GFX10-NEXT: s_ashr_i32 s10, s10, 16 -; GFX10-NEXT: s_cmp_gt_i32 s17, s18 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s9, s12 -; GFX10-NEXT: s_cselect_b32 s17, s17, s18 +; GFX10-NEXT: s_cmp_gt_i32 s16, s17 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s11 +; GFX10-NEXT: s_cselect_b32 s16, s16, s17 ; GFX10-NEXT: s_cmp_gt_i32 s10, s6 -; GFX10-NEXT: s_sext_i32_i16 s12, s9 +; GFX10-NEXT: s_sext_i32_i16 s11, s8 ; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_ashr_i32 s9, s9, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s6, s17, s6 +; GFX10-NEXT: s_ashr_i32 s8, s8, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s6, s16, s6 ; 
GFX10-NEXT: s_sext_i32_i16 s10, s6 ; GFX10-NEXT: s_ashr_i32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_i32 s10, s12 -; GFX10-NEXT: s_cselect_b32 s10, s10, s12 -; GFX10-NEXT: s_cmp_lt_i32 s6, s9 -; GFX10-NEXT: s_cselect_b32 s6, s6, s9 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; GFX10-NEXT: s_cmp_lt_i32 s10, s11 +; GFX10-NEXT: s_cselect_b32 s10, s10, s11 +; GFX10-NEXT: s_cmp_lt_i32 s6, s8 +; GFX10-NEXT: s_cselect_b32 s6, s6, s8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s6, s10, s6 ; GFX10-NEXT: s_lshr_b32 s10, s6, 16 ; GFX10-NEXT: s_sub_i32 s2, s2, s6 -; GFX10-NEXT: s_sub_i32 s6, s9, s10 -; GFX10-NEXT: s_sext_i32_i16 s9, s3 +; GFX10-NEXT: s_sub_i32 s6, s8, s10 +; GFX10-NEXT: s_sext_i32_i16 s8, s3 ; GFX10-NEXT: s_ashr_i32 s10, s3, 16 -; GFX10-NEXT: s_cmp_gt_i32 s9, s11 +; GFX10-NEXT: s_cmp_gt_i32 s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s12, s9, s11 -; GFX10-NEXT: s_cmp_gt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s17, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s12, s12, s17 -; GFX10-NEXT: s_lshr_b32 s17, s12, 16 -; GFX10-NEXT: s_sub_i32 s12, s12, s13 -; GFX10-NEXT: s_sub_i32 s13, s17, s15 -; GFX10-NEXT: s_cmp_lt_i32 s9, s11 -; GFX10-NEXT: s_cselect_b32 s9, s9, s11 -; GFX10-NEXT: s_cmp_lt_i32 s10, s8 -; GFX10-NEXT: s_cselect_b32 s8, s10, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s8, s9, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s9, s12, s13 +; GFX10-NEXT: s_cselect_b32 s11, s8, s9 +; GFX10-NEXT: s_cmp_gt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s16, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s11, s11, s16 +; GFX10-NEXT: s_lshr_b32 s16, s11, 16 +; GFX10-NEXT: s_sub_i32 s11, s11, s12 +; GFX10-NEXT: s_sub_i32 s12, s16, s14 +; GFX10-NEXT: s_cmp_lt_i32 s8, s9 +; GFX10-NEXT: s_cselect_b32 s8, s8, s9 +; GFX10-NEXT: s_cmp_lt_i32 s10, -1 +; GFX10-NEXT: s_cselect_b32 s9, s10, -1 +; GFX10-NEXT: s_pack_ll_b32_b16 s8, s8, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s9, s11, s12 ; GFX10-NEXT: s_lshr_b32 s10, s8, 16 ; GFX10-NEXT: s_sext_i32_i16 s11, s9 ; 
GFX10-NEXT: s_sext_i32_i16 s12, s7 -; GFX10-NEXT: s_sub_i32 s8, s8, s16 -; GFX10-NEXT: s_sub_i32 s10, s10, s14 +; GFX10-NEXT: s_sub_i32 s8, s8, s15 +; GFX10-NEXT: s_sub_i32 s10, s10, s13 ; GFX10-NEXT: s_ashr_i32 s9, s9, 16 ; GFX10-NEXT: s_ashr_i32 s7, s7, 16 ; GFX10-NEXT: s_cmp_gt_i32 s11, s12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 5570309a5be7..3a742fbcbd91 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2371,8 +2371,7 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v1, v2, v1 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2381,9 +2380,8 @@ define <2 x i16> @v_uaddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v2, s4, v0 ; GFX10-NEXT: v_pk_min_u16 v1, v2, v1 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -2439,8 +2437,7 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX9-LABEL: s_uaddsat_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s2, -1, -1 -; GFX9-NEXT: s_xor_b32 s2, s0, s2 +; GFX9-NEXT: s_xor_b32 s2, s0, -1 ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 @@ -2460,15 +2457,14 @@ define amdgpu_ps i32 @s_uaddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; ; GFX10-LABEL: s_uaddsat_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
s_pack_ll_b32_b16 s2, -1, -1 +; GFX10-NEXT: s_xor_b32 s2, s0, -1 ; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_xor_b32 s2, s0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: s_and_b32 s2, s2, s3 ; GFX10-NEXT: s_and_b32 s3, s1, s3 ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 ; GFX10-NEXT: s_cmp_lt_u32 s2, s3 +; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10-NEXT: s_cmp_lt_u32 s4, s1 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 @@ -2522,17 +2518,15 @@ define amdgpu_ps float @uaddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX9-LABEL: uaddsat_v2i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: s_xor_b32 s1, s0, s1 +; GFX9-NEXT: s_xor_b32 s1, s0, -1 ; GFX9-NEXT: v_pk_min_u16 v0, s1, v0 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_sv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 +; GFX10-NEXT: s_xor_b32 s1, s0, -1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_xor_b32 s1, s0, s1 ; GFX10-NEXT: v_pk_min_u16 v0, s1, v0 ; GFX10-NEXT: v_pk_add_u16 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -2578,17 +2572,15 @@ define amdgpu_ps float @uaddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; ; GFX9-LABEL: uaddsat_v2i16_vs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s1, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v1, v1, s0 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: uaddsat_v2i16_vs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s1, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v1, s1, v0 ; GFX10-NEXT: v_pk_min_u16 v1, v1, s0 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -2671,11 +2663,10 @@ define <2 x float> 
@v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v2, v4, v2 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v2 -; GFX9-NEXT: v_xor_b32_e32 v2, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v2, v2, v3 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2684,10 +2675,9 @@ define <2 x float> @v_uaddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v5, -1, v1 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v4, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v5, s4, v1 ; GFX10-NEXT: v_pk_min_u16 v2, v4, v2 ; GFX10-NEXT: v_pk_min_u16 v3, v5, v3 ; GFX10-NEXT: v_pk_add_u16 v0, v0, v2 @@ -2782,28 +2772,27 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX9-LABEL: s_uaddsat_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: s_xor_b32 s5, s0, s4 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_lshr_b32 s8, s2, 16 -; GFX9-NEXT: s_and_b32 s5, s5, s7 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_cmp_lt_u32 s5, s2 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_cmp_lt_u32 s6, s8 -; GFX9-NEXT: s_cselect_b32 s5, s6, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_xor_b32 s4, s0, -1 +; GFX9-NEXT: s_mov_b32 s6, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s2, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: 
s_cmp_lt_u32 s4, s2 +; GFX9-NEXT: s_cselect_b32 s2, s4, s2 +; GFX9-NEXT: s_cmp_lt_u32 s5, s7 +; GFX9-NEXT: s_cselect_b32 s4, s5, s7 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s2, s1, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5 +; GFX9-NEXT: s_add_i32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s2, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s5, s3, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s3, s3, s7 +; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s3, s3, s6 ; GFX9-NEXT: s_cmp_lt_u32 s2, s3 ; GFX9-NEXT: s_cselect_b32 s2, s2, s3 ; GFX9-NEXT: s_cmp_lt_u32 s4, s5 @@ -2818,38 +2807,37 @@ define amdgpu_ps <2 x i32> @s_uaddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; ; GFX10-LABEL: s_uaddsat_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX10-NEXT: s_mov_b32 s6, 0xffff -; GFX10-NEXT: s_xor_b32 s5, s0, s4 -; GFX10-NEXT: s_and_b32 s8, s2, s6 -; GFX10-NEXT: s_lshr_b32 s7, s5, 16 -; GFX10-NEXT: s_and_b32 s5, s5, s6 +; GFX10-NEXT: s_xor_b32 s4, s0, -1 +; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_lshr_b32 s6, s4, 16 +; GFX10-NEXT: s_and_b32 s7, s2, s5 +; GFX10-NEXT: s_and_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_cmp_lt_u32 s5, s8 +; GFX10-NEXT: s_cmp_lt_u32 s4, s7 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s5, s5, s8 -; GFX10-NEXT: s_cmp_lt_u32 s7, s2 -; GFX10-NEXT: s_cselect_b32 s2, s7, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s5, s2 -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_cselect_b32 s4, s4, s7 +; GFX10-NEXT: s_cmp_lt_u32 s6, s2 +; GFX10-NEXT: s_cselect_b32 s2, s6, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s4, s2 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; 
GFX10-NEXT: s_add_i32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s2, s1, s4 -; GFX10-NEXT: s_add_i32 s5, s5, s7 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s6, s3, s6 +; GFX10-NEXT: s_xor_b32 s2, s1, -1 +; GFX10-NEXT: s_add_i32 s4, s4, s6 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, s5 +; GFX10-NEXT: s_and_b32 s5, s3, s5 ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s2, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s5 -; GFX10-NEXT: s_cselect_b32 s2, s2, s6 -; GFX10-NEXT: s_cmp_lt_u32 s4, s3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 +; GFX10-NEXT: s_cmp_lt_u32 s2, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX10-NEXT: s_cselect_b32 s2, s2, s5 +; GFX10-NEXT: s_cmp_lt_u32 s6, s3 +; GFX10-NEXT: s_cselect_b32 s3, s6, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s3 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s2 -; GFX10-NEXT: s_add_i32 s3, s3, s4 +; GFX10-NEXT: s_add_i32 s3, s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog %result = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) @@ -2955,14 +2943,13 @@ define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v6i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v6, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v3, v6, v3 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v3, v3, v4 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v3 -; GFX9-NEXT: v_xor_b32_e32 v3, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v3, -1, v2 ; GFX9-NEXT: v_pk_min_u16 v3, v3, v5 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2971,11 +2958,10 @@ 
define <3 x float> @v_uaddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v1 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v6, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v7, s4, v1 -; GFX10-NEXT: v_xor_b32_e32 v8, s4, v2 ; GFX10-NEXT: v_pk_min_u16 v3, v6, v3 ; GFX10-NEXT: v_pk_min_u16 v4, v7, v4 ; GFX10-NEXT: v_pk_min_u16 v5, v8, v5 @@ -3108,43 +3094,42 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX9-LABEL: s_uaddsat_v6i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX9-NEXT: s_xor_b32 s7, s0, s6 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_lshr_b32 s8, s7, 16 -; GFX9-NEXT: s_lshr_b32 s10, s3, 16 -; GFX9-NEXT: s_and_b32 s7, s7, s9 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_cmp_lt_u32 s7, s3 -; GFX9-NEXT: s_cselect_b32 s3, s7, s3 -; GFX9-NEXT: s_cmp_lt_u32 s8, s10 -; GFX9-NEXT: s_cselect_b32 s7, s8, s10 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s7 -; GFX9-NEXT: s_lshr_b32 s7, s0, 16 -; GFX9-NEXT: s_lshr_b32 s8, s3, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s3 -; GFX9-NEXT: s_add_i32 s7, s7, s8 -; GFX9-NEXT: s_xor_b32 s3, s1, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_xor_b32 s6, s0, -1 +; GFX9-NEXT: s_mov_b32 s8, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s9, s3, 16 +; GFX9-NEXT: s_and_b32 s6, s6, s8 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_cmp_lt_u32 s6, s3 +; GFX9-NEXT: s_cselect_b32 s3, s6, s3 +; GFX9-NEXT: s_cmp_lt_u32 s7, s9 +; GFX9-NEXT: s_cselect_b32 s6, s7, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 +; GFX9-NEXT: s_lshr_b32 s6, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s3, 16 -; GFX9-NEXT: s_lshr_b32 s8, s4, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: 
s_and_b32 s4, s4, s9 +; GFX9-NEXT: s_add_i32 s0, s0, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s7 +; GFX9-NEXT: s_xor_b32 s3, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s4, s4, s8 ; GFX9-NEXT: s_cmp_lt_u32 s3, s4 ; GFX9-NEXT: s_cselect_b32 s3, s3, s4 -; GFX9-NEXT: s_cmp_lt_u32 s7, s8 -; GFX9-NEXT: s_cselect_b32 s4, s7, s8 +; GFX9-NEXT: s_cmp_lt_u32 s6, s7 +; GFX9-NEXT: s_cselect_b32 s4, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_lshr_b32 s7, s3, 16 +; GFX9-NEXT: s_lshr_b32 s6, s3, 16 ; GFX9-NEXT: s_add_i32 s1, s1, s3 -; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: s_xor_b32 s3, s2, s6 +; GFX9-NEXT: s_add_i32 s4, s4, s6 +; GFX9-NEXT: s_xor_b32 s3, s2, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s6, s5, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_and_b32 s5, s5, s9 +; GFX9-NEXT: s_and_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s5, s5, s8 ; GFX9-NEXT: s_cmp_lt_u32 s3, s5 ; GFX9-NEXT: s_cselect_b32 s3, s3, s5 ; GFX9-NEXT: s_cmp_lt_u32 s4, s6 @@ -3159,48 +3144,47 @@ define amdgpu_ps <3 x i32> @s_uaddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; ; GFX10-LABEL: s_uaddsat_v6i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s6, -1, -1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_xor_b32 s7, s0, s6 -; GFX10-NEXT: s_and_b32 s10, s3, s8 -; GFX10-NEXT: s_lshr_b32 s9, s7, 16 -; GFX10-NEXT: s_and_b32 s7, s7, s8 +; GFX10-NEXT: s_xor_b32 s6, s0, -1 +; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: s_and_b32 s9, s3, s7 +; GFX10-NEXT: s_and_b32 s6, s6, s7 ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 -; GFX10-NEXT: s_cmp_lt_u32 s7, s10 +; GFX10-NEXT: s_cmp_lt_u32 s6, s9 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_cselect_b32 s7, s7, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s3 -; 
GFX10-NEXT: s_cselect_b32 s3, s9, s3 -; GFX10-NEXT: s_and_b32 s10, s4, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s7, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_cselect_b32 s6, s6, s9 +; GFX10-NEXT: s_cmp_lt_u32 s8, s3 +; GFX10-NEXT: s_cselect_b32 s3, s8, s3 +; GFX10-NEXT: s_and_b32 s9, s4, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s3 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s3 -; GFX10-NEXT: s_xor_b32 s3, s1, s6 -; GFX10-NEXT: s_add_i32 s7, s7, s9 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_xor_b32 s3, s1, -1 +; GFX10-NEXT: s_add_i32 s6, s6, s8 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_cselect_b32 s3, s3, s10 -; GFX10-NEXT: s_cmp_lt_u32 s9, s4 -; GFX10-NEXT: s_cselect_b32 s4, s9, s4 +; GFX10-NEXT: s_cmp_lt_u32 s3, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_cselect_b32 s3, s3, s9 +; GFX10-NEXT: s_cmp_lt_u32 s8, s4 +; GFX10-NEXT: s_cselect_b32 s4, s8, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshr_b32 s9, s3, 16 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s3 -; GFX10-NEXT: s_xor_b32 s3, s2, s6 -; GFX10-NEXT: s_add_i32 s4, s4, s9 -; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_and_b32 s8, s5, s8 +; GFX10-NEXT: s_xor_b32 s3, s2, -1 +; GFX10-NEXT: s_add_i32 s4, s4, s8 +; GFX10-NEXT: s_lshr_b32 s8, s3, 16 +; GFX10-NEXT: s_and_b32 s3, s3, s7 +; GFX10-NEXT: s_and_b32 s7, s5, s7 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s3, s8 +; GFX10-NEXT: s_cmp_lt_u32 s3, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_cselect_b32 s3, s3, s8 -; GFX10-NEXT: s_cmp_lt_u32 s6, s5 -; 
GFX10-NEXT: s_cselect_b32 s5, s6, s5 +; GFX10-NEXT: s_cselect_b32 s3, s3, s7 +; GFX10-NEXT: s_cmp_lt_u32 s8, s5 +; GFX10-NEXT: s_cselect_b32 s5, s8, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 @@ -3324,17 +3308,16 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX9-LABEL: v_uaddsat_v8i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_pack_ll_b32_b16 s4, -1, -1 -; GFX9-NEXT: v_xor_b32_e32 v8, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v8, -1, v0 ; GFX9-NEXT: v_pk_min_u16 v4, v8, v4 ; GFX9-NEXT: v_pk_add_u16 v0, v0, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v1 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX9-NEXT: v_pk_min_u16 v4, v4, v5 ; GFX9-NEXT: v_pk_add_u16 v1, v1, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v2 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v2 ; GFX9-NEXT: v_pk_min_u16 v4, v4, v6 ; GFX9-NEXT: v_pk_add_u16 v2, v2, v4 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, -1, v3 ; GFX9-NEXT: v_pk_min_u16 v4, v4, v7 ; GFX9-NEXT: v_pk_add_u16 v3, v3, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3343,12 +3326,11 @@ define <4 x float> @v_uaddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX10-NEXT: v_xor_b32_e32 v15, -1, v0 +; GFX10-NEXT: v_xor_b32_e32 v19, -1, v1 +; GFX10-NEXT: v_xor_b32_e32 v23, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_xor_b32_e32 v15, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v19, s4, v1 -; GFX10-NEXT: v_xor_b32_e32 v23, s4, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, s4, v3 ; GFX10-NEXT: v_pk_min_u16 v11, v15, v4 ; GFX10-NEXT: v_pk_min_u16 v15, v19, v5 ; GFX10-NEXT: v_pk_min_u16 v19, v23, v6 @@ -3519,58 +3501,57 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; 
GFX9-LABEL: s_uaddsat_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX9-NEXT: s_xor_b32 s9, s0, s8 -; GFX9-NEXT: s_mov_b32 s11, 0xffff -; GFX9-NEXT: s_lshr_b32 s10, s9, 16 -; GFX9-NEXT: s_lshr_b32 s12, s4, 16 -; GFX9-NEXT: s_and_b32 s9, s9, s11 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_cmp_lt_u32 s9, s4 -; GFX9-NEXT: s_cselect_b32 s4, s9, s4 -; GFX9-NEXT: s_cmp_lt_u32 s10, s12 -; GFX9-NEXT: s_cselect_b32 s9, s10, s12 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 -; GFX9-NEXT: s_add_i32 s0, s0, s4 -; GFX9-NEXT: s_add_i32 s9, s9, s10 -; GFX9-NEXT: s_xor_b32 s4, s1, s8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 +; GFX9-NEXT: s_xor_b32 s8, s0, -1 +; GFX9-NEXT: s_mov_b32 s10, 0xffff +; GFX9-NEXT: s_lshr_b32 s9, s8, 16 +; GFX9-NEXT: s_lshr_b32 s11, s4, 16 +; GFX9-NEXT: s_and_b32 s8, s8, s10 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_cmp_lt_u32 s8, s4 +; GFX9-NEXT: s_cselect_b32 s4, s8, s4 +; GFX9-NEXT: s_cmp_lt_u32 s9, s11 +; GFX9-NEXT: s_cselect_b32 s8, s9, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s8 +; GFX9-NEXT: s_lshr_b32 s8, s0, 16 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 -; GFX9-NEXT: s_lshr_b32 s10, s5, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s5, s5, s11 +; GFX9-NEXT: s_add_i32 s0, s0, s4 +; GFX9-NEXT: s_add_i32 s8, s8, s9 +; GFX9-NEXT: s_xor_b32 s4, s1, -1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 +; GFX9-NEXT: s_lshr_b32 s9, s5, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s5, s5, s10 ; GFX9-NEXT: s_cmp_lt_u32 s4, s5 ; GFX9-NEXT: s_cselect_b32 s4, s4, s5 -; GFX9-NEXT: s_cmp_lt_u32 s9, s10 -; GFX9-NEXT: s_cselect_b32 s5, s9, s10 +; GFX9-NEXT: s_cmp_lt_u32 s8, s9 +; GFX9-NEXT: s_cselect_b32 s5, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshr_b32 s9, s4, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 ; GFX9-NEXT: s_add_i32 s1, s1, 
s4 -; GFX9-NEXT: s_add_i32 s5, s5, s9 -; GFX9-NEXT: s_xor_b32 s4, s2, s8 +; GFX9-NEXT: s_add_i32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s4, s2, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshr_b32 s9, s6, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s6, s6, s11 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s6, s6, s10 ; GFX9-NEXT: s_cmp_lt_u32 s4, s6 ; GFX9-NEXT: s_cselect_b32 s4, s4, s6 -; GFX9-NEXT: s_cmp_lt_u32 s5, s9 -; GFX9-NEXT: s_cselect_b32 s5, s5, s9 +; GFX9-NEXT: s_cmp_lt_u32 s5, s8 +; GFX9-NEXT: s_cselect_b32 s5, s5, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s6, s4, 16 ; GFX9-NEXT: s_add_i32 s2, s2, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s6 -; GFX9-NEXT: s_xor_b32 s4, s3, s8 +; GFX9-NEXT: s_xor_b32 s4, s3, -1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: s_and_b32 s4, s4, s11 -; GFX9-NEXT: s_and_b32 s7, s7, s11 +; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s7, s7, s10 ; GFX9-NEXT: s_cmp_lt_u32 s4, s7 ; GFX9-NEXT: s_cselect_b32 s4, s4, s7 ; GFX9-NEXT: s_cmp_lt_u32 s5, s6 @@ -3585,63 +3566,62 @@ define amdgpu_ps <4 x i32> @s_uaddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; ; GFX10-LABEL: s_uaddsat_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_pack_ll_b32_b16 s8, -1, -1 -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_xor_b32 s9, s0, s8 -; GFX10-NEXT: s_and_b32 s12, s4, s10 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 -; GFX10-NEXT: s_and_b32 s9, s9, s10 +; GFX10-NEXT: s_xor_b32 s8, s0, -1 +; GFX10-NEXT: s_mov_b32 s9, 0xffff +; GFX10-NEXT: s_lshr_b32 s10, s8, 16 +; GFX10-NEXT: s_and_b32 s11, s4, s9 +; GFX10-NEXT: s_and_b32 s8, s8, s9 ; GFX10-NEXT: s_lshr_b32 s4, s4, 16 -; GFX10-NEXT: s_cmp_lt_u32 s9, s12 +; GFX10-NEXT: s_cmp_lt_u32 s8, s11 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
s_cselect_b32 s9, s9, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s4 -; GFX10-NEXT: s_cselect_b32 s4, s11, s4 -; GFX10-NEXT: s_and_b32 s12, s5, s10 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s9, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_cselect_b32 s8, s8, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s4 +; GFX10-NEXT: s_cselect_b32 s4, s10, s4 +; GFX10-NEXT: s_and_b32 s11, s5, s9 +; GFX10-NEXT: s_pack_ll_b32_b16 s4, s8, s4 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s4 -; GFX10-NEXT: s_xor_b32 s4, s1, s8 -; GFX10-NEXT: s_add_i32 s9, s9, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 +; GFX10-NEXT: s_xor_b32 s4, s1, -1 +; GFX10-NEXT: s_add_i32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 ; GFX10-NEXT: s_lshr_b32 s5, s5, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s5 -; GFX10-NEXT: s_cselect_b32 s5, s11, s5 -; GFX10-NEXT: s_and_b32 s12, s6, s10 +; GFX10-NEXT: s_cmp_lt_u32 s4, s11 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 +; GFX10-NEXT: s_cselect_b32 s4, s4, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s5 +; GFX10-NEXT: s_cselect_b32 s5, s10, s5 +; GFX10-NEXT: s_and_b32 s11, s6, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s1, s1, s4 -; GFX10-NEXT: s_xor_b32 s4, s2, s8 -; GFX10-NEXT: s_add_i32 s5, s5, s11 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 +; GFX10-NEXT: s_xor_b32 s4, s2, -1 +; GFX10-NEXT: s_add_i32 s5, s5, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 ; GFX10-NEXT: s_lshr_b32 s6, s6, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s12 +; GFX10-NEXT: s_cmp_lt_u32 s4, s11 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, 
s1, s5 -; GFX10-NEXT: s_cselect_b32 s4, s4, s12 -; GFX10-NEXT: s_cmp_lt_u32 s11, s6 -; GFX10-NEXT: s_cselect_b32 s6, s11, s6 +; GFX10-NEXT: s_cselect_b32 s4, s4, s11 +; GFX10-NEXT: s_cmp_lt_u32 s10, s6 +; GFX10-NEXT: s_cselect_b32 s6, s10, s6 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX10-NEXT: s_lshr_b32 s6, s2, 16 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 ; GFX10-NEXT: s_add_i32 s2, s2, s4 -; GFX10-NEXT: s_xor_b32 s4, s3, s8 -; GFX10-NEXT: s_add_i32 s6, s6, s11 -; GFX10-NEXT: s_lshr_b32 s8, s4, 16 -; GFX10-NEXT: s_and_b32 s4, s4, s10 -; GFX10-NEXT: s_and_b32 s10, s7, s10 +; GFX10-NEXT: s_xor_b32 s4, s3, -1 +; GFX10-NEXT: s_add_i32 s6, s6, s10 +; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_and_b32 s4, s4, s9 +; GFX10-NEXT: s_and_b32 s9, s7, s9 ; GFX10-NEXT: s_lshr_b32 s7, s7, 16 -; GFX10-NEXT: s_cmp_lt_u32 s4, s10 +; GFX10-NEXT: s_cmp_lt_u32 s4, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 -; GFX10-NEXT: s_cmp_lt_u32 s8, s7 -; GFX10-NEXT: s_cselect_b32 s7, s8, s7 +; GFX10-NEXT: s_cselect_b32 s4, s4, s9 +; GFX10-NEXT: s_cmp_lt_u32 s10, s7 +; GFX10-NEXT: s_cselect_b32 s7, s10, s7 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s4, s4, s7 ; GFX10-NEXT: s_lshr_b32 s7, s4, 16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 2512aaaeb082..474f6655bda2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -50,16 +50,14 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; ; GFX900-LABEL: scalar_xnor_v2i16_one_use: ; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX900-NEXT: s_xor_b32 s0, s0, s1 -; GFX900-NEXT: s_xor_b32 s0, s0, s2 +; GFX900-NEXT: s_xor_b32 s0, s0, -1 ; GFX900-NEXT: ; return to shader part epilog ; ; GFX906-LABEL: scalar_xnor_v2i16_one_use: ; GFX906: ; %bb.0: ; %entry -; 
GFX906-NEXT: s_pack_ll_b32_b16 s2, -1, -1 ; GFX906-NEXT: s_xor_b32 s0, s0, s1 -; GFX906-NEXT: s_xor_b32 s0, s0, s2 +; GFX906-NEXT: s_xor_b32 s0, s0, -1 ; GFX906-NEXT: ; return to shader part epilog entry: %xor = xor <2 x i16> %a, %b @@ -150,7 +148,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX900-LABEL: scalar_xnor_v4i16_one_use: ; GFX900: ; %bb.0: -; GFX900-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX900-NEXT: s_mov_b32 s4, -1 ; GFX900-NEXT: s_mov_b32 s5, s4 ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX900-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] @@ -158,7 +156,7 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; ; GFX906-LABEL: scalar_xnor_v4i16_one_use: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_pack_ll_b32_b16 s4, -1, -1 +; GFX906-NEXT: s_mov_b32 s4, -1 ; GFX906-NEXT: s_mov_b32 s5, s4 ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX906-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]