From 3eb2281bc067688dc701cf94e267395680892cf0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Mon, 16 May 2022 15:48:11 +0100 Subject: [PATCH] [AMDGPU] Aggressively fold immediates in SIFoldOperands Previously SIFoldOperands::foldInstOperand would only fold a non-inlinable immediate into a single user, so as not to increase code size by adding the same 32-bit literal operand to many instructions. This patch removes that restriction, so that a non-inlinable immediate will be folded into any number of users. The rationale is: - It reduces the number of registers used for holding constant values, which might increase occupancy. (On the other hand, many of these registers are SGPRs which no longer affect occupancy on GFX10+.) - It reduces ALU stalls between the instruction that loads a constant into a register, and the instruction that uses it. - The above benefits are expected to outweigh any increase in code size. Differential Revision: https://reviews.llvm.org/D114643 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 84 +- .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll | 88 +- llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll | 112 +- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll | 124 +- llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll | 48 +- .../AMDGPU/GlobalISel/combine-fma-add-mul.ll | 76 +- .../GlobalISel/combine-fma-sub-ext-neg-mul.ll | 20 +- .../AMDGPU/GlobalISel/combine-fma-sub-mul.ll | 20 +- .../GlobalISel/combine-fma-sub-neg-mul.ll | 15 +- .../AMDGPU/GlobalISel/extractelement.i8.ll | 490 +- .../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll | 42 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 76 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll | 14 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll | 5 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll | 801 ++-- llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll | 946 ++-- .../AMDGPU/GlobalISel/insertelement.i16.ll | 814 ++-- .../AMDGPU/GlobalISel/insertelement.i8.ll | 4139 ++++++++--------- 
.../GlobalISel/llvm.amdgcn.div.scale.ll | 3 +- .../llvm.amdgcn.image.atomic.dim.a16.ll | 50 +- .../llvm.amdgcn.image.gather4.a16.dim.ll | 54 +- .../llvm.amdgcn.image.load.1d.d16.ll | 5 +- .../llvm.amdgcn.image.load.2darraymsaa.a16.ll | 19 +- .../llvm.amdgcn.image.load.3d.a16.ll | 17 +- .../llvm.amdgcn.image.sample.g16.ll | 111 +- .../GlobalISel/llvm.amdgcn.intersect_ray.ll | 54 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 5 +- .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll | 13 +- .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll | 13 +- llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll | 173 +- llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 51 +- llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll | 112 +- .../CodeGen/AMDGPU/GlobalISel/roundeven.ll | 11 +- .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 1244 +++-- .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 48 +- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 10 +- .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 238 +- .../AMDGPU/GlobalISel/shl-ext-reduce.ll | 15 +- llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 96 +- .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 8 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 10 +- .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 1256 +++-- .../AMDGPU/GlobalISel/store-local.128.ll | 27 +- .../AMDGPU/GlobalISel/store-local.96.ll | 19 +- llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll | 7 +- llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll | 10 +- .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 140 +- .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 193 +- .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll | 4 +- .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 279 +- .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll | 140 +- llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll | 26 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 3 +- ...amdgpu-codegenprepare-fold-binop-select.ll | 4 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 2973 ++++++------ llvm/test/CodeGen/AMDGPU/and.ll | 10 +- .../atomic_optimizations_local_pointer.ll | 164 +- 
llvm/test/CodeGen/AMDGPU/bypass-div.ll | 108 +- .../CodeGen/AMDGPU/combine-reg-or-const.ll | 2 +- .../AMDGPU/constant-address-space-32bit.ll | 2 - llvm/test/CodeGen/AMDGPU/ctlz.ll | 11 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 11 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 10 +- .../CodeGen/AMDGPU/extract-subvector-16bit.ll | 19 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 5 +- llvm/test/CodeGen/AMDGPU/fabs.f64.ll | 12 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 12 +- llvm/test/CodeGen/AMDGPU/fexp.ll | 7 +- llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 162 +- llvm/test/CodeGen/AMDGPU/fmed3.ll | 5 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 5 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 14 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 14 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 13 +- ...ld-immediate-operand-shrink-with-carry.mir | 7 +- llvm/test/CodeGen/AMDGPU/frem.ll | 179 +- llvm/test/CodeGen/AMDGPU/fshr.ll | 64 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 38 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 5 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 81 +- llvm/test/CodeGen/AMDGPU/idot8s.ll | 246 +- llvm/test/CodeGen/AMDGPU/idot8u.ll | 158 +- llvm/test/CodeGen/AMDGPU/immv216.ll | 5 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 5 +- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 15 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 8 +- .../llvm.amdgcn.buffer.store.format.d16.ll | 5 +- .../llvm.amdgcn.image.sample.a16.dim.ll | 95 +- .../llvm.amdgcn.image.sample.g16.a16.dim.ll | 223 +- .../llvm.amdgcn.image.sample.g16.encode.ll | 77 +- .../AMDGPU/llvm.amdgcn.image.sample.g16.ll | 77 +- ...llvm.amdgcn.raw.buffer.store.format.d16.ll | 10 +- .../llvm.amdgcn.raw.tbuffer.store.d16.ll | 10 +- ...m.amdgcn.struct.buffer.store.format.d16.ll | 10 +- .../llvm.amdgcn.struct.tbuffer.store.d16.ll | 10 +- .../AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll | 10 +- llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll | 7 +- llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll | 7 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 114 +- 
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 2119 ++++----- llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 286 +- llvm/test/CodeGen/AMDGPU/madak.ll | 4 +- llvm/test/CodeGen/AMDGPU/max.ll | 4 +- llvm/test/CodeGen/AMDGPU/mul.ll | 2 +- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 99 +- llvm/test/CodeGen/AMDGPU/or.ll | 4 +- llvm/test/CodeGen/AMDGPU/packed-fp32.ll | 3 +- .../AMDGPU/promote-constOffset-to-imm.ll | 26 +- llvm/test/CodeGen/AMDGPU/s_addk_i32.ll | 6 +- llvm/test/CodeGen/AMDGPU/salu-to-valu.ll | 3 +- llvm/test/CodeGen/AMDGPU/scratch-buffer.ll | 3 +- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 349 +- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 6 +- llvm/test/CodeGen/AMDGPU/setcc-opt.ll | 13 +- llvm/test/CodeGen/AMDGPU/shift-i128.ll | 2 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 11 +- .../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 20 +- .../AMDGPU/splitkit-getsubrangeformask.ll | 9 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 537 ++- llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll | 9 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 2 +- llvm/test/CodeGen/AMDGPU/uaddsat.ll | 7 +- llvm/test/CodeGen/AMDGPU/udiv.ll | 133 +- llvm/test/CodeGen/AMDGPU/udiv64.ll | 541 ++- llvm/test/CodeGen/AMDGPU/udivrem24.ll | 10 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 451 +- llvm/test/CodeGen/AMDGPU/usubsat.ll | 5 +- llvm/test/CodeGen/AMDGPU/v_pack.ll | 15 +- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 13 +- llvm/test/CodeGen/AMDGPU/xor.ll | 4 +- llvm/test/CodeGen/AMDGPU/zero_extend.ll | 5 +- 135 files changed, 10543 insertions(+), 11599 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 367951458ea4..99aa8a60b04f 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) { 
return AMDGPU::INSTRUCTION_LIST_END; } -// Wrapper around isInlineConstant that understands special cases when -// instruction types are replaced during operand folding. -static bool isInlineConstantIfFolded(const SIInstrInfo *TII, - const MachineInstr &UseMI, - unsigned OpNo, - const MachineOperand &OpToFold) { - if (TII->isInlineConstant(UseMI, OpNo, OpToFold)) - return true; - - unsigned Opc = UseMI.getOpcode(); - unsigned NewOpc = macToMad(Opc); - if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) { - // Special case for mac. Since this is replaced with mad when folded into - // src2, we need to check the legality for the final instruction. - int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); - if (static_cast<int>(OpNo) == Src2Idx) { - const MCInstrDesc &MadDesc = TII->get(NewOpc); - return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); - } - } - - return false; -} - // TODO: Add heuristic that the frame index might not fit in the addressing mode // immediate offset to avoid materializing in loops. static bool frameIndexMayFold(const SIInstrInfo *TII, @@ -1267,59 +1243,13 @@ bool SIFoldOperands::foldInstOperand(MachineInstr &MI, } } - bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); - if (FoldingImm) { - unsigned NumLiteralUses = 0; - MachineOperand *NonInlineUse = nullptr; - int NonInlineUseOpNo = -1; - - for (auto &Use : - make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) { - MachineInstr *UseMI = Use.getParent(); - unsigned OpNo = UseMI->getOperandNo(&Use); - - // Try to fold any inline immediate uses, and then only fold other - // constants if they have one use. - // - // The legality of the inline immediate must be checked based on the use - // operand, not the defining instruction, because 32-bit instructions - // with 32-bit inline immediate sources may be used to materialize - // constants used in 16-bit operands. - // - // e.g. 
it is unsafe to fold: - // s_mov_b32 s0, 1.0 // materializes 0x3f800000 - // v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00 - - // Folding immediates with more than one use will increase program size. - // FIXME: This will also reduce register usage, which may be better - // in some cases. A better heuristic is needed. - if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) { - foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace); - } else { - if (++NumLiteralUses == 1) { - NonInlineUse = &Use; - NonInlineUseOpNo = OpNo; - } - } - } - - if (NumLiteralUses == 1) { - MachineInstr *UseMI = NonInlineUse->getParent(); - foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace); - } - } else { - // Folding register. - SmallVector UsesToProcess; - for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) - UsesToProcess.push_back(&Use); - for (auto U : UsesToProcess) { - MachineInstr *UseMI = U->getParent(); - - foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), - FoldList, CopiesToReplace); - } + SmallVector UsesToProcess; + for (auto &Use : MRI->use_nodbg_operands(Dst.getReg())) + UsesToProcess.push_back(&Use); + for (auto U : UsesToProcess) { + MachineInstr *UseMI = U->getParent(); + foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList, + CopiesToReplace); } if (CopiesToReplace.empty() && FoldList.empty()) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll index 1578076411dc..aa29bf5ee65f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll @@ -127,9 +127,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: 
s_movk_i32 s4, 0xffc0 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_add_u16_e32 v1, s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0 +; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0 ; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -209,14 +208,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff -; GFX8-NEXT: s_mov_b32 s1, 0xffc0 -; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_lshr_b32 s1, s0, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -243,13 +240,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0 ; GFX8-NEXT: s_add_i32 s1, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -276,13 +272,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; ; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: 
s_add_i32 s0, s0, 4 ; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -310,15 +305,14 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; ; GFX8-LABEL: s_add_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -349,15 +343,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg ; GFX8-LABEL: s_add_v2i16_fneg_lhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -391,15 +384,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg ; GFX8-LABEL: s_add_v2i16_fneg_rhs: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: 
s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -422,9 +414,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0x80008000 -; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: s_xor_b32 s1, s1, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_add_i32 s0, s0, s1 @@ -434,26 +425,23 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha ; ; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0x80008000 -; GFX8-NEXT: s_xor_b32 s0, s0, s2 -; GFX8-NEXT: s_xor_b32 s1, s1, s2 -; GFX8-NEXT: s_mov_b32 s3, 0xffff +; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_add_i32 s0, s0, s1 -; GFX8-NEXT: s_add_i32 s2, s2, s4 +; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; 
GFX8-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x80008000 -; GFX10-NEXT: s_xor_b32 s0, s0, s2 -; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000 +; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_add_i32 s0, s0, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll index 646705337aab..2280dc45e7de 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addo.ll @@ -967,7 +967,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) { ; GFX7-LABEL: uaddo_i16_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 @@ -980,7 +980,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) { ; GFX8-LABEL: uaddo_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 @@ -992,8 +992,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) { ; ; GFX9-LABEL: uaddo_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll index 2e117b2e0f37..99d3bf4f2c4e 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) { define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1 define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_andn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 
s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_and_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s1 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1 ; GFX6-LABEL: s_andn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: 
s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr ; GFX6-LABEL: s_andn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 ; GFX6-LABEL: s_andn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, 
s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 7d393cd26a44..a50950782f3c 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -794,24 +794,22 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; GFX6-LABEL: s_ashr_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s3 ; GFX6-NEXT: s_ashr_i32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s2, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s3 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s3 -; GFX8-NEXT: s_ashr_i32 s2, s2, s4 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s3, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_ashr_i32 s2, s2, s3 ; GFX8-NEXT: s_ashr_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_and_b32 s1, s2, 0xffff @@ -886,12 +884,12 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: ashr_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -994,45 +992,42 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; 
GFX6-LABEL: s_ashr_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, s4 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s6 ; GFX6-NEXT: s_ashr_i32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s5, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s5 -; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s5 -; GFX8-NEXT: s_sext_i32_i16 s7, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s5 -; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s5 -; GFX8-NEXT: s_ashr_i32 s4, s4, s7 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s5, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s6, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s7, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_ashr_i32 s4, s4, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s2 -; GFX8-NEXT: s_ashr_i32 s2, s6, s8 +; GFX8-NEXT: s_ashr_i32 s2, s5, s7 ; GFX8-NEXT: s_ashr_i32 s1, s1, s3 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s3 +; GFX8-NEXT: s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_or_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_or_b32 
s0, s0, s3 ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1191,79 +1186,76 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; GFX6-LABEL: s_ashr_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_sext_i32_i16 s1, s1 -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_sext_i32_i16 s0, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, s9 ; GFX6-NEXT: s_ashr_i32 s0, s0, s8 ; GFX6-NEXT: s_sext_i32_i16 s2, s2 ; GFX6-NEXT: s_sext_i32_i16 s3, s3 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, s10 ; GFX6-NEXT: s_ashr_i32 s3, s3, s11 ; GFX6-NEXT: s_sext_i32_i16 s5, s5 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sext_i32_i16 s4, s4 ; GFX6-NEXT: s_ashr_i32 s5, s5, s13 ; GFX6-NEXT: s_sext_i32_i16 s7, s7 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, s12 ; GFX6-NEXT: s_sext_i32_i16 s6, s6 ; GFX6-NEXT: s_ashr_i32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ashr_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s9, 0x100010 ; GFX8-NEXT: s_sext_i32_i16 s8, s0 -; GFX8-NEXT: s_bfe_i32 s0, s0, s9 -; GFX8-NEXT: s_sext_i32_i16 s10, s1 -; GFX8-NEXT: s_bfe_i32 s1, s1, s9 -; GFX8-NEXT: 
s_sext_i32_i16 s12, s3 -; GFX8-NEXT: s_bfe_i32 s3, s3, s9 -; GFX8-NEXT: s_sext_i32_i16 s13, s4 -; GFX8-NEXT: s_bfe_i32 s4, s4, s9 -; GFX8-NEXT: s_sext_i32_i16 s14, s5 -; GFX8-NEXT: s_bfe_i32 s5, s5, s9 -; GFX8-NEXT: s_sext_i32_i16 s16, s7 -; GFX8-NEXT: s_bfe_i32 s7, s7, s9 -; GFX8-NEXT: s_sext_i32_i16 s11, s2 -; GFX8-NEXT: s_bfe_i32 s2, s2, s9 -; GFX8-NEXT: s_sext_i32_i16 s15, s6 -; GFX8-NEXT: s_bfe_i32 s6, s6, s9 +; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s9, s1 +; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s12, s4 +; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s13, s5 +; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s10, s2 +; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s14, s6 +; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010 ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_ashr_i32 s4, s10, s14 +; GFX8-NEXT: s_ashr_i32 s4, s9, s13 ; GFX8-NEXT: s_ashr_i32 s1, s1, s5 -; GFX8-NEXT: s_ashr_i32 s3, s3, s7 -; GFX8-NEXT: s_mov_b32 s7, 0xffff -; GFX8-NEXT: s_ashr_i32 s5, s11, s15 +; GFX8-NEXT: s_sext_i32_i16 s11, s3 +; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010 +; GFX8-NEXT: s_sext_i32_i16 s15, s7 +; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010 +; GFX8-NEXT: s_ashr_i32 s5, s10, s14 ; GFX8-NEXT: s_ashr_i32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s4, s4, s7 -; GFX8-NEXT: s_ashr_i32 s8, s8, s13 -; GFX8-NEXT: s_ashr_i32 s6, s12, s16 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff +; GFX8-NEXT: s_ashr_i32 s8, s8, s12 +; GFX8-NEXT: s_ashr_i32 s6, s11, s15 +; GFX8-NEXT: s_ashr_i32 s3, s3, s7 ; GFX8-NEXT: s_or_b32 s1, s1, s4 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s4, s5, s7 +; GFX8-NEXT: s_and_b32 s4, s5, 0xffff ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_and_b32 s8, s8, s7 +; GFX8-NEXT: s_and_b32 s7, s8, 0xffff ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_and_b32 s4, s6, s7 -; GFX8-NEXT: 
s_or_b32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s4, s6, 0xffff +; GFX8-NEXT: s_or_b32 s0, s0, s7 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index 1a18d7f138cd..f017ca8ce2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -111,9 +111,8 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) { ; ; GFX10-LABEL: s_bswap_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s1, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -154,9 +153,8 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) ret <2 x i32> %bswap @@ -200,9 +198,8 @@ define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) { ; ; GFX10-LABEL: s_bswap_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: ; return to shader part epilog @@ -246,9 +243,8 @@ define i64 @v_bswap_i64(i64 %src) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v2, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %bswap = call i64 @llvm.bswap.i64(i64 %src) @@ -313,11 +309,10 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) { ; ; GFX10-LABEL: s_bswap_v2i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4 +; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, s3, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, s2, 0x10203 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 @@ -376,11 +371,10 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4 -; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4 -; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4 +; GFX10-NEXT: v_perm_b32 v4, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v5, 0, v3, 0x10203 +; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203 +; GFX10-NEXT: v_perm_b32 v3, 0, v2, 0x10203 ; GFX10-NEXT: v_mov_b32_e32 v0, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -457,12 +451,11 @@ define i16 @v_bswap_i16(i16 %src) { define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX7-LABEL: s_bswap_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s3, 0x80008 ; GFX7-NEXT: s_lshl_b32 s2, s0, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, s3 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX7-NEXT: s_or_b32 s0, s0, s2 ; 
GFX7-NEXT: s_lshl_b32 s2, s1, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s3 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008 ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 @@ -647,9 +640,8 @@ define i64 @v_bswap_i48(i64 %src) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x10203 -; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4 -; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4 +; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203 +; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x10203 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2] ; GFX10-NEXT: s_setpc_b64 s[30:31] %trunc = trunc i64 %src to i48 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll index 785efb6c4375..f4782f54b891 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-mul.ll @@ -901,30 +901,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 ; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 ; GFX10-NEXT: v_pk_add_f16 v0, v2, v0 -; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-NEXT: v_pk_add_f16 v1, v2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs: @@ -934,23 +933,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 +; GFX10-CONTRACT-NEXT: 
v_and_or_b32 v4, 0xffff, v5, s4 ; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4 ; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs: @@ -959,30 +957,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7 +; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v3, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v4, v2 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 ; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0 
-; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v5, s4 ; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1 ; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs: @@ -992,23 +989,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v6 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v2, v7 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v4, v8 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v3, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v5, s4 ; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4 ; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4 -; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3 +; GFX10-UNSAFE-NEXT: v_and_or_b32 
v1, 0xffff, v1, s4 +; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 ; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <3 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll index bb0a3b352eb3..84002c0e3f22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-ext-neg-mul.ll @@ -113,10 +113,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x, ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -155,10 +154,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x, ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -198,10 +196,9 @@ define 
amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> % ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -240,10 +237,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> % ; ; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2: ; GFX10-DENORM: ; %bb.0: ; %entry -; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6 +; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6 -; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5 ; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll index c40b18713478..088c86548401 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll @@ -488,10 +488,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul: @@ -511,10 +510,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> % ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y @@ -567,10 +565,9 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-NEXT: v_add_f16_e64 v2, v4, -v0 ; GFX10-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v5, -v1 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs: @@ -590,10 +587,9 @@ define <4 x half> 
@test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v4, -v0 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v5, -v1 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] .entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll index f148d9b10ac7..fb96801ca2cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll @@ -90,9 +90,8 @@ define half @test_f16_sub_ext_neg_mul(half %x, half %y, half %z) { ; GFX10-CONTRACT: ; %bb.0: ; %entry ; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CONTRACT-NEXT: s_mov_b32 s4, 0x8000 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31] ; @@ -278,10 +277,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul: @@ -301,10 +299,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x ; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4 ; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5 -; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0 -; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1 +; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31] entry: %a = fmul <4 x half> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll index a8e207856f5d..bb35b9c25d25 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i8.ll @@ -763,29 +763,26 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in ; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s7, 0x80008 -; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s8, s0, s7 -; GCN-NEXT: s_and_b32 s6, s0, s2 -; GCN-NEXT: s_lshl_b32 s8, s8, 8 -; GCN-NEXT: s_or_b32 s6, s6, s8 -; GCN-NEXT: s_mov_b32 s8, 0x80010 -; GCN-NEXT: s_lshr_b32 s3, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s8 +; GCN-NEXT: s_bfe_u32 s6, s0, 0x80008 +; 
GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_and_b32 s5, s0, 0xff +; GCN-NEXT: s_lshl_b32 s6, s6, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s6, s0 -; GCN-NEXT: s_lshl_b32 s3, s3, 24 -; GCN-NEXT: s_or_b32 s0, s0, s3 -; GCN-NEXT: s_bfe_u32 s3, s1, s7 -; GCN-NEXT: s_lshr_b32 s5, s1, 24 -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s8 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 s0, s5, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s2, s2, s5 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s5, 24 +; GCN-NEXT: s_lshl_b32 s2, s3, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: s_lshr_b32 s2, s4, 2 ; GCN-NEXT: s_cmp_eq_u32 s2, 1 @@ -798,32 +795,29 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in ; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: 
s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 ; GFX10-NEXT: s_and_b32 s1, s4, 3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 @@ -934,7 +928,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -942,9 +935,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3 +; GFX10-NEXT: 
v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2 @@ -1063,7 +1056,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -1071,9 +1063,9 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 @@ -1093,37 +1085,34 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in ; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s6, 0x80008 -; GCN-NEXT: s_movk_i32 s2, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s7, 
s0, s6 -; GCN-NEXT: s_and_b32 s5, s0, s2 -; GCN-NEXT: s_lshl_b32 s7, s7, 8 -; GCN-NEXT: s_or_b32 s5, s5, s7 -; GCN-NEXT: s_mov_b32 s7, 0x80010 -; GCN-NEXT: s_lshr_b32 s3, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s7 +; GCN-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GCN-NEXT: s_lshr_b32 s2, s0, 24 +; GCN-NEXT: s_and_b32 s4, s0, 0xff +; GCN-NEXT: s_lshl_b32 s5, s5, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s5, s0 -; GCN-NEXT: s_lshl_b32 s3, s3, 24 -; GCN-NEXT: s_or_b32 s0, s0, s3 -; GCN-NEXT: s_bfe_u32 s3, s1, s6 -; GCN-NEXT: s_lshr_b32 s4, s1, 24 -; GCN-NEXT: s_and_b32 s2, s1, s2 -; GCN-NEXT: s_lshl_b32 s3, s3, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s7 -; GCN-NEXT: s_or_b32 s2, s2, s3 +; GCN-NEXT: s_or_b32 s0, s4, s0 +; GCN-NEXT: s_lshl_b32 s2, s2, 24 +; GCN-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GCN-NEXT: s_lshr_b32 s3, s1, 24 +; GCN-NEXT: s_or_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s2, s1, 0xff +; GCN-NEXT: s_lshl_b32 s4, s4, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s2, s2, s4 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_lshl_b32 s2, s4, 24 +; GCN-NEXT: s_lshl_b32 s2, s3, 24 ; GCN-NEXT: s_or_b32 s1, s1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to shader part epilog @@ -1131,33 +1120,30 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in ; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s3, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 ; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s7, s4 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s5, 24 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s3, s0 ; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo @@ -2089,45 +2075,42 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s11, 0x80008 -; GCN-NEXT: s_movk_i32 s9, 0xff ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s12, s0, s11 -; GCN-NEXT: s_and_b32 s10, s0, s9 -; GCN-NEXT: s_lshl_b32 s12, s12, 8 -; GCN-NEXT: s_or_b32 s10, s10, s12 -; GCN-NEXT: s_mov_b32 s12, 0x80010 +; GCN-NEXT: s_bfe_u32 s10, 
s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s12 +; GCN-NEXT: s_and_b32 s9, s0, 0xff +; GCN-NEXT: s_lshl_b32 s10, s10, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GCN-NEXT: s_or_b32 s9, s9, s10 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s10, s0 +; GCN-NEXT: s_or_b32 s0, s9, s0 ; GCN-NEXT: s_lshl_b32 s5, s5, 24 -; GCN-NEXT: s_bfe_u32 s10, s1, s11 +; GCN-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s5 -; GCN-NEXT: s_and_b32 s5, s1, s9 -; GCN-NEXT: s_lshl_b32 s10, s10, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s12 -; GCN-NEXT: s_or_b32 s5, s5, s10 +; GCN-NEXT: s_and_b32 s5, s1, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s5, s5, s9 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s5, s1 ; GCN-NEXT: s_lshl_b32 s5, s6, 24 -; GCN-NEXT: s_bfe_u32 s6, s2, s11 +; GCN-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s5 -; GCN-NEXT: s_and_b32 s5, s2, s9 +; GCN-NEXT: s_and_b32 s5, s2, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s12 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s5, s2 ; GCN-NEXT: s_lshl_b32 s5, s7, 24 -; GCN-NEXT: s_bfe_u32 s6, s3, s11 +; GCN-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s8, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s5 -; GCN-NEXT: s_and_b32 s5, s3, s9 +; GCN-NEXT: s_and_b32 s5, s3, 0xff ; GCN-NEXT: s_lshl_b32 s6, s6, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s12 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s5, s5, s6 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s5, s3 @@ -2148,50 +2131,47 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)* ; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 
0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s17, s17, 8 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_and_b32 s5, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s8, s16, s17 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; 
GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s6 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 -; GFX10-NEXT: s_or_b32 s3, s5, s3 -; GFX10-NEXT: s_lshl_b32 s5, s11, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 +; GFX10-NEXT: s_or_b32 s3, s6, s3 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 ; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s8 ; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: s_cselect_b32 s0, s1, s0 @@ -2371,37 +2351,35 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)* ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v5, 8 +; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff -; GFX10-NEXT: v_mov_b32_e32 v4, 0xff -; GFX10-NEXT: v_mov_b32_e32 v6, 16 +; GFX10-NEXT: v_mov_b32_e32 v5, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: s_lshr_b32 s0, s2, 2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14 -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7 -; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 +; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v1, v1, v12, v7 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 -; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9 -; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10 +; GFX10-NEXT: v_or3_b32 v2, v2, v14, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v3, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2 ; GFX10-NEXT: v_or3_b32 v1, v4, v3, v5 @@ -2583,43 +2561,41 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s5, 16 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: v_mov_b32_e32 v0, 0xff -; GFX10-NEXT: v_mov_b32_e32 v7, 16 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2 +; GFX10-NEXT: v_mov_b32_e32 v1, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_or3_b32 v3, v3, 
v14, v9 -; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12 -; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11 +; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v6, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v11 +; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8 -; GFX10-NEXT: v_or3_b32 v0, v0, v7, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7 +; GFX10-NEXT: v_or3_b32 v0, v0, v1, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 @@ -2633,47 +2609,46 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GCN-NEXT: s_mov_b32 s10, 0x80008 -; GCN-NEXT: s_movk_i32 s8, 0xff ; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_and_b32_e32 v0, 3, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_bfe_u32 s11, s0, s10 -; GCN-NEXT: s_and_b32 s9, s0, s8 -; GCN-NEXT: s_lshl_b32 s11, s11, 8 -; GCN-NEXT: s_or_b32 s9, s9, s11 -; GCN-NEXT: s_mov_b32 s11, 0x80010 +; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GCN-NEXT: s_lshr_b32 s4, s0, 24 -; GCN-NEXT: s_bfe_u32 s0, s0, s11 +; GCN-NEXT: s_and_b32 s8, s0, 0xff +; GCN-NEXT: s_lshl_b32 s9, s9, 8 +; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010 +; 
GCN-NEXT: s_or_b32 s8, s8, s9 ; GCN-NEXT: s_lshl_b32 s0, s0, 16 -; GCN-NEXT: s_or_b32 s0, s9, s0 +; GCN-NEXT: s_or_b32 s0, s8, s0 ; GCN-NEXT: s_lshl_b32 s4, s4, 24 -; GCN-NEXT: s_bfe_u32 s9, s1, s10 +; GCN-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GCN-NEXT: s_lshr_b32 s5, s1, 24 ; GCN-NEXT: s_or_b32 s0, s0, s4 -; GCN-NEXT: s_and_b32 s4, s1, s8 -; GCN-NEXT: s_lshl_b32 s9, s9, 8 -; GCN-NEXT: s_bfe_u32 s1, s1, s11 -; GCN-NEXT: s_or_b32 s4, s4, s9 +; GCN-NEXT: s_and_b32 s4, s1, 0xff +; GCN-NEXT: s_lshl_b32 s8, s8, 8 +; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GCN-NEXT: s_or_b32 s4, s4, s8 ; GCN-NEXT: s_lshl_b32 s1, s1, 16 ; GCN-NEXT: s_or_b32 s1, s4, s1 ; GCN-NEXT: s_lshl_b32 s4, s5, 24 -; GCN-NEXT: s_bfe_u32 s5, s2, s10 +; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GCN-NEXT: s_lshr_b32 s6, s2, 24 ; GCN-NEXT: s_or_b32 s1, s1, s4 -; GCN-NEXT: s_and_b32 s4, s2, s8 +; GCN-NEXT: s_and_b32 s4, s2, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s2, s2, s11 +; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s2, s2, 16 ; GCN-NEXT: s_or_b32 s2, s4, s2 ; GCN-NEXT: s_lshl_b32 s4, s6, 24 -; GCN-NEXT: s_bfe_u32 s5, s3, s10 +; GCN-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GCN-NEXT: s_lshr_b32 s7, s3, 24 ; GCN-NEXT: s_or_b32 s2, s2, s4 -; GCN-NEXT: s_and_b32 s4, s3, s8 +; GCN-NEXT: s_and_b32 s4, s3, 0xff ; GCN-NEXT: s_lshl_b32 s5, s5, 8 -; GCN-NEXT: s_bfe_u32 s3, s3, s11 +; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GCN-NEXT: s_or_b32 s4, s4, s5 ; GCN-NEXT: s_lshl_b32 s3, s3, 16 ; GCN-NEXT: s_or_b32 s3, s4, s3 @@ -2687,9 +2662,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GCN-NEXT: v_mov_b32_e32 v5, s3 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 -; GCN-NEXT: v_and_b32_e32 v0, 3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc -; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: ; return to 
shader part epilog @@ -2697,58 +2670,55 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)* ; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s5, 0x80008 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s14, s1, s5 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s4 -; GFX10-NEXT: s_and_b32 s13, s1, s4 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_or_b32 s1, s1, s8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s1, s1, s5 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_bfe_u32 s16, s2, s5 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_and_b32 
s15, s2, s4 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s6 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s7, s15, s16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_bfe_u32 s5, s3, s5 +; GFX10-NEXT: s_or_b32 s10, s12, s13 +; GFX10-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 -; GFX10-NEXT: s_or_b32 s2, s7, s2 -; GFX10-NEXT: s_lshl_b32 s7, s9, 24 -; GFX10-NEXT: s_and_b32 s4, s3, s4 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s4, s6, 24 +; GFX10-NEXT: s_and_b32 s6, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s1, s3, s6 -; GFX10-NEXT: s_or_b32 s2, s2, s7 -; GFX10-NEXT: s_lshr_b32 s10, s3, 24 -; GFX10-NEXT: s_or_b32 s3, s4, s5 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s3, s6, s5 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 ; GFX10-NEXT: s_or_b32 s0, s3, s1 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 ; GFX10-NEXT: s_or_b32 s3, s0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 140c3ff69407..9e1dc06de933 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -901,19 +901,17 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv <2 x float> %a, %b, !fpmath !0 @@ -1334,19 +1332,17 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) { ; GFX10-IEEE: ; %bb.0: ; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4 -; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6 -; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0| +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4 +; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1| ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4 ; GFX10-IEEE-NEXT: 
v_rcp_f32_e32 v0, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0 +; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31] ; @@ -1492,19 +1488,17 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) { ; GFX10-FLUSH: ; %bb.0: ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000 -; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4 -; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6 -; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2| +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4 +; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3| ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 +; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5 ; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1 ; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31] %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 1c3c79f8b867..9cb1f58f4ad3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -96,12 +96,11 @@ define 
amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 4 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -156,12 +155,11 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, s32 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -327,14 +325,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x104 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: 
v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -396,16 +393,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -548,14 +545,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0x4004 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 
v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -619,16 +615,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 -; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll index 5d8ae5851266..9f7fe19a17ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -325,10 +325,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1 -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1 +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -447,11 +445,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 s2, 0x80000000 -; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1| -; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2| -; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3| +; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1| +; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2| +; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3| ; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index 4667975ca0d0..7843173f11b3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -473,9 +473,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0x80008000 -; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 ; 
GFX10-NEXT: v_log_f16_e32 v2, v0 ; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index bb019f99b5ff..cb407ee28980 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -9,8 +9,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x60001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -39,9 +39,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 -; GFX8-NEXT: s_and_b32 s1, s1, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -71,9 +71,9 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: s_and_b32 s1, s1, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -102,11 +102,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-LABEL: s_fshl_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; 
GFX10-NEXT: s_movk_i32 s3, 0x7f -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_and_b32 s2, s2, 0x7f +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -123,8 +122,8 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_lshrrev_b16 v1, v1, s1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -250,10 +249,9 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -541,68 +539,65 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_lshr_b32 s1, s1, 1 ; GFX6-NEXT: s_lshl_b32 s2, s3, s2 ; GFX6-NEXT: s_lshr_b32 s1, s1, s4 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_or_b32 s1, s2, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; 
GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshl_v2i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_and_b32 s6, s2, 7 -; GFX8-NEXT: s_lshr_b32 s3, s0, 8 -; GFX8-NEXT: s_lshl_b32 s0, s0, s6 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 +; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshr_b32 s1, s1, 1 +; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshl_b32 s0, s0, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s4, s6 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s5 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s2, s3, s2 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshl_v2i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_and_b32 s6, s2, 7 -; GFX9-NEXT: s_lshr_b32 s3, s0, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s6 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 +; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshr_b32 s1, s1, 1 +; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, 
s4, s6 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s5 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_or_b32 s1, s1, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s6 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s0, s0, s6 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,28 +605,27 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshl_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_movk_i32 s6, 0xff ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 -; GFX10-NEXT: s_and_b32 s4, s4, s6 -; GFX10-NEXT: s_and_b32 s7, s2, 7 -; GFX10-NEXT: s_and_b32 s1, s1, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s6, s2, 7 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s7, s5, 7 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 ; GFX10-NEXT: s_lshr_b32 s4, s4, 1 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s7 +; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s2, s3, s4 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s6 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s6 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,20 +723,20 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 ; GFX10-NEXT: v_lshrrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 7, v7 ; GFX10-NEXT: v_lshrrev_b16 v1, 1, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, v3, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v3, v4 @@ -785,22 +779,21 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_bfe_u32 s4, s1, 0x80010 ; GFX6-NEXT: s_andn2_b32 s6, 7, s7 ; GFX6-NEXT: s_lshr_b32 s4, s4, 1 -; GFX6-NEXT: s_movk_i32 s10, 0xff ; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_and_b32 s4, s8, 7 ; GFX6-NEXT: s_andn2_b32 s6, 7, s8 ; GFX6-NEXT: s_lshr_b32 s1, s1, 25 -; GFX6-NEXT: s_and_b32 s2, s2, s10 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 ; GFX6-NEXT: s_or_b32 s1, s4, s1 ; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s3, s10 +; GFX6-NEXT: s_and_b32 s2, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s10 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 @@ -808,11 +801,10 @@ define amdgpu_ps i32 
@s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; ; GFX8-LABEL: s_fshl_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s1, 8 ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 ; GFX8-NEXT: s_lshr_b32 s8, s1, 24 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_lshr_b32 s9, s2, 8 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 @@ -828,7 +820,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_lshl_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshr_b32 s3, s3, 1 @@ -836,37 +828,36 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_lshl_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshr_b32 s4, s4, 1 ; GFX8-NEXT: s_lshr_b32 s3, s4, s3 ; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_and_b32 s3, s11, 7 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s3, s5, s3 ; GFX8-NEXT: s_lshr_b32 s5, s8, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshr_b32 s4, s5, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to 
shader part epilog ; ; GFX9-LABEL: s_fshl_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s6, s1, 8 ; GFX9-NEXT: s_lshr_b32 s7, s1, 16 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_lshr_b32 s9, s2, 8 ; GFX9-NEXT: s_lshr_b32 s10, s2, 16 @@ -882,7 +873,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_lshl_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 @@ -890,79 +881,78 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_or_b32 s1, s1, s2 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_lshl_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshr_b32 s4, s4, 1 ; GFX9-NEXT: s_lshr_b32 s3, s4, s3 ; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: s_and_b32 s3, s11, 7 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s3, s5, s3 ; GFX9-NEXT: s_lshr_b32 s5, s8, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshr_b32 s4, s5, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: 
s_fshl_v4i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s11, 0xff ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 ; GFX10-NEXT: s_lshr_b32 s7, s1, 16 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshr_b32 s9, s2, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_lshr_b32 s12, s2, 24 -; GFX10-NEXT: s_and_b32 s13, s2, 7 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 -; GFX10-NEXT: s_and_b32 s2, s6, s11 +; GFX10-NEXT: s_and_b32 s2, s6, 0xff ; GFX10-NEXT: s_and_b32 s6, s9, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s9, 7, s9 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s13 +; GFX10-NEXT: s_lshl_b32 s0, s0, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s2, s2, s9 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 -; GFX10-NEXT: s_and_b32 s2, s7, s11 +; GFX10-NEXT: s_and_b32 s2, s7, 0xff ; GFX10-NEXT: s_and_b32 s3, s10, 7 ; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX10-NEXT: s_andn2_b32 s6, 7, s10 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_lshl_b32 s3, s4, s3 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s4, s12, 7 -; GFX10-NEXT: s_andn2_b32 s6, 7, s12 +; GFX10-NEXT: s_and_b32 s4, s11, 7 +; GFX10-NEXT: s_andn2_b32 s6, 7, s11 ; GFX10-NEXT: s_lshr_b32 s7, s8, 1 ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s7, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s11 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s11 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, 
s11 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s11 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,51 +1126,50 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v2 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v1 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v2 -; GFX10-NEXT: v_xor_b32_e32 v10, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_lshlrev_b16 v0, v11, v0 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v8 +; GFX10-NEXT: v_lshlrev_b16 v0, v10, v0 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v8 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX10-NEXT: v_and_b32_e32 v12, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 +; GFX10-NEXT: v_and_b32_e32 v12, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b16 v3, v8, v3 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v9 +; GFX10-NEXT: v_xor_b32_e32 v8, -1, v11 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_xor_b32_e32 v13, -1, v2 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshrrev_b16 v6, 1, v6 -; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 +; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 ; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_lshrrev_b16 
v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_and_b32_e32 v13, 7, v13 ; GFX10-NEXT: v_lshrrev_b16 v7, 1, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v9, 7, v9 ; GFX10-NEXT: v_lshrrev_b16 v12, 1, v12 -; GFX10-NEXT: v_lshrrev_b16 v6, v11, v6 -; GFX10-NEXT: v_lshlrev_b16 v4, v9, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v10, v6 +; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v8, v1 ; GFX10-NEXT: v_lshlrev_b16 v2, v2, v5 ; GFX10-NEXT: v_lshrrev_b16 v5, v13, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v10, v12 +; GFX10-NEXT: v_lshrrev_b16 v7, v9, v12 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 ; GFX10-NEXT: v_mov_b32_e32 v6, 8 ; GFX10-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1199,8 +1188,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1230,8 +1219,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, 
s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1261,8 +1250,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x170001 @@ -1289,9 +1278,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-LABEL: s_fshl_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x170001 -; GFX10-NEXT: s_and_b32 s2, s2, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1308,8 +1296,8 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1433,11 +1421,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v2 -; 
GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v1, v4, v1 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, v3, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i24 @llvm.fshl.i24(i24 %lhs, i24 %rhs, i24 %amt) @@ -1449,25 +1436,23 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: s_or_b32 s1, s7, s1 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1476,20 +1461,20 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_or_b32 s1, s1, s6 ; GFX6-NEXT: s_lshr_b32 s6, s2, 16 ; GFX6-NEXT: s_lshr_b32 s7, s2, 24 -; GFX6-NEXT: s_and_b32 s10, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s10, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; 
GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_or_b32 s2, s10, s2 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_lshr_b32 s8, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s7, s3 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1498,10 +1483,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_or_b32 s3, s3, s6 ; GFX6-NEXT: s_lshr_b32 s6, s4, 16 ; GFX6-NEXT: s_lshr_b32 s7, s4, 24 -; GFX6-NEXT: s_and_b32 s10, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s10, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX6-NEXT: s_or_b32 s4, s10, s4 @@ -1515,14 +1500,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX6-NEXT: s_lshr_b32 s8, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s6, s8, s9 +; GFX6-NEXT: s_and_b32 s6, s8, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s7, s5 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1555,6 +1540,7 @@ define amdgpu_ps i48 
@s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX6-NEXT: s_lshr_b32 s0, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v1, s1, v1 ; GFX6-NEXT: v_lshr_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1579,25 +1565,24 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-LABEL: s_fshl_v2i24: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 -; GFX8-NEXT: s_movk_i32 s10, 0xff -; GFX8-NEXT: s_and_b32 s6, s6, s10 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1606,23 +1591,23 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_or_b32 s1, s1, s6 ; GFX8-NEXT: s_lshr_b32 s6, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s2, 16 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 
-; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX8-NEXT: s_lshr_b32 s9, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1630,13 +1615,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s6 ; GFX8-NEXT: s_lshr_b32 s6, s4, 8 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 -; GFX8-NEXT: s_and_b32 s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1649,14 +1634,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: 
v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s6, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_bfe_u32 s6, s6, 0x100000 @@ -1698,7 +1683,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1712,27 +1697,26 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: s_movk_i32 s12, 0xff -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 
s1, s1, 0xff ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -1740,24 +1724,24 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s1, s1, s7 ; GFX9-NEXT: s_lshr_b32 s7, s2, 8 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_lshr_b32 s9, s2, 16 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_lshr_b32 s11, s3, 8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s3, s10, s3 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -1765,14 +1749,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s7, s4, 8 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; GFX9-NEXT: s_and_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s10, s4, 24 -; 
GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s7, s7, 16 @@ -1780,10 +1764,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_lshr_b32 s11, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s7, s11, s12 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s7, s11, 0xff ; GFX9-NEXT: s_or_b32 s5, s10, s5 ; GFX9-NEXT: s_bfe_u32 s7, s7, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 @@ -1822,10 +1806,11 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s1, v1, v2 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1840,120 +1825,117 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s10, s1, 8 -; GFX10-NEXT: s_bfe_u32 
s11, 8, 0x100000 +; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s1, s1, s9 -; GFX10-NEXT: s_lshr_b32 s6, s0, 8 +; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshl_b32 s1, s1, s11 -; GFX10-NEXT: s_and_b32 s6, s6, s9 -; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: s_lshr_b32 s8, s4, 8 -; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, s10 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_or_b32 s0, s0, s6 +; GFX10-NEXT: s_bfe_u32 s6, s7, 0x100000 +; GFX10-NEXT: s_lshr_b32 s7, s4, 8 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_and_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s6, s6, s11 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s7, s7, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_lshr_b32 s12, s4, 24 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s10, s9 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, s10 +; GFX10-NEXT: s_lshr_b32 s13, s5, 8 ; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 -; GFX10-NEXT: s_lshr_b32 s12, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_lshr_b32 s13, s5, 8 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_and_b32 s8, s10, s9 -; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s11, 0xff ; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 
s8, s8, 16 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s8 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: s_lshl_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s8, s13, s9 -; GFX10-NEXT: s_or_b32 s5, s12, s5 -; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s8, s8, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s8 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_and_b32 s8, s8, s9 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: s_and_b32 s12, s2, s9 -; GFX10-NEXT: s_lshl_b32 s8, s8, s11 -; GFX10-NEXT: s_and_b32 s10, s10, s9 -; GFX10-NEXT: s_or_b32 s8, s12, s8 -; GFX10-NEXT: s_lshr_b32 s2, s2, 24 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 -; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s8, s10, 0x100000 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s7 +; GFX10-NEXT: s_and_b32 s7, s13, 0xff +; GFX10-NEXT: s_or_b32 s5, s12, s5 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s7 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshl_b32 s1, s1, s10 +; GFX10-NEXT: s_lshr_b32 s9, s2, 16 +; 
GFX10-NEXT: s_or_b32 s1, s8, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_or_b32 s2, s2, s8 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 8 +; GFX10-NEXT: s_and_b32 s5, s9, 0xff +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s8, 16 -; GFX10-NEXT: s_lshr_b32 s8, s3, 8 -; GFX10-NEXT: s_and_b32 s3, s3, s9 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_lshl_b32 s3, s3, s11 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s8, s9 -; GFX10-NEXT: s_mov_b32 s5, 0xffffff +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: s_lshl_b32 s4, s4, 16 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v0 ; 
GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffff -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshr_b32 s3, s4, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 +; GFX10-NEXT: s_lshl_b32 s6, s6, 16 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s7, s7, 16 +; GFX10-NEXT: v_lshrrev_b32_e64 v2, v2, s2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: s_lshr_b32 s2, s3, 1 ; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_or_b32 s1, s1, s7 -; GFX10-NEXT: v_lshrrev_b32_e64 v3, v4, s2 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v0, v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v3, v3, s2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshl_or_b32 v1, s1, v1, v3 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 @@ -2141,12 +2123,11 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; 
GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_bfe_u32 v2, v2, 1, 23 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 ; GFX10-NEXT: v_bfe_u32 v3, v3, 1, 23 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2177,12 +2158,12 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v7, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v5, v3 @@ -3176,19 +3157,18 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX6-LABEL: s_fshl_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s6, s4, 15 -; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_lshl_b32 s0, s0, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 
0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3229,22 +3209,20 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX9-LABEL: s_fshl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshr_b32 s5, s4, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s3 +; GFX9-NEXT: s_lshl_b32 s3, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshr_b32 s3, s3, 1 +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s4 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, s4 @@ -3254,22 +3232,20 @@ define amdgpu_ps i32 @s_fshl_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX10-LABEL: s_fshl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_mov_b32 s3, 0xf000f -; GFX10-NEXT: s_and_b32 s7, s1, s5 +; GFX10-NEXT: s_and_b32 s6, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-NEXT: s_lshr_b32 s7, s7, 0x10001 +; GFX10-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX10-NEXT: s_lshr_b32 s6, s6, 0x10001 ; GFX10-NEXT: s_lshr_b32 s1, s1, 1 -; GFX10-NEXT: s_and_b32 s4, s2, s3 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s7, s1 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s4 +; 
GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s3, s4, s5 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s5 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s2, 16 -; GFX10-NEXT: s_lshl_b32 s3, s3, s6 ; GFX10-NEXT: s_lshr_b32 s1, s1, s2 ; GFX10-NEXT: s_lshr_b32 s2, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 @@ -3347,10 +3323,9 @@ define <2 x i16> @v_fshl_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3420,19 +3395,18 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX6: ; %bb.0: ; GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_mov_b32 s0, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s2, s2, s0 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 -; GFX6-NEXT: v_lshr_b32_e32 v0, s2, v0 +; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s0 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; 
GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3488,13 +3462,12 @@ define amdgpu_ps float @v_fshl_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX10-LABEL: v_fshl_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshr_b32 s2, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshr_b32 s1, s1, 0x10001 -; GFX10-NEXT: s_lshr_b32 s2, s3, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 +; GFX10-NEXT: s_lshr_b32 s2, s2, 1 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, s0 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, s1 @@ -3556,13 +3529,12 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; ; GFX9-LABEL: v_fshl_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s2, 16 +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s4 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: v_pk_lshrrev_b16 v0, s1, v0 @@ -3571,15 +3543,14 @@ define amdgpu_ps float @v_fshl_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; ; GFX10-LABEL: v_fshl_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, 
s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s1, s2, s4 +; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3592,19 +3563,18 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; GFX6-LABEL: v_fshl_v2i16_vss: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s4, s2, 15 -; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3642,18 +3612,16 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX9-LABEL: v_fshl_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 
0xffff ; GFX9-NEXT: s_lshr_b32 s0, s0, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s3 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s3, s1, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s1 ; GFX9-NEXT: s_lshr_b32 s1, s2, s3 @@ -3663,18 +3631,16 @@ define amdgpu_ps float @v_fshl_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX10-LABEL: v_fshl_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s3, 0xffff -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_and_b32 s5, s0, s3 +; GFX10-NEXT: s_and_b32 s3, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s0, s0, 16 -; GFX10-NEXT: s_lshr_b32 s5, s5, 0x10001 +; GFX10-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX10-NEXT: s_lshr_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s5, s0 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s2, v0 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s1 ; GFX10-NEXT: s_lshr_b32 s1, s2, s3 @@ -3704,19 +3670,18 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-LABEL: s_fshl_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_and_b32 s12, s8, 15 -; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_lshl_b32 s0, s0, s12 -; GFX6-NEXT: s_mov_b32 s12, 0xf0001 -; GFX6-NEXT: s_bfe_u32 s4, s4, s12 +; GFX6-NEXT: s_bfe_u32 s12, s12, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX6-NEXT: s_lshl_b32 s0, s0, s12 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 ; 
GFX6-NEXT: s_and_b32 s4, s9, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s9 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s12 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3724,7 +3689,7 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_andn2_b32 s5, 15, s10 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s4 -; GFX6-NEXT: s_bfe_u32 s4, s6, s12 +; GFX6-NEXT: s_bfe_u32 s4, s6, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s2, s2, s4 @@ -3732,7 +3697,7 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_andn2_b32 s5, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 -; GFX6-NEXT: s_bfe_u32 s4, s7, s12 +; GFX6-NEXT: s_bfe_u32 s4, s7, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x100000 @@ -3801,42 +3766,39 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX9-LABEL: s_fshl_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_lshl_b32 s7, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s2, s2, s8 -; GFX9-NEXT: s_lshr_b32 s7, s7, 1 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s7, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: 
s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s6, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s6, s7, s8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s2, s2, 0x10001 +; GFX9-NEXT: s_lshr_b32 s6, s6, 1 +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s4, s7, s10 +; GFX9-NEXT: s_lshr_b32 s4, s6, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 ; GFX9-NEXT: s_lshr_b32 s6, s2, 16 ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_lshl_b32 s2, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 -; GFX9-NEXT: s_lshr_b32 s3, s3, s8 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff +; GFX9-NEXT: s_lshr_b32 s3, s3, 0x10001 ; GFX9-NEXT: s_lshr_b32 s2, s2, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s4, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s3, s3, s5 @@ -3846,40 +3808,37 @@ define amdgpu_ps <2 x i32> @s_fshl_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX10-LABEL: s_fshl_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s10, 0xffff -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_mov_b32 s8, 0x10001 -; GFX10-NEXT: s_and_b32 s12, s2, s10 +; GFX10-NEXT: s_and_b32 s9, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s2, s2, 16 -; GFX10-NEXT: s_and_b32 s7, s4, s6 -; GFX10-NEXT: s_lshr_b32 s12, s12, s8 +; GFX10-NEXT: s_and_b32 s6, 
s4, 0xf000f +; GFX10-NEXT: s_lshr_b32 s9, s9, 0x10001 ; GFX10-NEXT: s_lshr_b32 s2, s2, 1 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_lshr_b32 s11, s7, 16 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s7, s9, s11 -; GFX10-NEXT: s_lshr_b32 s9, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s10 -; GFX10-NEXT: s_lshr_b32 s11, s4, 16 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s7, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s6, 16 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s9, s2 +; GFX10-NEXT: s_lshl_b32 s0, s0, s6 +; GFX10-NEXT: s_lshl_b32 s6, s7, s8 +; GFX10-NEXT: s_lshr_b32 s7, s2, 16 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s11 -; GFX10-NEXT: s_and_b32 s9, s3, s10 +; GFX10-NEXT: s_lshr_b32 s4, s7, s8 +; GFX10-NEXT: s_and_b32 s8, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s3, s3, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 -; GFX10-NEXT: s_and_b32 s4, s5, s6 -; GFX10-NEXT: s_lshr_b32 s8, s9, s8 +; GFX10-NEXT: s_and_b32 s4, s5, 0xf000f +; GFX10-NEXT: s_lshr_b32 s8, s8, 0x10001 ; GFX10-NEXT: s_lshr_b32 s3, s3, 1 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s7 -; GFX10-NEXT: s_andn2_b32 s5, s6, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s5, 0xf000f, s5 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s8, s3 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 ; GFX10-NEXT: s_lshl_b32 s4, s6, s7 ; GFX10-NEXT: s_lshr_b32 s6, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s10 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s7, s5, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_lshr_b32 s5, s6, s7 @@ -4003,16 +3962,15 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v6, -1, 
v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, 1, v2 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, 1, v3 op_sel_hi:[0,1] +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v4, v0 -; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v5, v1 +; GFX10-NEXT: v_pk_lshrrev_b16 v2, v6, v2 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v7, v3 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -4893,14 +4851,13 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] -; GFX10-NEXT: v_and_b32_e32 v18, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v8 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v8 +; GFX10-NEXT: v_lshrrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_lshlrev_b32_e32 v12, 31, v6 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], 1, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v18 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v18, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v5, v5, v12 ; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 @@ -5104,44 +5061,43 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; ; GFX10-LABEL: v_fshl_i128_ssv: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_movk_i32 s9, 0x7f -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 -; GFX10-NEXT: v_and_b32_e32 v12, s9, v0 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v0 +; GFX10-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX10-NEXT: 
s_mov_b32 s8, 0 ; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1 -; GFX10-NEXT: v_and_b32_e32 v13, s9, v4 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_lshl_b32 s9, s6, 31 -; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX10-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] +; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_lshrrev_b64 v[0:1], v0, s[6:7] ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 -; GFX10-NEXT: v_lshlrev_b64 v[8:9], v0, s[6:7] -; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX10-NEXT: v_or_b32_e32 v7, v7, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: 
v_cndmask_b32_e64 v6, v8, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, s3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s8, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s9, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, v3, s0 @@ -6343,14 +6299,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s7, 0x7f -; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] -; GFX10-NEXT: v_and_b32_e32 v27, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v27, 0x7f, v16 ; GFX10-NEXT: v_xor_b32_e32 v16, -1, v16 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[8:9] ; GFX10-NEXT: v_lshlrev_b32_e32 v21, 31, v10 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_sub_nc_u32_e32 v17, 64, v27 -; GFX10-NEXT: v_and_b32_e32 v28, s7, v16 +; GFX10-NEXT: v_and_b32_e32 v28, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[18:19], v27, v[2:3] ; GFX10-NEXT: v_or_b32_e32 v9, v9, v21 ; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 @@ -6382,7 +6337,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v9, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, v0, s4 -; GFX10-NEXT: v_and_b32_e32 v23, s7, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_or_b32_e32 v0, v21, v3 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v24, 0, v1, s4 @@ -6391,7 +6346,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_or_b32_e32 v2, v2, v9 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], 1, v[12:13] ; GFX10-NEXT: v_lshlrev_b32_e32 v16, 31, v14 -; GFX10-NEXT: v_and_b32_e32 v25, s7, v3 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[12:13], v23, v[6:7] ; GFX10-NEXT: 
v_lshrrev_b64 v[14:15], 1, v[14:15] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 9e76467fd55e..52f78974d9aa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -9,12 +9,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s2, s2, 0x7f ; GFX6-NEXT: s_movk_i32 s3, 0x7f -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0x7f ; GFX6-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -40,12 +40,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: s_and_b32 s2, s2, 0x7f ; GFX8-NEXT: s_movk_i32 s3, 0x7f -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0x7f ; GFX8-NEXT: v_mul_lo_u32 v1, -7, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -71,12 +71,12 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9: ; %bb.0: ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_and_b32 s2, s2, 0x7f ; GFX9-NEXT: s_movk_i32 s3, 0x7f -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0x7f ; GFX9-NEXT: 
v_mul_lo_u32 v1, -7, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 @@ -101,10 +101,9 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-LABEL: s_fshr_i7: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 7 -; GFX10-NEXT: s_movk_i32 s3, 0x7f +; GFX10-NEXT: s_and_b32 s2, s2, 0x7f ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0x7f ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -121,8 +120,8 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7f, v1 ; GFX10-NEXT: v_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_lshlrev_b16 v1, v1, s0 ; GFX10-NEXT: v_or_b32_e32 v0, v1, v0 @@ -249,12 +248,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 0x7f -; GFX10-NEXT: v_sub_nc_u16 v4, 6, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v4, v4, v3 +; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0x7f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0x7f, v3 ; GFX10-NEXT: v_lshrrev_b16 v1, v2, v1 -; GFX10-NEXT: v_lshlrev_b16 v0, v4, v0 +; GFX10-NEXT: v_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i7 @llvm.fshr.i7(i7 %lhs, i7 %rhs, i7 %amt) @@ -530,10 +528,9 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: 
s_lshr_b32 s4, s2, 8 ; GFX6-NEXT: s_and_b32 s5, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s6, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s6 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s5 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s4, 7 @@ -543,8 +540,8 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX6-NEXT: s_lshl_b32 s3, s3, s4 ; GFX6-NEXT: s_lshr_b32 s1, s1, s2 ; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_and_b32 s1, s1, s6 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -552,28 +549,27 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX8-LABEL: s_fshr_v2i8: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 +; GFX8-NEXT: s_lshr_b32 s4, s1, 8 ; GFX8-NEXT: s_lshr_b32 s5, s2, 8 ; GFX8-NEXT: s_and_b32 s6, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_movk_i32 s2, 0xff -; GFX8-NEXT: s_lshr_b32 s4, s1, 8 -; GFX8-NEXT: s_and_b32 s1, s1, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX8-NEXT: s_andn2_b32 s2, 7, s5 +; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s4, s4, s2 +; GFX8-NEXT: s_lshl_b32 s2, s3, s2 +; GFX8-NEXT: s_and_b32 s3, s4, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s5, 7 -; GFX8-NEXT: s_andn2_b32 s5, 7, s5 -; GFX8-NEXT: s_lshl_b32 s3, s3, 1 -; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX8-NEXT: s_lshl_b32 s3, s3, s5 -; GFX8-NEXT: s_lshr_b32 s1, s4, s1 -; GFX8-NEXT: s_or_b32 s1, s3, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_bfe_u32 s3, s3, 
0x100000 +; GFX8-NEXT: s_lshr_b32 s1, s3, s1 +; GFX8-NEXT: s_or_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s2 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -581,28 +577,27 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX9-LABEL: s_fshr_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 +; GFX9-NEXT: s_lshr_b32 s4, s1, 8 ; GFX9-NEXT: s_lshr_b32 s5, s2, 8 ; GFX9-NEXT: s_and_b32 s6, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_movk_i32 s2, 0xff -; GFX9-NEXT: s_lshr_b32 s4, s1, 8 -; GFX9-NEXT: s_and_b32 s1, s1, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 +; GFX9-NEXT: s_andn2_b32 s2, 7, s5 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s4, s4, s2 +; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s5, 7 -; GFX9-NEXT: s_andn2_b32 s5, 7, s5 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX9-NEXT: s_lshl_b32 s3, s3, s5 -; GFX9-NEXT: s_lshr_b32 s1, s4, s1 -; GFX9-NEXT: s_or_b32 s1, s3, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX9-NEXT: s_lshr_b32 s1, s3, s1 +; GFX9-NEXT: s_or_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_bfe_u32 s2, 8, 0x100000 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -610,14 +605,13 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-LABEL: s_fshr_v2i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s4, s1, 8 -; GFX10-NEXT: s_movk_i32 s7, 0xff ; 
GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s5, s2, 8 ; GFX10-NEXT: s_and_b32 s6, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s7 -; GFX10-NEXT: s_and_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s5, 7 ; GFX10-NEXT: s_andn2_b32 s5, 7, s5 @@ -629,9 +623,9 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in ; GFX10-NEXT: s_lshr_b32 s1, s1, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s1, s2, s7 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_bfe_u32 s2, 8, 0x100000 -; GFX10-NEXT: s_and_b32 s0, s0, s7 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, s2 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog @@ -729,18 +723,18 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) { ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_and_b32_e32 v7, 7, v2 -; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 +; GFX10-NEXT: v_xor_b32_e32 v6, -1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 7, v3 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX10-NEXT: v_lshrrev_b16 v3, v3, v5 +; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_lshlrev_b16 v4, v6, v4 ; GFX10-NEXT: v_lshrrev_b16 v1, v7, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v2, v0 @@ -768,10 +762,9 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg 
%lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_lshr_b32 s9, s2, 24 ; GFX6-NEXT: s_and_b32 s10, s2, 7 ; GFX6-NEXT: s_andn2_b32 s2, 7, s2 -; GFX6-NEXT: s_movk_i32 s11, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s2, s1, s11 +; GFX6-NEXT: s_and_b32 s2, s1, 0xff ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s7, 7 @@ -792,24 +785,23 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX6-NEXT: s_and_b32 s3, s9, 7 ; GFX6-NEXT: s_andn2_b32 s4, 7, s9 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_and_b32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s2, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s4, s5, s4 ; GFX6-NEXT: s_lshr_b32 s3, s6, s3 -; GFX6-NEXT: s_and_b32 s0, s0, s11 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s1, s1, s11 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_or_b32 s3, s4, s3 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s11 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fshr_v4i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_movk_i32 s13, 0xff ; GFX8-NEXT: s_lshr_b32 s3, s0, 8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 @@ -822,14 +814,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_and_b32 s12, s2, 7 ; GFX8-NEXT: s_andn2_b32 s2, 7, s2 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX8-NEXT: s_andn2_b32 s2, 7, s9 ; GFX8-NEXT: s_lshl_b32 s3, s3, 1 ; GFX8-NEXT: s_lshr_b32 s1, s1, s12 ; GFX8-NEXT: s_lshl_b32 s2, s3, s2 -; GFX8-NEXT: s_and_b32 s3, s6, s13 +; GFX8-NEXT: 
s_and_b32 s3, s6, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_and_b32 s1, s9, 7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -837,33 +829,32 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX8-NEXT: s_andn2_b32 s3, 7, s10 ; GFX8-NEXT: s_lshl_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s3, s4, s3 -; GFX8-NEXT: s_and_b32 s4, s7, s13 +; GFX8-NEXT: s_and_b32 s4, s7, 0xff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_and_b32 s2, s10, 7 ; GFX8-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s4, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s13 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_or_b32 s2, s3, s2 ; GFX8-NEXT: s_and_b32 s3, s11, 7 ; GFX8-NEXT: s_andn2_b32 s4, 7, s11 ; GFX8-NEXT: s_lshl_b32 s5, s5, 1 -; GFX8-NEXT: s_and_b32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_lshl_b32 s4, s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s8, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s13 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s13 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_fshr_v4i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_movk_i32 s13, 0xff ; GFX9-NEXT: s_lshr_b32 s3, s0, 8 ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 @@ -876,14 +867,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_and_b32 s12, s2, 7 ; GFX9-NEXT: s_andn2_b32 s2, 7, s2 ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX9-NEXT: s_andn2_b32 s2, 7, s9 ; GFX9-NEXT: s_lshl_b32 s3, s3, 1 ; GFX9-NEXT: s_lshr_b32 s1, s1, s12 ; GFX9-NEXT: s_lshl_b32 
s2, s3, s2 -; GFX9-NEXT: s_and_b32 s3, s6, s13 +; GFX9-NEXT: s_and_b32 s3, s6, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s1, s9, 7 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 @@ -891,26 +882,26 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX9-NEXT: s_andn2_b32 s3, 7, s10 ; GFX9-NEXT: s_lshl_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s3, s4, s3 -; GFX9-NEXT: s_and_b32 s4, s7, s13 +; GFX9-NEXT: s_and_b32 s4, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_and_b32 s2, s10, 7 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshr_b32 s2, s4, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s13 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_or_b32 s2, s3, s2 ; GFX9-NEXT: s_and_b32 s3, s11, 7 ; GFX9-NEXT: s_andn2_b32 s4, 7, s11 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 -; GFX9-NEXT: s_and_b32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_lshl_b32 s4, s5, s4 ; GFX9-NEXT: s_lshr_b32 s3, s8, s3 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s2, s13 +; GFX9-NEXT: s_and_b32 s1, s2, 0xff ; GFX9-NEXT: s_or_b32 s3, s4, s3 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s1, s3, s13 +; GFX9-NEXT: s_and_b32 s1, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog @@ -918,7 +909,6 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-LABEL: s_fshr_v4i8: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_lshr_b32 s6, s1, 8 -; GFX10-NEXT: s_movk_i32 s13, 0xff ; GFX10-NEXT: s_lshr_b32 s3, s0, 8 ; GFX10-NEXT: s_lshr_b32 s4, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s0, 24 @@ -929,9 +919,9 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_and_b32 s12, s2, 7 ; GFX10-NEXT: s_andn2_b32 s2, 7, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 
0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s6, s6, s13 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 ; GFX10-NEXT: s_and_b32 s2, s9, 7 @@ -941,7 +931,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshr_b32 s1, s1, s12 ; GFX10-NEXT: s_lshl_b32 s3, s3, s9 ; GFX10-NEXT: s_lshr_b32 s2, s6, s2 -; GFX10-NEXT: s_and_b32 s6, s7, s13 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_or_b32 s1, s3, s2 ; GFX10-NEXT: s_and_b32 s2, s10, 7 @@ -956,14 +946,14 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in ; GFX10-NEXT: s_lshl_b32 s4, s5, s4 ; GFX10-NEXT: s_lshr_b32 s5, s8, s6 ; GFX10-NEXT: s_or_b32 s2, s3, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s13 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: s_or_b32 s3, s4, s5 -; GFX10-NEXT: s_and_b32 s0, s0, s13 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff ; GFX10-NEXT: s_lshl_b32 s1, s1, 8 -; GFX10-NEXT: s_and_b32 s2, s2, s13 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s3, s13 +; GFX10-NEXT: s_and_b32 s2, s3, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -1136,52 +1126,51 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v0 ; GFX10-NEXT: v_xor_b32_e32 v8, -1, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v6 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v5 ; GFX10-NEXT: v_lshlrev_b16 v3, 1, v3 ; 
GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 8, v1 -; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 -; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v8, 7, v8 ; GFX10-NEXT: v_mov_b32_e32 v13, 0xff ; GFX10-NEXT: v_xor_b32_e32 v14, -1, v12 -; GFX10-NEXT: v_lshlrev_b16 v3, v11, v3 -; GFX10-NEXT: v_xor_b32_e32 v11, -1, v10 -; GFX10-NEXT: s_movk_i32 s4, 0xff +; GFX10-NEXT: v_lshlrev_b16 v3, v10, v3 +; GFX10-NEXT: v_xor_b32_e32 v10, -1, v11 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v1 ; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 -; GFX10-NEXT: v_and_b32_e32 v8, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v6, 7, v6 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 -; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 +; GFX10-NEXT: v_and_b32_e32 v8, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 7, v5 +; GFX10-NEXT: v_and_b32_e32 v7, 0xff, v7 ; GFX10-NEXT: v_and_b32_e32 v11, 7, v11 +; GFX10-NEXT: v_and_b32_e32 v10, 7, v10 ; GFX10-NEXT: v_lshlrev_b16 v4, 1, v4 ; GFX10-NEXT: v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-NEXT: v_and_b32_e32 v13, 7, v14 -; GFX10-NEXT: v_lshlrev_b16 v5, 1, v5 +; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 ; GFX10-NEXT: v_and_b32_e32 v12, 7, v12 ; GFX10-NEXT: v_and_b32_e32 v2, 7, v2 -; GFX10-NEXT: v_lshrrev_b16 v6, v6, v7 -; GFX10-NEXT: v_lshlrev_b16 v4, v11, v4 -; GFX10-NEXT: v_lshrrev_b16 v1, v10, v1 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v5 +; GFX10-NEXT: v_lshrrev_b16 v5, v5, v7 +; GFX10-NEXT: v_lshlrev_b16 v4, v10, v4 +; GFX10-NEXT: v_lshrrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v6, v13, v6 ; GFX10-NEXT: v_lshrrev_b16 v7, v12, v9 ; GFX10-NEXT: v_lshrrev_b16 v2, v2, v8 -; GFX10-NEXT: v_or_b32_e32 v3, v3, v6 -; GFX10-NEXT: v_mov_b32_e32 v6, 8 +; GFX10-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX10-NEXT: v_mov_b32_e32 v5, 8 ; 
GFX10-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v7 +; GFX10-NEXT: v_or_b32_e32 v4, v6, v7 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v4 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 @@ -1200,12 +1189,12 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX6-NEXT: s_mov_b32 s3, 0xffffff -; GFX6-NEXT: s_and_b32 s2, s2, s3 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 -; GFX6-NEXT: s_and_b32 s1, s1, s3 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1232,12 +1221,12 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX8-NEXT: s_mov_b32 s3, 0xffffff -; GFX8-NEXT: s_and_b32 s2, s2, s3 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 -; GFX8-NEXT: s_and_b32 s1, s1, s3 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX8-NEXT: 
v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 @@ -1264,11 +1253,11 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffffff ; GFX9-NEXT: s_mov_b32 s3, 0xffffff -; GFX9-NEXT: s_and_b32 s2, s2, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_and_b32 s1, s1, s3 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 @@ -1293,10 +1282,9 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-LABEL: s_fshr_i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 -; GFX10-NEXT: s_mov_b32 s3, 0xffffff +; GFX10-NEXT: s_and_b32 s2, s2, 0xffffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: s_and_b32 s1, s1, s3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1313,8 +1301,8 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 -; GFX10-NEXT: v_and_b32_e32 v0, s3, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s3, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1425,6 +1413,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, 24 ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v1, 
0xffffff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -1432,9 +1421,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_mul_lo_u32 v4, 0xffffffe8, v3 ; GFX10-NEXT: v_mul_hi_u32 v4, v3, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v4 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffffff ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v4 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 @@ -1444,8 +1431,8 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v4 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, v2, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1458,38 +1445,36 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6: ; %bb.0: ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_movk_i32 s9, 0xff -; GFX6-NEXT: s_mov_b32 s11, 0x80008 ; GFX6-NEXT: s_lshr_b32 s6, s0, 16 -; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_lshr_b32 s7, s0, 24 ; GFX6-NEXT: s_lshr_b32 s8, s1, 8 -; GFX6-NEXT: s_and_b32 s10, s0, s9 -; GFX6-NEXT: s_bfe_u32 s0, s0, s11 -; GFX6-NEXT: s_and_b32 s1, s1, s9 +; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: s_and_b32 s10, s0, 0xff +; GFX6-NEXT: s_bfe_u32 s0, s0, 0x80008 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX6-NEXT: 
s_or_b32 s0, s10, s0 ; GFX6-NEXT: s_or_b32 s1, s7, s1 -; GFX6-NEXT: s_and_b32 s7, s8, s9 +; GFX6-NEXT: s_and_b32 s7, s8, 0xff ; GFX6-NEXT: s_lshr_b32 s8, s2, 16 ; GFX6-NEXT: s_lshr_b32 s10, s2, 24 -; GFX6-NEXT: s_and_b32 s13, s2, s9 -; GFX6-NEXT: s_bfe_u32 s2, s2, s11 +; GFX6-NEXT: s_and_b32 s12, s2, 0xff +; GFX6-NEXT: s_bfe_u32 s2, s2, 0x80008 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 -; GFX6-NEXT: s_or_b32 s2, s13, s2 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff +; GFX6-NEXT: s_or_b32 s2, s12, s2 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshr_b32 s12, s3, 8 +; GFX6-NEXT: s_lshr_b32 s11, s3, 8 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_and_b32 s3, s3, s9 +; GFX6-NEXT: s_and_b32 s3, s3, 0xff ; GFX6-NEXT: s_or_b32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 8 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_or_b32 s3, s10, s3 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1498,13 +1483,13 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_or_b32 s3, s3, s8 ; GFX6-NEXT: s_lshr_b32 s8, s4, 16 ; GFX6-NEXT: s_lshr_b32 s10, s4, 24 -; GFX6-NEXT: s_and_b32 s13, s4, s9 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_and_b32 s12, s4, 0xff +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x80008 ; GFX6-NEXT: s_lshl_b32 s4, s4, 8 -; GFX6-NEXT: s_and_b32 s8, s8, s9 +; GFX6-NEXT: s_and_b32 s8, s8, 0xff ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 -; GFX6-NEXT: s_or_b32 s4, s13, s4 +; GFX6-NEXT: s_or_b32 s4, s12, s4 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 @@ -1513,16 +1498,16 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; 
GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX6-NEXT: s_lshr_b32 s12, s5, 8 +; GFX6-NEXT: s_lshr_b32 s11, s5, 8 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX6-NEXT: s_and_b32 s5, s5, s9 +; GFX6-NEXT: s_and_b32 s5, s5, 0xff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX6-NEXT: s_lshl_b32 s5, s5, 8 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX6-NEXT: s_and_b32 s8, s12, s9 +; GFX6-NEXT: s_and_b32 s8, s11, 0xff ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: s_or_b32 s5, s10, s5 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1534,7 +1519,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: s_and_b32 s6, s6, s9 +; GFX6-NEXT: s_and_b32 s6, s6, 0xff ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1564,6 +1549,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: v_and_b32_e32 v2, v2, v4 ; GFX6-NEXT: v_and_b32_e32 v1, v1, v4 +; GFX6-NEXT: s_movk_i32 s9, 0xff ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s3, v1 ; GFX6-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -1589,55 +1575,54 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8: ; %bb.0: ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s1, 8 -; GFX8-NEXT: s_bfe_u32 s11, 8, 0x100000 -; GFX8-NEXT: s_and_b32 s1, s1, s10 +; GFX8-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_lshr_b32 s6, s0, 8 ; GFX8-NEXT: s_lshr_b32 s8, s0, 24 -; GFX8-NEXT: s_lshl_b32 s1, s1, s11 +; GFX8-NEXT: s_lshl_b32 s1, s1, s10 
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX8-NEXT: s_and_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s6, s6, 0xff ; GFX8-NEXT: s_or_b32 s1, s8, s1 ; GFX8-NEXT: s_lshr_b32 s8, s2, 8 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s7, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s10 -; GFX8-NEXT: s_lshl_b32 s6, s6, s11 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_or_b32 s0, s0, s6 -; GFX8-NEXT: s_and_b32 s6, s7, s10 -; GFX8-NEXT: s_and_b32 s7, s9, s10 +; GFX8-NEXT: s_and_b32 s6, s7, 0xff +; GFX8-NEXT: s_and_b32 s7, s9, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s2, 16 -; GFX8-NEXT: s_lshr_b32 s12, s2, 24 -; GFX8-NEXT: s_and_b32 s2, s2, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s2, 24 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v1, 0xffffffe8 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_mul_lo_u32 v2, v1, v0 -; GFX8-NEXT: s_lshr_b32 s13, s3, 8 +; GFX8-NEXT: s_lshr_b32 s12, s3, 8 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s3, s3, 0xff ; GFX8-NEXT: s_or_b32 s2, s2, s8 -; GFX8-NEXT: s_lshl_b32 s3, s3, s11 -; GFX8-NEXT: s_and_b32 s8, s13, s10 -; GFX8-NEXT: s_or_b32 s3, s12, s3 +; GFX8-NEXT: s_lshl_b32 s3, s3, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff +; GFX8-NEXT: s_or_b32 s3, s11, s3 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s3, s3, s8 ; GFX8-NEXT: s_lshr_b32 s8, s4, 8 -; GFX8-NEXT: s_and_b32 s8, s8, s10 +; GFX8-NEXT: s_and_b32 s8, s8, 0xff ; GFX8-NEXT: s_lshr_b32 s9, s4, 16 -; GFX8-NEXT: s_lshr_b32 s12, s4, 24 -; GFX8-NEXT: s_and_b32 
s4, s4, s10 -; GFX8-NEXT: s_lshl_b32 s8, s8, s11 +; GFX8-NEXT: s_lshr_b32 s11, s4, 24 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff +; GFX8-NEXT: s_lshl_b32 s8, s8, s10 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: s_and_b32 s8, s9, s10 +; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 @@ -1648,18 +1633,18 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX8-NEXT: s_lshr_b32 s13, s5, 8 +; GFX8-NEXT: s_lshr_b32 s12, s5, 8 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: s_and_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX8-NEXT: s_lshl_b32 s5, s5, s11 +; GFX8-NEXT: s_lshl_b32 s5, s5, s10 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX8-NEXT: s_and_b32 s8, s13, s10 +; GFX8-NEXT: s_and_b32 s8, s12, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: s_or_b32 s5, s12, s5 +; GFX8-NEXT: s_or_b32 s5, s11, s5 ; GFX8-NEXT: s_bfe_u32 s8, s8, 0x100000 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v0 ; GFX8-NEXT: s_bfe_u32 s5, s5, 0x100000 @@ -1707,7 +1692,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_or_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX8-NEXT: v_and_b32_e32 v3, s10, v1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: 
v_or_b32_e32 v0, v0, v3 @@ -1721,45 +1706,44 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffffe8 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s1, 8 +; GFX9-NEXT: s_bfe_u32 s12, 8, 0x100000 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: s_bfe_u32 s13, 8, 0x100000 -; GFX9-NEXT: s_and_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s1, s1, 0xff ; GFX9-NEXT: s_lshr_b32 s7, s0, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 ; GFX9-NEXT: s_lshr_b32 s10, s0, 24 -; GFX9-NEXT: s_lshl_b32 s1, s1, s13 -; GFX9-NEXT: s_and_b32 s7, s7, s12 -; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v0 +; GFX9-NEXT: s_lshl_b32 s1, s1, s12 +; GFX9-NEXT: s_and_b32 s7, s7, 0xff ; GFX9-NEXT: s_or_b32 s1, s10, s1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_lshr_b32 s10, s2, 8 ; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s12 -; GFX9-NEXT: s_lshl_b32 s7, s7, s13 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s0, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s9, s12 -; GFX9-NEXT: s_and_b32 s9, s11, s12 +; GFX9-NEXT: s_and_b32 s7, s9, 0xff +; GFX9-NEXT: s_and_b32 s9, s11, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s2, 16 -; GFX9-NEXT: s_lshr_b32 s14, s2, 24 -; GFX9-NEXT: s_and_b32 s2, s2, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s2, 24 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, 24 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: s_lshr_b32 
s15, s3, 8 +; GFX9-NEXT: s_lshr_b32 s14, s3, 8 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s3, s3, 0xff ; GFX9-NEXT: s_or_b32 s2, s2, s10 -; GFX9-NEXT: s_lshl_b32 s3, s3, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s3, s14, s3 +; GFX9-NEXT: s_lshl_b32 s3, s3, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s3, s13, s3 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1767,25 +1751,25 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s3, s3, s10 ; GFX9-NEXT: s_lshr_b32 s10, s4, 8 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s10, s10, s12 +; GFX9-NEXT: s_and_b32 s10, s10, 0xff ; GFX9-NEXT: s_lshr_b32 s11, s4, 16 -; GFX9-NEXT: s_lshr_b32 s14, s4, 24 -; GFX9-NEXT: s_and_b32 s4, s4, s12 -; GFX9-NEXT: s_lshl_b32 s10, s10, s13 +; GFX9-NEXT: s_lshr_b32 s13, s4, 24 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, s12 ; GFX9-NEXT: s_or_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b32 s10, s11, s12 +; GFX9-NEXT: s_and_b32 s10, s11, 0xff ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, v2 ; GFX9-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 ; GFX9-NEXT: s_or_b32 s4, s4, s10 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX9-NEXT: s_lshr_b32 s15, s5, 8 -; GFX9-NEXT: s_and_b32 s5, s5, s12 +; GFX9-NEXT: s_lshr_b32 s14, s5, 8 +; GFX9-NEXT: s_and_b32 s5, s5, 0xff ; GFX9-NEXT: v_mul_hi_u32 v1, v2, v1 -; GFX9-NEXT: s_lshl_b32 s5, s5, s13 -; GFX9-NEXT: s_and_b32 s10, s15, s12 -; GFX9-NEXT: s_or_b32 s5, s14, s5 +; GFX9-NEXT: s_lshl_b32 s5, s5, s12 +; GFX9-NEXT: s_and_b32 s10, s14, 0xff +; GFX9-NEXT: s_or_b32 s5, s13, s5 ; GFX9-NEXT: s_bfe_u32 s10, s10, 0x100000 ; GFX9-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX9-NEXT: s_lshl_b32 s10, s10, 16 @@ -1831,10 
+1815,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX9-NEXT: s_mov_b32 s6, 8 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v3, v1 +; GFX9-NEXT: s_movk_i32 s0, 0xff ; GFX9-NEXT: s_mov_b32 s8, 16 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_b32_e32 v3, s12, v1 -; GFX9-NEXT: v_and_or_b32 v2, v0, s12, v2 +; GFX9-NEXT: v_and_b32_e32 v3, s0, v1 +; GFX9-NEXT: v_and_or_b32 v2, v0, s0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_or3_b32 v0, v2, v0, v3 @@ -1849,122 +1834,119 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10: ; %bb.0: ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, 24 -; GFX10-NEXT: s_movk_i32 s9, 0xff -; GFX10-NEXT: s_lshr_b32 s12, s4, 8 +; GFX10-NEXT: s_lshr_b32 s9, s1, 8 ; GFX10-NEXT: s_bfe_u32 s10, 8, 0x100000 +; GFX10-NEXT: s_and_b32 s1, s1, 0xff ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX10-NEXT: s_lshr_b32 s13, s4, 16 -; GFX10-NEXT: s_and_b32 s12, s12, s9 -; GFX10-NEXT: s_lshr_b32 s14, s4, 24 -; GFX10-NEXT: s_and_b32 s4, s4, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, s10 -; GFX10-NEXT: s_and_b32 s13, s13, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_bfe_u32 s12, s13, 0x100000 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 -; GFX10-NEXT: s_lshr_b32 s15, s5, 8 -; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: s_and_b32 s5, s5, s9 -; GFX10-NEXT: s_or_b32 s4, s4, s12 -; GFX10-NEXT: s_lshl_b32 s5, s5, s10 -; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 -; GFX10-NEXT: v_mul_lo_u32 
v3, 0xffffffe8, v1 -; GFX10-NEXT: s_and_b32 s12, s15, s9 -; GFX10-NEXT: s_or_b32 s5, s14, s5 -; GFX10-NEXT: s_bfe_u32 s12, s12, 0x100000 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX10-NEXT: s_lshl_b32 s12, s12, 16 -; GFX10-NEXT: s_lshr_b32 s11, s1, 8 -; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX10-NEXT: s_or_b32 s5, s5, s12 -; GFX10-NEXT: s_and_b32 s1, s1, s9 ; GFX10-NEXT: s_lshr_b32 s6, s0, 8 ; GFX10-NEXT: s_lshr_b32 s8, s0, 24 ; GFX10-NEXT: s_lshl_b32 s1, s1, s10 -; GFX10-NEXT: s_and_b32 s6, s6, s9 +; GFX10-NEXT: s_and_b32 s6, s6, 0xff ; GFX10-NEXT: s_or_b32 s1, s8, s1 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 -; GFX10-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX10-NEXT: s_lshr_b32 s8, s2, 8 +; GFX10-NEXT: s_lshr_b32 s8, s4, 8 ; GFX10-NEXT: s_lshr_b32 s7, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s9 -; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s0, s0, 0xff +; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: s_lshl_b32 s6, s6, s10 -; GFX10-NEXT: s_and_b32 s8, s8, s9 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: s_or_b32 s0, s0, s6 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2 -; GFX10-NEXT: s_and_b32 s6, s7, s9 -; GFX10-NEXT: s_and_b32 s7, s11, s9 -; GFX10-NEXT: s_lshr_b32 s11, s2, 16 -; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX10-NEXT: s_lshr_b32 s13, s2, 24 -; GFX10-NEXT: s_and_b32 s2, s2, s9 +; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX10-NEXT: s_and_b32 s6, s7, 0xff +; GFX10-NEXT: s_and_b32 s7, s9, 0xff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 +; GFX10-NEXT: v_mul_lo_u32 v2, 0xffffffe8, v0 +; GFX10-NEXT: v_mul_lo_u32 v3, 0xffffffe8, v1 +; GFX10-NEXT: s_lshr_b32 s11, s4, 24 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s8, s8, s10 +; GFX10-NEXT: s_lshr_b32 s12, s5, 8 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s8, s9, 0xff +; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX10-NEXT: 
v_mul_hi_u32 v3, v1, v3 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX10-NEXT: s_and_b32 s5, s5, 0xff +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshl_b32 s5, s5, s10 +; GFX10-NEXT: s_or_b32 s4, s4, s8 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 +; GFX10-NEXT: s_and_b32 s8, s12, 0xff +; GFX10-NEXT: s_or_b32 s5, s11, s5 +; GFX10-NEXT: s_bfe_u32 s8, s8, 0x100000 +; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 +; GFX10-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: s_lshl_b32 s8, s8, 16 +; GFX10-NEXT: s_lshr_b32 s9, s2, 8 +; GFX10-NEXT: s_or_b32 s5, s5, s8 +; GFX10-NEXT: s_lshr_b32 s8, s2, 16 +; GFX10-NEXT: v_mul_hi_u32 v1, s5, v1 +; GFX10-NEXT: s_and_b32 s9, s9, 0xff +; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX10-NEXT: s_lshr_b32 s11, s2, 24 ; GFX10-NEXT: s_lshr_b32 s12, s3, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_and_b32 s8, s11, s9 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: s_lshl_b32 s9, s9, s10 +; GFX10-NEXT: s_and_b32 s8, s8, 0xff ; GFX10-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX10-NEXT: s_and_b32 s3, s3, 0xff +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s4, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s9 ; GFX10-NEXT: s_bfe_u32 s4, s8, 0x100000 -; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_and_b32 s3, s3, s9 -; GFX10-NEXT: s_or_b32 s2, s2, s4 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: s_lshl_b32 s3, s3, s10 -; GFX10-NEXT: s_and_b32 s5, s12, s9 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x100000 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v1, s5, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 +; GFX10-NEXT: s_and_b32 s5, s12, 0xff +; GFX10-NEXT: s_lshl_b32 s4, s4, 
16 +; GFX10-NEXT: s_or_b32 s3, s11, s3 +; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 -; GFX10-NEXT: s_or_b32 s3, s13, s3 -; GFX10-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX10-NEXT: s_bfe_u32 s3, s3, 0x100000 +; GFX10-NEXT: s_lshl_b32 s5, s5, 16 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX10-NEXT: s_lshl_b32 s5, s5, 16 ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX10-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: s_or_b32 s3, s3, s5 ; GFX10-NEXT: s_bfe_u32 s1, s1, 0x100000 ; GFX10-NEXT: s_bfe_u32 s7, s7, 0x100000 +; GFX10-NEXT: s_lshl_b32 s4, s6, 17 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: s_lshl_b32 s5, s6, 17 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, 23, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v1, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 -; GFX10-NEXT: s_lshl_b32 s2, s7, 17 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 -; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, 23, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_lshrrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: s_or_b32 s0, s4, s0 +; GFX10-NEXT: s_lshl_b32 s2, s7, 17 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, 
v2 +; GFX10-NEXT: v_lshrrev_b32_e64 v1, v1, s3 ; GFX10-NEXT: v_lshl_or_b32 v0, s0, v3, v0 ; GFX10-NEXT: s_or_b32 s0, s2, s1 ; GFX10-NEXT: v_lshl_or_b32 v1, s0, v2, v1 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: s_mov_b32 s0, 16 -; GFX10-NEXT: v_and_b32_e32 v3, s9, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX10-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX10-NEXT: v_and_or_b32 v2, v0, s9, v2 +; GFX10-NEXT: v_and_or_b32 v2, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 8, v4 @@ -2156,14 +2138,13 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v7, 24 -; GFX10-NEXT: v_mov_b32_e32 v10, 0xffffff ; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v7, v7 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 -; GFX10-NEXT: v_and_b32_e32 v2, v2, v10 -; GFX10-NEXT: v_and_b32_e32 v3, v3, v10 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX10-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7 @@ -2194,12 +2175,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v4, v4, v10 -; GFX10-NEXT: v_and_b32_e32 v6, v6, v10 +; 
GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v5, v10 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX10-NEXT: v_and_b32_e32 v4, v7, v10 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, v6, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, v4, v3 @@ -3032,25 +3013,24 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_or_b32 s4, s5, s4 ; GFX6-NEXT: s_bfe_u32 s5, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s6, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s0, s0, s5 -; GFX6-NEXT: s_bfe_u32 s7, s2, s6 -; GFX6-NEXT: s_bfe_u32 s8, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s6, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 -; GFX6-NEXT: s_bfe_u32 s5, s3, s6 -; GFX6-NEXT: s_lshr_b32 s7, s7, s8 -; GFX6-NEXT: s_lshr_b32 s5, s5, s8 +; GFX6-NEXT: s_bfe_u32 s5, s3, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s7 ; GFX6-NEXT: s_xor_b32 s4, s4, -1 -; GFX6-NEXT: s_or_b32 s0, s0, s7 +; GFX6-NEXT: s_or_b32 s0, s0, s6 ; GFX6-NEXT: s_or_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: s_lshr_b32 s5, s4, 16 -; GFX6-NEXT: s_and_b32 s7, s4, 15 +; GFX6-NEXT: s_and_b32 s6, s4, 15 ; GFX6-NEXT: s_andn2_b32 s4, 15, s4 -; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s2, s2, s6 +; GFX6-NEXT: s_bfe_u32 s6, s6, 0x100000 +; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s7 +; GFX6-NEXT: s_lshl_b32 s0, s0, s6 ; GFX6-NEXT: s_lshr_b32 s2, s2, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s2 ; GFX6-NEXT: s_and_b32 s2, s5, 15 @@ -3058,7 +3038,7 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; GFX6-NEXT: s_andn2_b32 s4, 15, s5 ; 
GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s2 -; GFX6-NEXT: s_bfe_u32 s2, s3, s6 +; GFX6-NEXT: s_bfe_u32 s2, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s3, s4, 0x100000 ; GFX6-NEXT: s_lshr_b32 s2, s2, s3 ; GFX6-NEXT: s_or_b32 s1, s1, s2 @@ -3111,45 +3091,43 @@ define amdgpu_ps i32 @s_fshr_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs, < ; ; GFX9-LABEL: s_fshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s3, 0xf000f -; GFX9-NEXT: s_and_b32 s4, s2, s3 -; GFX9-NEXT: s_andn2_b32 s2, s3, s2 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s3, s3, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 1 +; GFX9-NEXT: s_and_b32 s3, s2, 0xf000f +; GFX9-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, s5 +; GFX9-NEXT: s_lshl_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 ; GFX9-NEXT: s_and_b32 s1, s1, 0xffff -; GFX9-NEXT: s_lshr_b32 s3, s4, 16 -; GFX9-NEXT: s_lshr_b32 s1, s1, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 +; GFX9-NEXT: s_lshr_b32 s2, s2, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_fshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-NEXT: s_mov_b32 s3, 0xf000f -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s5, s2, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_andn2_b32 s2, s3, s2 ; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s3, s3, 1 
+; GFX10-NEXT: s_and_b32 s4, s2, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX10-NEXT: s_andn2_b32 s2, 0xf000f, s2 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s2, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s3, s5 ; GFX10-NEXT: s_lshr_b32 s3, s1, 16 ; GFX10-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s5, 16 -; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s3, s3, s4 +; GFX10-NEXT: s_lshr_b32 s5, s4, 16 +; GFX10-NEXT: s_lshr_b32 s1, s1, s4 +; GFX10-NEXT: s_lshr_b32 s3, s3, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 @@ -3252,10 +3230,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %lhs, <2 x i16> %rhs, <2 x i16> %amt) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -3326,35 +3303,34 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX6-NEXT: s_mov_b32 s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, 1, 0x100000 -; GFX6-NEXT: s_bfe_u32 s6, s2, s5 -; GFX6-NEXT: s_bfe_u32 s7, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s5, s2, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_lshr_b32 s6, s6, s7 +; GFX6-NEXT: s_lshr_b32 s5, s5, s6 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; 
GFX6-NEXT: v_and_b32_e32 v2, 15, v0 ; GFX6-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX6-NEXT: s_or_b32 s0, s0, s6 +; GFX6-NEXT: s_or_b32 s0, s0, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 15, v0 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s0, v2 -; GFX6-NEXT: s_bfe_u32 s0, s2, s5 +; GFX6-NEXT: s_bfe_u32 s0, s2, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v0, v0, 0, 16 ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s3, s5 +; GFX6-NEXT: s_bfe_u32 s4, s3, 0xf0001 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 15, v1 ; GFX6-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX6-NEXT: s_lshr_b32 s4, s4, s7 +; GFX6-NEXT: s_lshr_b32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 1 ; GFX6-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX6-NEXT: s_or_b32 s1, s1, s4 ; GFX6-NEXT: v_bfe_u32 v2, v2, 0, 16 -; GFX6-NEXT: s_bfe_u32 s0, s3, s5 +; GFX6-NEXT: s_bfe_u32 s0, s3, 0xf0001 ; GFX6-NEXT: v_bfe_u32 v1, v1, 0, 16 ; GFX6-NEXT: v_lshl_b32_e32 v2, s1, v2 ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 @@ -3421,12 +3397,11 @@ define amdgpu_ps float @v_fshr_v2i16_ssv(<2 x i16> inreg %lhs, <2 x i16> inreg % ; GFX10-LABEL: v_fshr_v2i16_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: v_and_b32_e32 v0, 0xf000f, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX10-NEXT: s_lshl_b32 s2, s3, 1 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xf000f, v1 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, v0, s1 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 @@ -3514,36 +3489,34 @@ define amdgpu_ps float @v_fshr_v2i16_svs(<2 x i16> inreg %lhs, <2 x i16> %rhs, < ; ; GFX9-LABEL: v_fshr_v2i16_svs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 
s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX9-NEXT: s_lshl_b32 s2, s2, 1 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s0, 16 +; GFX9-NEXT: s_lshl_b32 s3, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 +; GFX9-NEXT: s_lshr_b32 s3, s0, 16 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s4 +; GFX9-NEXT: s_lshl_b32 s1, s3, s4 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s3, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, s2, v0 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_svs: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_mov_b32 s2, 0xf000f -; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 -; GFX10-NEXT: s_lshl_b32 s3, s3, 1 -; GFX10-NEXT: s_and_b32 s4, s1, s2 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 ; GFX10-NEXT: s_lshr_b32 s2, s0, 16 -; GFX10-NEXT: s_lshr_b32 s3, s1, 16 -; GFX10-NEXT: v_pk_lshrrev_b16 v0, s4, v0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s2, s2, 1 +; GFX10-NEXT: s_and_b32 s3, s1, 0xf000f +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: v_pk_lshrrev_b16 v0, s3, v0 ; GFX10-NEXT: s_lshl_b32 s0, s0, s1 -; GFX10-NEXT: s_lshl_b32 s1, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s2, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3559,25 +3532,24 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_or_b32 s2, s3, s2 ; 
GFX6-NEXT: s_bfe_u32 s3, 1, 0x100000 -; GFX6-NEXT: s_mov_b32 s4, 0xf0001 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s3, v0 -; GFX6-NEXT: s_bfe_u32 s5, s0, s4 -; GFX6-NEXT: s_bfe_u32 s6, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s0, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s5, 14, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s3, v1 -; GFX6-NEXT: s_bfe_u32 s3, s1, s4 -; GFX6-NEXT: s_lshr_b32 s5, s5, s6 -; GFX6-NEXT: s_lshr_b32 s3, s3, s6 +; GFX6-NEXT: s_bfe_u32 s3, s1, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s4, s4, s5 +; GFX6-NEXT: s_lshr_b32 s3, s3, s5 ; GFX6-NEXT: s_xor_b32 s2, s2, -1 -; GFX6-NEXT: v_or_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_or_b32_e32 v0, s4, v0 ; GFX6-NEXT: v_or_b32_e32 v1, s3, v1 ; GFX6-NEXT: s_lshl_b32 s0, s0, 1 ; GFX6-NEXT: s_lshr_b32 s3, s2, 16 -; GFX6-NEXT: s_and_b32 s5, s2, 15 +; GFX6-NEXT: s_and_b32 s4, s2, 15 ; GFX6-NEXT: s_andn2_b32 s2, 15, s2 -; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 -; GFX6-NEXT: s_bfe_u32 s0, s0, s4 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 +; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0x100000 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, s5, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, s4, v0 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX6-NEXT: s_and_b32 s0, s3, 15 @@ -3585,7 +3557,7 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; GFX6-NEXT: s_andn2_b32 s2, 15, s3 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 -; GFX6-NEXT: s_bfe_u32 s0, s1, s4 +; GFX6-NEXT: s_bfe_u32 s0, s1, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s1, s2, 0x100000 ; GFX6-NEXT: s_lshr_b32 s0, s0, s1 ; GFX6-NEXT: v_or_b32_e32 v1, s0, v1 @@ -3635,32 +3607,30 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < ; ; GFX9-LABEL: v_fshr_v2i16_vss: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xf000f -; GFX9-NEXT: s_and_b32 s3, s1, s2 -; GFX9-NEXT: s_andn2_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX9-NEXT: s_andn2_b32 s1, 0xf000f, s1 ; 
GFX9-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s1, v0 ; GFX9-NEXT: s_lshr_b32 s1, s0, 16 ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s2, s3, 16 -; GFX9-NEXT: s_lshr_b32 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s1, s1, s2 +; GFX9-NEXT: s_lshr_b32 s3, s2, 16 +; GFX9-NEXT: s_lshr_b32 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: v_fshr_v2i16_vss: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: s_and_b32 s3, s1, s2 -; GFX10-NEXT: s_andn2_b32 s1, s2, s1 -; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_and_b32 s2, s1, 0xf000f +; GFX10-NEXT: s_andn2_b32 s1, 0xf000f, s1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_and_b32 s0, s0, 0xffff -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s1, v0 -; GFX10-NEXT: s_lshr_b32 s0, s0, s3 -; GFX10-NEXT: s_lshr_b32 s1, s2, s4 +; GFX10-NEXT: s_lshr_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s1, s3, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s1 ; GFX10-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX10-NEXT: ; return to shader part epilog @@ -3686,41 +3656,39 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, < define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs, <4 x i16> inreg %amt) { ; GFX6-LABEL: s_fshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s12, 0xffff ; GFX6-NEXT: s_lshl_b32 s9, s9, 16 -; GFX6-NEXT: s_and_b32 s8, s8, s12 +; GFX6-NEXT: s_and_b32 s8, s8, 0xffff ; GFX6-NEXT: s_or_b32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s9, s11, 16 -; GFX6-NEXT: s_and_b32 s10, s10, s12 -; GFX6-NEXT: s_mov_b32 s11, 0xf0001 +; GFX6-NEXT: s_and_b32 s10, s10, 0xffff ; GFX6-NEXT: s_or_b32 s9, s9, s10 ; GFX6-NEXT: s_bfe_u32 s10, 1, 0x100000 -; GFX6-NEXT: 
s_bfe_u32 s12, s4, s11 -; GFX6-NEXT: s_bfe_u32 s13, 14, 0x100000 +; GFX6-NEXT: s_bfe_u32 s11, s4, 0xf0001 +; GFX6-NEXT: s_bfe_u32 s12, 14, 0x100000 ; GFX6-NEXT: s_lshl_b32 s0, s0, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 -; GFX6-NEXT: s_or_b32 s0, s0, s12 -; GFX6-NEXT: s_bfe_u32 s12, s5, s11 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 +; GFX6-NEXT: s_or_b32 s0, s0, s11 +; GFX6-NEXT: s_bfe_u32 s11, s5, 0xf0001 ; GFX6-NEXT: s_lshl_b32 s1, s1, s10 -; GFX6-NEXT: s_lshr_b32 s12, s12, s13 +; GFX6-NEXT: s_lshr_b32 s11, s11, s12 ; GFX6-NEXT: s_xor_b32 s8, s8, -1 -; GFX6-NEXT: s_or_b32 s1, s1, s12 +; GFX6-NEXT: s_or_b32 s1, s1, s11 ; GFX6-NEXT: s_lshl_b32 s4, s4, 1 -; GFX6-NEXT: s_lshr_b32 s12, s8, 16 -; GFX6-NEXT: s_and_b32 s14, s8, 15 +; GFX6-NEXT: s_lshr_b32 s11, s8, 16 +; GFX6-NEXT: s_and_b32 s13, s8, 15 ; GFX6-NEXT: s_andn2_b32 s8, 15, s8 -; GFX6-NEXT: s_bfe_u32 s14, s14, 0x100000 -; GFX6-NEXT: s_bfe_u32 s4, s4, s11 +; GFX6-NEXT: s_bfe_u32 s13, s13, 0x100000 +; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s8, s8, 0x100000 -; GFX6-NEXT: s_lshl_b32 s0, s0, s14 +; GFX6-NEXT: s_lshl_b32 s0, s0, s13 ; GFX6-NEXT: s_lshr_b32 s4, s4, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s4, s12, 15 +; GFX6-NEXT: s_and_b32 s4, s11, 15 ; GFX6-NEXT: s_lshl_b32 s5, s5, 1 -; GFX6-NEXT: s_andn2_b32 s8, 15, s12 +; GFX6-NEXT: s_andn2_b32 s8, 15, s11 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s4 -; GFX6-NEXT: s_bfe_u32 s4, s5, s11 +; GFX6-NEXT: s_bfe_u32 s4, s5, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s8, 0x100000 ; GFX6-NEXT: s_lshr_b32 s4, s4, s5 ; GFX6-NEXT: s_or_b32 s1, s1, s4 @@ -3729,12 +3697,12 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, s10 -; GFX6-NEXT: s_bfe_u32 s2, s6, s11 -; GFX6-NEXT: s_lshr_b32 s2, s2, s13 +; GFX6-NEXT: s_bfe_u32 s2, s6, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s2, s2, s12 ; 
GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, s10 -; GFX6-NEXT: s_bfe_u32 s3, s7, s11 -; GFX6-NEXT: s_lshr_b32 s3, s3, s13 +; GFX6-NEXT: s_bfe_u32 s3, s7, 0xf0001 +; GFX6-NEXT: s_lshr_b32 s3, s3, s12 ; GFX6-NEXT: s_xor_b32 s5, s9, -1 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s6, 1 @@ -3743,7 +3711,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_and_b32 s7, s5, 15 ; GFX6-NEXT: s_andn2_b32 s5, 15, s5 ; GFX6-NEXT: s_bfe_u32 s7, s7, 0x100000 -; GFX6-NEXT: s_bfe_u32 s3, s3, s11 +; GFX6-NEXT: s_bfe_u32 s3, s3, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s5, s5, 0x100000 ; GFX6-NEXT: s_lshl_b32 s1, s1, s7 ; GFX6-NEXT: s_lshr_b32 s3, s3, s5 @@ -3752,7 +3720,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX6-NEXT: s_andn2_b32 s5, 15, s6 ; GFX6-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX6-NEXT: s_lshl_b32 s2, s2, s3 -; GFX6-NEXT: s_bfe_u32 s3, s4, s11 +; GFX6-NEXT: s_bfe_u32 s3, s4, 0xf0001 ; GFX6-NEXT: s_bfe_u32 s4, s5, 0x100000 ; GFX6-NEXT: s_lshr_b32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s2, s2, s3 @@ -3840,31 +3808,28 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX9-LABEL: s_fshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s8, 0x10001 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_mov_b32 s6, 0xf000f -; GFX9-NEXT: s_lshl_b32 s0, s0, s8 -; GFX9-NEXT: s_lshl_b32 s9, s9, 1 -; GFX9-NEXT: s_and_b32 s7, s4, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s9 -; GFX9-NEXT: s_lshr_b32 s9, s0, 16 -; GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX9-NEXT: s_lshl_b32 s7, s7, 1 +; GFX9-NEXT: s_and_b32 s6, s4, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s7 +; GFX9-NEXT: s_lshr_b32 s7, s0, 16 +; GFX9-NEXT: s_lshr_b32 s8, s4, 16 ; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: 
s_lshl_b32 s4, s9, s10 -; GFX9-NEXT: s_mov_b32 s9, 0xffff +; GFX9-NEXT: s_lshl_b32 s4, s7, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 -; GFX9-NEXT: s_lshr_b32 s10, s7, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, s7 -; GFX9-NEXT: s_lshr_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshr_b32 s7, s6, 16 +; GFX9-NEXT: s_lshr_b32 s2, s2, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, s7 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s5, s6 -; GFX9-NEXT: s_andn2_b32 s4, s6, s5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xf000f +; GFX9-NEXT: s_andn2_b32 s4, 0xf000f, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s8 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX9-NEXT: s_lshl_b32 s5, s5, 1 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s5, s1, 16 @@ -3873,7 +3838,7 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; GFX9-NEXT: s_lshl_b32 s4, s5, s6 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s2, s3, s2 ; GFX9-NEXT: s_lshr_b32 s3, s4, s5 @@ -3883,41 +3848,38 @@ define amdgpu_ps <2 x i32> @s_fshr_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg % ; ; GFX10-LABEL: s_fshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s7, 0x10001 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_mov_b32 s6, 0xf000f -; GFX10-NEXT: s_lshl_b32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s8, s8, 1 -; GFX10-NEXT: s_and_b32 s9, s4, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s8 -; GFX10-NEXT: s_andn2_b32 s4, s6, s4 -; GFX10-NEXT: s_lshr_b32 s8, s0, 16 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x10001 +; GFX10-NEXT: s_lshl_b32 s6, s6, 1 +; GFX10-NEXT: s_and_b32 s7, s4, 0xf000f +; 
GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s6 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s4 +; GFX10-NEXT: s_lshr_b32 s6, s0, 16 +; GFX10-NEXT: s_lshr_b32 s8, s4, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, s10 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_lshl_b32 s4, s6, s8 +; GFX10-NEXT: s_lshr_b32 s6, s2, 16 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s1, s1, s7 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshr_b32 s8, s7, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x10001 ; GFX10-NEXT: s_lshl_b32 s4, s4, 1 -; GFX10-NEXT: s_and_b32 s7, s5, s6 +; GFX10-NEXT: s_lshr_b32 s2, s2, s7 +; GFX10-NEXT: s_lshr_b32 s6, s6, s8 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 -; GFX10-NEXT: s_andn2_b32 s4, s6, s5 +; GFX10-NEXT: s_andn2_b32 s4, 0xf000f, s5 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s6, s5, 0xf000f ; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshr_b32 s10, s2, 16 -; GFX10-NEXT: s_and_b32 s2, s2, s8 -; GFX10-NEXT: s_lshr_b32 s11, s9, 16 +; GFX10-NEXT: s_lshr_b32 s7, s4, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s4, s5, s6 +; GFX10-NEXT: s_lshl_b32 s4, s5, s7 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 -; GFX10-NEXT: s_lshr_b32 s6, s7, 16 -; GFX10-NEXT: s_lshr_b32 s2, s2, s9 -; GFX10-NEXT: s_lshr_b32 s9, s10, s11 -; GFX10-NEXT: s_lshr_b32 s3, s3, s7 -; GFX10-NEXT: s_lshr_b32 s5, s5, s6 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s9 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff +; GFX10-NEXT: s_lshr_b32 s7, s6, 16 +; GFX10-NEXT: s_lshr_b32 s3, s3, s6 +; GFX10-NEXT: s_lshr_b32 s5, s5, s7 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: s_or_b32 s0, s0, s2 @@ -4090,16 +4052,15 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: 
v_xor_b32_e32 v6, -1, v4 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f +; GFX10-NEXT: v_and_b32_e32 v4, 0xf000f, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v6 -; GFX10-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX10-NEXT: v_and_b32_e32 v5, 0xf000f, v5 +; GFX10-NEXT: v_and_b32_e32 v6, 0xf000f, v6 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 1, v1 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX10-NEXT: v_and_b32_e32 v7, 0xf000f, v7 ; GFX10-NEXT: v_pk_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshrrev_b16 v3, v5, v3 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, v6, v0 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v7, v1 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 @@ -5008,35 +4969,34 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v8 -; GFX10-NEXT: s_movk_i32 s4, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 31, v1 -; GFX10-NEXT: v_and_b32_e32 v19, s4, v8 -; GFX10-NEXT: v_and_b32_e32 v18, s4, v9 +; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v8 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] +; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 +; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v19 +; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 
v[14:15], v18, v[0:1] -; GFX10-NEXT: v_lshlrev_b64 v[0:1], v21, v[0:1] +; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 +; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 ; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX10-NEXT: v_lshrrev_b64 v[8:9], v20, v[6:7] -; GFX10-NEXT: v_or_b32_e32 v12, v12, v16 +; GFX10-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX10-NEXT: v_or_b32_e32 v13, v13, v17 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v12, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 0, v19 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v18 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v9, v13, s4 ; GFX10-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo @@ -5219,36 +5179,35 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-LABEL: v_fshr_i128_ssv: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v0 -; GFX10-NEXT: s_movk_i32 s10, 0x7f +; GFX10-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX10-NEXT: s_mov_b32 s9, 0 -; GFX10-NEXT: v_and_b32_e32 v13, s10, v0 ; GFX10-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 -; GFX10-NEXT: v_and_b32_e32 v12, s10, v1 ; GFX10-NEXT: s_lshr_b32 s8, s1, 31 -; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 +; GFX10-NEXT: v_and_b32_e32 v12, 0x7f, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 +; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] +; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] -; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, 
s[6:7] -; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] +; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 +; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v12, s[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 +; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX10-NEXT: v_lshrrev_b64 v[0:1], v14, s[6:7] -; GFX10-NEXT: v_or_b32_e32 v4, v4, v8 ; GFX10-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v10, v11, v3, vcc_lo ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v13, s[6:7] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v12 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo @@ -6464,11 +6423,10 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v17, -1, v16 -; GFX10-NEXT: s_movk_i32 s5, 0x7f ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX10-NEXT: v_and_b32_e32 v26, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v26, 0x7f, v16 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7] -; GFX10-NEXT: v_and_b32_e32 v25, s5, v17 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] ; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 @@ -6502,14 +6460,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s4 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_and_b32_e32 v25, s5, v16 +; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v16 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v8, 31, v5 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo ; GFX10-NEXT: v_or_b32_e32 v0, v23, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v6, v6, v8 -; GFX10-NEXT: v_and_b32_e32 v23, s5, v20 +; GFX10-NEXT: v_and_b32_e32 v23, 0x7f, v20 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v2, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v26, 0, v3, s4 ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll index 4588b84e664a..17fcb5488684 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -9,14 +9,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s0, s0, s3 +; GFX9-NEXT: s_or_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -27,14 +26,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 -; GFX8-NEXT: s_or_b32 s0, s0, s3 +; GFX8-NEXT: s_or_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -45,14 +43,13 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s0, s0, s3 +; GFX7-NEXT: s_or_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 @@ -63,15 +60,14 @@ define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_and_b32 s3, s4, s2 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dword v[0:1], v2, off @@ -87,13 +83,12 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s3, 1 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; 
GFX9-NEXT: s_and_b32 s2, s2, s1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -104,36 +99,34 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v2i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 +; GFX8-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v2i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, 0 -; GFX7-NEXT: 
v_or_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_or_b32_e32 v2, s1, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm ; @@ -141,13 +134,12 @@ define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v2, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s3, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s1, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v2, s1, s0 @@ -165,9 +157,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s1 +; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_andn2_b32 s0, s0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -182,9 +173,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 @@ -198,11 +188,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg % ; GFX7: ; %bb.0: ; 
GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: s_and_b32 s1, s4, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -215,11 +204,10 @@ define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: s_and_b32 s1, s4, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_andn2_b32 s0, s0, s2 @@ -237,9 +225,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s2, s4, s1 +; GFX9-NEXT: s_mov_b32 s1, 0xffff +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v0 @@ -254,9 +242,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg % ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s1, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: s_mov_b32 s1, 0xffff +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: 
v_xor_b32_e32 v0, -1, v0 @@ -272,11 +260,10 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg % ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s1, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s2, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s1, v0 +; GFX7-NEXT: s_and_b32 s1, s4, 0xffff +; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 @@ -290,10 +277,9 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s1, 0xffff +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: s_and_b32 s1, s4, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -364,9 +350,8 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -386,9 +371,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; 
GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v0, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s0 ; GFX9-NEXT: v_xor_b32_e32 v4, -1, v0 @@ -402,9 +387,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v2i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 @@ -420,12 +405,11 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i ; GFX7-LABEL: insertelement_v_v2i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, v0, v1 @@ -439,10 +423,9 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 
v0, 0 @@ -463,9 +446,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i ; GFX9-NEXT: global_load_dword v3, v[0:1], off ; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshl_b32 s0, s0, 4 -; GFX9-NEXT: s_mov_b32 s1, 0xffff ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_lshl_b32 s0, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_not_b32 s0, s0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -477,11 +459,10 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v2i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_and_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -495,13 +476,12 @@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i ; GFX7-LABEL: insertelement_v_v2i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dword v0, v[0:1] -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v1 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 @@ -514,12 +494,11 
@@ define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v3, v[0:1], off ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff -; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -587,9 +566,8 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v4, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 @@ -665,22 +643,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v4i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshr_b32 s1, s3, 1 -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s3, s3, 4 -; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s0, s0, s3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s3, 1 +; GFX9-NEXT: s_lshr_b32 s0, s3, 1 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s1, 
s1, 4 +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v4, v5, s0, v4 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v4, v5, s1, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -689,22 +666,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v4i16_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s3, 1 -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s3, s3, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 1 +; GFX8-NEXT: s_lshr_b32 s0, s3, 1 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -713,22 +689,21 @@ define 
amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i ; GFX7-LABEL: insertelement_v_v4i16_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s3, 1 -; GFX7-NEXT: s_and_b32 s3, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s3, s3, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s3 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_and_b32 s1, s3, 1 +; GFX7-NEXT: s_lshr_b32 s0, s3, 1 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX7-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -737,23 +712,22 @@ define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v4i16_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_lshr_b32 s1, s3, 1 -; GFX10-NEXT: s_and_b32 s3, s3, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_lshl_b32 s3, s3, 4 -; GFX10-NEXT: s_and_b32 s2, s2, s0 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: s_lshr_b32 s0, s3, 1 +; GFX10-NEXT: s_and_b32 s1, s3, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 +; GFX10-NEXT: s_lshl_b32 s1, s1, 4 +; GFX10-NEXT: 
s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 +; GFX10-NEXT: v_and_or_b32 v4, v2, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX10-NEXT: s_endpgm %vec = load <4 x i16>, <4 x i16> addrspace(1 )* %ptr @@ -768,18 +742,17 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s5, s5, s4 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -794,21 +767,20 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc @@ -820,18 +792,17 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg % ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s5, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_mov_b32_e32 v2, 0 @@ -845,9 +816,8 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s2, s4, 1 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX10-NEXT: s_cselect_b32 s3, s1, s0 @@ -855,7 +825,7 @@ define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -877,13 +847,13 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_mov_b32 s2, 0xffff +; GFX9-NEXT: s_and_b32 s3, s4, 0xffff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -904,13 +874,13 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s3, s4, s2 +; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_and_b32 s3, s4, 0xffff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s3 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s2 @@ -932,16 +902,15 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg % 
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s3, s4, s2 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_lshl_b32_e32 v3, s3, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX7-NEXT: v_or_b32_e32 v4, v0, v3 @@ -960,12 +929,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg % ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v0 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s2 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 @@ -1073,10 +1041,9 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg % ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 1, v1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -1102,10 +1069,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v4i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1125,10 +1092,10 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v4i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 @@ -1149,13 +1116,12 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i ; GFX7-LABEL: insertelement_v_v4i16_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX7-NEXT: v_lshl_b32_e32 v6, s1, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v6, s0, v2 +; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 @@ -1175,11 +1141,10 @@ 
define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v3, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: s_and_b32 s0, s2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, s0 -; GFX10-NEXT: s_and_b32 s0, s2, s0 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v3, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v5 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 @@ -1202,20 +1167,19 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v4i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_lshr_b32 s1, s2, 1 -; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: s_lshl_b32 s2, s2, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX9-NEXT: s_not_b32 s0, s0 +; GFX9-NEXT: s_and_b32 s1, s2, 1 +; GFX9-NEXT: s_lshr_b32 s0, s2, 1 +; GFX9-NEXT: s_lshl_b32 s1, s1, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX9-NEXT: s_not_b32 s1, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v5, s0, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX9-NEXT: v_and_or_b32 v2, v5, s1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], 
off @@ -1224,22 +1188,21 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v4i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 1 -; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff -; GFX8-NEXT: s_lshl_b32 s2, s2, 4 -; GFX8-NEXT: v_mov_b32_e32 v5, s2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX8-NEXT: s_and_b32 s1, s2, 1 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: s_lshl_b32 s1, s1, 4 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1248,22 +1211,21 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i ; GFX7-LABEL: insertelement_v_v4i16_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX7-NEXT: s_lshr_b32 s1, s2, 1 -; GFX7-NEXT: s_and_b32 s2, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff -; GFX7-NEXT: s_lshl_b32 s2, s2, 4 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s2 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX7-NEXT: s_not_b32 s0, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_and_b32 s1, s2, 1 +; GFX7-NEXT: s_lshr_b32 s0, s2, 1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, 
v2 +; GFX7-NEXT: s_lshl_b32 s1, s1, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -1272,18 +1234,17 @@ define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v4i16_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_lshr_b32 s1, s2, 1 ; GFX10-NEXT: s_and_b32 s0, s2, 1 -; GFX10-NEXT: s_lshr_b32 s2, s2, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s1, 1 ; GFX10-NEXT: s_lshl_b32 s0, s0, 4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v3, s0, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s2, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -1371,12 +1332,11 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v4, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff 
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 4, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, s0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc_lo @@ -1399,8 +1359,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s6, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s6, 1 -; GFX9-NEXT: s_mov_b32 s8, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s7, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s6, 2 @@ -1409,9 +1369,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: s_lshl_b32 s5, s5, 4 -; GFX9-NEXT: s_and_b32 s4, s4, s8 +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s8, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -1423,7 +1383,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1435,8 +1394,8 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: 
s_lshr_b32 s6, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s6, 1 -; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s7, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s6, 2 @@ -1445,9 +1404,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: s_lshl_b32 s5, s5, 4 -; GFX8-NEXT: s_and_b32 s4, s4, s8 +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s8, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -1459,7 +1418,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -1471,7 +1429,6 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s6, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s6, 1 -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s7, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s6, 2 @@ -1480,9 +1437,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 1 ; GFX7-NEXT: s_lshl_b32 s5, s5, 4 -; GFX7-NEXT: s_and_b32 s4, s4, s8 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s8, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xffff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -1507,9 +1464,8 @@ define amdgpu_ps void 
@insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s6, s5, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s7, s1, s0 @@ -1518,9 +1474,9 @@ define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg % ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 ; GFX10-NEXT: s_cselect_b32 s7, s3, s7 ; GFX10-NEXT: s_and_b32 s5, s5, 1 -; GFX10-NEXT: s_and_b32 s4, s4, s8 +; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s5, s5, 4 -; GFX10-NEXT: s_lshl_b32 s8, s8, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 ; GFX10-NEXT: s_andn2_b32 s5, s7, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 @@ -1548,17 +1504,16 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v8i16_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s4, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s5, s0 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 @@ -1578,14 +1533,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v8i16_s_s: ; 
GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s4, s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1611,14 +1565,13 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s4, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s5, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_lshl_b32 s5, s2, s1 ; GFX7-NEXT: s_not_b32 s6, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -1646,9 +1599,8 @@ define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_lshl_b32 s3, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_and_b32 s2, s2, s5 -; GFX10-NEXT: s_lshl_b32 s5, s5, s3 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff +; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 ; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1677,8 
+1629,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s5, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s5, 1 -; GFX9-NEXT: s_mov_b32 s7, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_cmp_eq_u32 s5, 2 @@ -1687,12 +1639,11 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: s_cselect_b32 s6, s3, s6 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_lshl_b32 s4, s4, 4 -; GFX9-NEXT: s_lshl_b32 s7, s7, s4 +; GFX9-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_lshl_or_b32 v6, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1713,8 +1664,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s5, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s5, 1 -; GFX8-NEXT: s_mov_b32 s7, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s6, s1, s0 ; GFX8-NEXT: s_cmp_eq_u32 s5, 2 @@ -1724,7 +1675,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_lshl_b32 s4, s4, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s7, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v6, s4, v0 @@ -1737,7 +1688,6 @@ define amdgpu_ps 
void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc @@ -1749,8 +1699,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s5, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s5, 1 -; GFX7-NEXT: s_mov_b32 s7, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s6, s1, s0 ; GFX7-NEXT: s_cmp_eq_u32 s5, 2 @@ -1760,11 +1710,10 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s4, s4, 1 ; GFX7-NEXT: s_lshl_b32 s4, s4, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s7, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 ; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 @@ -1785,9 +1734,8 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s5, s4, 1 -; GFX10-NEXT: s_mov_b32 s7, 0xffff +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 -; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 @@ -1801,7 +1749,7 @@ define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg % ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX10-NEXT: s_lshl_b32 s4, s4, 4 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, s4 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 @@ -1828,15 +1776,15 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg % ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1865,15 +1813,15 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg % ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -1903,20 +1851,19 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg % ; 
GFX7-NEXT: v_lshrrev_b32_e32 v4, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -1939,15 +1886,14 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg % ; GFX10-LABEL: insertelement_s_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v0 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s4, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v0 +; GFX10-NEXT: s_and_b32 s1, s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 @@ -2091,15 +2037,15 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg % ; GFX10-LABEL: insertelement_s_v8i16_v_v: ; GFX10: ; 
%bb.0: ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 1, v1 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v6 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v2, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -2130,10 +2076,10 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v8i16_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2159,10 +2105,10 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v8i16_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -2192,15 +2138,14 
@@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 @@ -2221,15 +2166,14 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v8i16_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: s_and_b32 s1, s2, s0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 +; GFX10-NEXT: s_and_b32 s1, s2, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, s0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 @@ -2256,13 +2200,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i ; GFX9-LABEL: insertelement_v_v8i16_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; 
GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s4, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2284,13 +2227,12 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i ; GFX8-LABEL: insertelement_v_v8i16_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s4, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2317,14 +2259,13 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s4, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; 
GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s5, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -2352,9 +2293,8 @@ define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_lshl_b32 s2, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s2, s4, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2469,15 +2409,14 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i ; GFX10-LABEL: insertelement_v_v8i16_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off -; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 -; GFX10-NEXT: s_mov_b32 s0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s0 -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v1 +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2505,8 +2444,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s7, s5, 1 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: 
v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 2 @@ -2523,11 +2462,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s5, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s3, s4, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 -; GFX9-NEXT: s_lshl_b32 s1, s2, s1 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: s_lshl_b32 s2, s2, s1 +; GFX9-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s1 -; GFX9-NEXT: s_or_b32 s16, s0, s3 +; GFX9-NEXT: s_or_b32 s16, s0, s2 ; GFX9-NEXT: s_cmp_eq_u32 s7, 0 ; GFX9-NEXT: s_cselect_b32 s0, s16, s8 ; GFX9-NEXT: s_cmp_eq_u32 s7, 1 @@ -2544,7 +2483,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -2564,8 +2502,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s7, s5, 1 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 2 @@ -2582,11 +2520,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: s_cselect_b32 s0, s15, s0 ; GFX8-NEXT: s_and_b32 s1, s5, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s3, s4, s2 -; GFX8-NEXT: s_lshl_b32 s3, s3, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xffff +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_andn2_b32 s0, 
s0, s1 -; GFX8-NEXT: s_or_b32 s16, s0, s3 +; GFX8-NEXT: s_or_b32 s16, s0, s2 ; GFX8-NEXT: s_cmp_eq_u32 s7, 0 ; GFX8-NEXT: s_cselect_b32 s0, s16, s8 ; GFX8-NEXT: s_cmp_eq_u32 s7, 1 @@ -2603,7 +2541,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 -; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -2623,7 +2560,6 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s7, s5, 1 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 2 @@ -2640,11 +2576,11 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: s_cselect_b32 s0, s15, s0 ; GFX7-NEXT: s_and_b32 s1, s5, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s3, s4, s2 -; GFX7-NEXT: s_lshl_b32 s3, s3, s1 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_and_b32 s2, s4, 0xffff +; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 -; GFX7-NEXT: s_or_b32 s16, s0, s3 +; GFX7-NEXT: s_or_b32 s16, s0, s2 ; GFX7-NEXT: s_cmp_eq_u32 s7, 0 ; GFX7-NEXT: s_cselect_b32 s0, s16, s8 ; GFX7-NEXT: s_cmp_eq_u32 s7, 1 @@ -2681,9 +2617,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s7, s5, 1 -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -2702,11 +2637,11 @@ 
define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg ; GFX10-NEXT: s_cmp_eq_u32 s7, 7 ; GFX10-NEXT: s_cselect_b32 s0, s15, s0 ; GFX10-NEXT: s_and_b32 s1, s5, 1 -; GFX10-NEXT: s_and_b32 s3, s4, s2 +; GFX10-NEXT: s_and_b32 s2, s4, 0xffff ; GFX10-NEXT: s_lshl_b32 s1, s1, 4 -; GFX10-NEXT: s_lshl_b32 s2, s2, s1 -; GFX10-NEXT: s_lshl_b32 s1, s3, s1 -; GFX10-NEXT: s_andn2_b32 s0, s0, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s1 +; GFX10-NEXT: s_lshl_b32 s1, s2, s1 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: s_or_b32 s16, s0, s1 ; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s16, s8 @@ -2746,17 +2681,16 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s3, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_and_b32 s2, s2, s0 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: s_lshl_b32 s1, s1, s0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: s_lshl_b32 s2, s2, s1 ; GFX9-NEXT: s_not_b32 s13, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s12, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 @@ -2795,14 +2729,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[6:9], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s3, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s3, 1 ; GFX8-NEXT: s_lshr_b32 s12, 
s3, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: s_lshl_b32 s13, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: s_lshl_b32 s13, s2, s1 ; GFX8-NEXT: s_not_b32 s14, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2845,14 +2778,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s1, s3, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s3, 1 ; GFX7-NEXT: s_lshr_b32 s12, s3, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s1, s2, 0xffff +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: s_lshl_b32 s13, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: s_lshl_b32 s13, s2, s1 ; GFX7-NEXT: s_not_b32 s14, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -2892,22 +2824,21 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: global_load_dwordx4 v[2:5], v[0:1], off ; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX10-NEXT: s_lshr_b32 s7, s3, 1 -; GFX10-NEXT: s_mov_b32 s8, 0xffff +; GFX10-NEXT: s_and_b32 s8, s2, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s7, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, s7, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s7, 5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s7, 6 -; GFX10-NEXT: s_and_b32 s9, s2, s8 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 
s7, 7 ; GFX10-NEXT: s_and_b32 s3, s3, 1 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_lshl_b32 s3, s3, 4 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: s_lshl_b32 s8, s8, s3 -; GFX10-NEXT: s_lshl_b32 s3, s9, s3 -; GFX10-NEXT: s_not_b32 s8, s8 +; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s3 +; GFX10-NEXT: s_lshl_b32 s3, s8, s3 +; GFX10-NEXT: s_not_b32 s8, s9 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -2944,8 +2875,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX9-NEXT: s_lshr_b32 s2, s4, 1 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 -; GFX9-NEXT: s_mov_b32 s3, 0xffff -; GFX9-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_cselect_b32 s0, s9, s8 ; GFX9-NEXT: s_cmp_eq_u32 s2, 2 @@ -2962,12 +2893,11 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: s_cselect_b32 s0, s15, s0 ; GFX9-NEXT: s_and_b32 s1, s4, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s3, s3, s1 +; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s1 ; GFX9-NEXT: s_andn2_b32 s0, s0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v8, v0, s1, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3003,8 +2933,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX8-NEXT: s_lshr_b32 s2, s4, 1 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_cselect_b32 s0, s9, s8 ; GFX8-NEXT: s_cmp_eq_u32 s2, 2 @@ -3022,7 +2952,7 @@ 
define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: s_and_b32 s1, s4, 1 ; GFX8-NEXT: s_lshl_b32 s1, s1, 4 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: s_lshl_b32 s1, s3, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v8, s0, v0 @@ -3051,7 +2981,6 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -3062,8 +2991,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX7-NEXT: s_lshr_b32 s2, s4, 1 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 -; GFX7-NEXT: s_mov_b32 s3, 0xffff -; GFX7-NEXT: v_and_b32_e32 v0, s3, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_cselect_b32 s0, s9, s8 ; GFX7-NEXT: s_cmp_eq_u32 s2, 2 @@ -3081,11 +3010,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: s_and_b32 s1, s4, 1 ; GFX7-NEXT: s_lshl_b32 s1, s1, 4 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xffff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 @@ -3120,9 +3048,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX10: ; %bb.0: ; GFX10-NEXT: 
s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: s_lshr_b32 s0, s4, 1 -; GFX10-NEXT: s_mov_b32 s3, 0xffff +; GFX10-NEXT: v_and_b32_e32 v8, 0xffff, v0 ; GFX10-NEXT: s_cmp_eq_u32 s0, 1 -; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 ; GFX10-NEXT: v_mov_b32_e32 v10, 16 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -3150,7 +3077,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg ; GFX10-NEXT: v_mov_b32_e32 v6, s14 ; GFX10-NEXT: s_lshl_b32 s2, s2, 4 ; GFX10-NEXT: v_mov_b32_e32 v7, s15 -; GFX10-NEXT: s_lshl_b32 s3, s3, s2 +; GFX10-NEXT: s_lshl_b32 s3, 0xffff, s2 ; GFX10-NEXT: s_andn2_b32 s1, s1, s3 ; GFX10-NEXT: v_lshl_or_b32 v12, v8, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -3201,12 +3128,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v7, s22 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s5 +; GFX9-NEXT: s_mov_b32 s5, 0xffff +; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v9, s23 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3261,12 +3188,12 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_mov_b32 s5, 0xffff ; GFX8-NEXT: v_mov_b32_e32 v7, s22 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s5 +; GFX8-NEXT: s_mov_b32 s5, 0xffff +; GFX8-NEXT: s_and_b32 s4, s4, 0xffff ; GFX8-NEXT: 
v_mov_b32_e32 v9, s23 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 @@ -3322,17 +3249,16 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[14:15] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v8 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX7-NEXT: s_mov_b32 s5, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v7, s22 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s5 +; GFX7-NEXT: s_and_b32 s4, s4, 0xffff ; GFX7-NEXT: v_mov_b32_e32 v9, s23 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[8:9] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v8 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 @@ -3367,8 +3293,7 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: s_and_b32 s6, s4, s5 +; GFX10-NEXT: s_and_b32 s5, s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 @@ -3376,10 +3301,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v8, v0, s5 +; 
GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 @@ -3606,21 +3531,21 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v12 +; GFX10-NEXT: s_mov_b32 null, 0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v12 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v12 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v12 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v12 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v12 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 @@ -3663,10 +3588,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX9-NEXT: s_and_b32 s1, s2, s0 +; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s2, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e64 
v2, v1, s1 @@ -3711,10 +3636,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] -; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX8-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s2, 0xffff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v1, s1 @@ -3761,19 +3686,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_mov_b32 s0, 0xffff ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX7-NEXT: v_and_b32_e32 v1, 1, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 +; GFX7-NEXT: s_and_b32 s0, s2, 0xffff ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 4, v1 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 +; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[6:7], 5, v0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[8:9], 6, v0 +; GFX7-NEXT: v_lshl_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 @@ -3809,9 +3733,9 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: s_and_b32 s5, s2, 0xffff ; 
GFX10-NEXT: v_mov_b32_e32 v13, 16 -; GFX10-NEXT: s_and_b32 s6, s2, s5 +; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 @@ -3819,11 +3743,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 -; GFX10-NEXT: v_mov_b32_e32 v14, 0 -; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, 0xffff +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_xor_b32_e32 v11, -1, v11 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo @@ -3859,13 +3782,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 -; GFX9-NEXT: s_and_b32 s1, s2, 1 -; GFX9-NEXT: s_mov_b32 s0, 0xffff +; GFX9-NEXT: s_and_b32 s0, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 -; GFX9-NEXT: s_lshl_b32 s1, s1, 4 -; GFX9-NEXT: s_lshl_b32 s0, s0, s1 +; GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: s_not_b32 s13, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -3906,13 +3828,12 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx4 v[7:10], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 1 -; GFX8-NEXT: s_mov_b32 s0, 0xffff +; GFX8-NEXT: s_and_b32 s0, s2, 1 ; GFX8-NEXT: s_lshr_b32 s12, s2, 1 -; GFX8-NEXT: s_lshl_b32 s1, s1, 4 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, s0, 4 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_not_b32 s13, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -3956,14 +3877,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX7-NEXT: s_mov_b64 s[16:17], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64 ; GFX7-NEXT: buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16 -; GFX7-NEXT: s_and_b32 s1, s2, 1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_and_b32 s0, s2, 1 ; GFX7-NEXT: s_lshr_b32 s12, s2, 1 -; GFX7-NEXT: s_lshl_b32 s1, s1, 4 -; GFX7-NEXT: v_and_b32_e32 v0, s0, v2 -; GFX7-NEXT: s_lshl_b32 s0, s0, s1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 +; GFX7-NEXT: s_lshl_b32 s0, 0xffff, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s12, 1 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 ; GFX7-NEXT: s_not_b32 s13, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s12, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, 3 @@ -4012,9 +3932,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s6, 6 ; GFX10-NEXT: s_lshl_b32 s7, s5, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, s6, 7 -; GFX10-NEXT: s_mov_b32 s8, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: s_lshl_b32 s7, s8, s7 +; GFX10-NEXT: s_lshl_b32 s7, 0xffff, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, s6, 0 ; 
GFX10-NEXT: s_not_b32 s7, s7 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 @@ -4197,19 +4116,18 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, ; GFX10-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: s_mov_b32 s4, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v14, 16 ; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 4, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s3, 5, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 4, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 6, v0 +; GFX10-NEXT: v_lshlrev_b32_e64 v12, v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll index 21457c369b3c..3e6bc4e42524 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -722,33 +722,34 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_and_b32 s3, s3, 3 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; 
GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v3, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v2, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v3, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v2, v0, v1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s4, v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v2, v3 +; GFX9-NEXT: v_or3_b32 v2, v2, v3, v4 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; @@ 
-757,13 +758,12 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: flat_load_dword v0, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, 8 ; GFX8-NEXT: v_mov_b32_e32 v2, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s3, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -775,7 +775,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v0 -; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX8-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -794,18 +794,17 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_and_b32 s1, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: s_lshl_b32 s1, s1, s0 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 +; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: 
s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -813,11 +812,11 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX7-NEXT: v_or_b32_e32 v0, s2, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s0, v0 +; GFX7-NEXT: v_or_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -832,28 +831,27 @@ define amdgpu_ps void @insertelement_v_v4i8_s_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s1 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX10-NEXT: s_and_b32 s0, s3, 3 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_or3_b32 v0, v0, v3, v1 -; GFX10-NEXT: s_lshl_b32 s3, s1, s0 -; GFX10-NEXT: s_lshl_b32 s0, s2, s0 -; GFX10-NEXT: s_not_b32 s2, s3 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, 
s0 +; GFX10-NEXT: s_lshl_b32 s0, s1, s0 +; GFX10-NEXT: s_not_b32 s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, s0 +; GFX10-NEXT: v_and_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -877,7 +875,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -887,7 +885,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_or_b32 s0, s0, s3 ; GFX9-NEXT: s_and_b32 s3, s4, 3 ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: s_lshl_b32 s4, s5, s3 +; GFX9-NEXT: s_lshl_b32 s4, 0xff, s3 ; GFX9-NEXT: s_andn2_b32 s0, s0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, s3, v1 @@ -905,24 +903,23 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 -; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX8-NEXT: s_lshr_b32 s1, s0, 24 +; 
GFX8-NEXT: s_and_b32 s2, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s3, s3, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX8-NEXT: s_or_b32 s3, s3, s5 +; GFX8-NEXT: s_or_b32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s3, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, 3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: s_lshl_b32 s1, s1, s2 +; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_lshl_b32 s1, s1, 24 +; GFX8-NEXT: s_or_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s4, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s0, s0, s1 ; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 @@ -946,7 +943,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s5 @@ -957,7 +954,7 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s1, s4, 3 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s1, v0 -; GFX7-NEXT: s_lshl_b32 s1, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_andn2_b32 s0, s0, s1 ; GFX7-NEXT: v_or_b32_e32 v0, s0, v0 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 @@ -979,30 +976,29 @@ define amdgpu_ps void @insertelement_s_v4i8_v_s(<4 x i8> addrspace(4)* inreg %pt ; GFX10-LABEL: insertelement_s_v4i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: s_and_b32 s1, s4, 3 -; GFX10-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-NEXT: 
v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 24 -; GFX10-NEXT: s_and_b32 s4, s0, s2 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s4, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_lshl_b32 s4, s2, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: s_andn2_b32 s0, s0, s4 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_andn2_b32 s0, s0, s3 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, s1, s0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s2, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1019,14 +1015,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v4i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s5, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_mov_b32 s1, 8 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s6, s0, 
s5 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 @@ -1034,7 +1030,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s3, s3, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s3 -; GFX9-NEXT: s_and_b32 s3, s4, s5 +; GFX9-NEXT: s_and_b32 s3, s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v0, s3 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s5 ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1054,14 +1050,14 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, 8 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 +; GFX8-NEXT: s_and_b32 s3, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s5 @@ -1069,7 +1065,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_or_b32 s0, s3, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s4, s1 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_lshlrev_b32_e64 v1, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1091,13 +1087,13 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-LABEL: insertelement_s_v4i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX7-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s5 @@ -1105,7 +1101,7 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_or_b32 s0, s3, s0 ; GFX7-NEXT: s_lshl_b32 s1, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_and_b32 s1, s4, s2 +; GFX7-NEXT: s_and_b32 s1, s4, 0xff ; GFX7-NEXT: v_lshl_b32_e32 v1, s1, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s2, v0 ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 @@ -1131,30 +1127,29 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(<4 x i8> addrspace(4)* inreg %pt ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 -; GFX10-NEXT: s_movk_i32 s1, 0xff -; GFX10-NEXT: s_and_b32 s2, s4, s1 +; GFX10-NEXT: s_and_b32 s1, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: 
v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1171,14 +1166,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v4i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s0, 24 -; GFX9-NEXT: s_and_b32 s5, s0, s4 +; GFX9-NEXT: s_and_b32 s5, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s6 @@ -1205,14 +1200,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v4i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0 -; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s1, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s3, s0, s1 +; GFX8-NEXT: s_and_b32 s3, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s3, s3, s4 @@ -1248,7 +1243,7 @@ 
define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_bfe_u32 s4, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s1, s0, 24 -; GFX7-NEXT: s_and_b32 s3, s0, s2 +; GFX7-NEXT: s_and_b32 s3, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, 8 ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s3, s3, s4 @@ -1281,29 +1276,28 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(<4 x i8> addrspace(4)* inreg %pt ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-NEXT: s_lshr_b32 s2, s0, 24 -; GFX10-NEXT: s_and_b32 s3, s0, s1 +; GFX10-NEXT: s_bfe_u32 s3, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s1, s0, 24 +; GFX10-NEXT: s_and_b32 s2, s0, 0xff ; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 -; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s3, s3, s4 -; GFX10-NEXT: s_lshl_b32 s2, s2, 24 -; GFX10-NEXT: s_or_b32 s0, s3, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 24 +; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s1 ; GFX10-NEXT: v_and_or_b32 v0, s0, v1, v0 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s1, v1 +; 
GFX10-NEXT: v_and_or_b32 v4, v0, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1321,62 +1315,63 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 -; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v4, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e64 v5, v2, s2 ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 -; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_or3_b32 v0, v0, v8, v6 +; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v5 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v3, 16 +; GFX8-NEXT: v_mov_b32_e32 v3, 8 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0xff +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v6, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v4, 8 -; GFX8-NEXT: v_mov_b32_e32 v5, 16 +; GFX8-NEXT: v_lshlrev_b32_e64 v7, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX8-NEXT: v_mov_b32_e32 v5, 8 +; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v7 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; 
GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v6 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 @@ -1392,37 +1387,37 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: v_and_b32_e32 v1, 3, v2 -; GFX7-NEXT: s_and_b32 s1, s2, s0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX7-NEXT: v_lshl_b32_e32 v2, s1, v1 -; GFX7-NEXT: v_lshl_b32_e32 v1, s0, v1 -; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX7-NEXT: v_mov_b32_e32 v1, 0xff +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, 
v2 +; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s0, v0 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v1, v0, v1 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1431,17 +1426,16 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v2 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: 
v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v4, v1, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v3 -; GFX10-NEXT: s_and_b32 s0, s2, s1 +; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v5, v2 @@ -1451,7 +1445,7 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1474,40 +1468,40 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: s_movk_i32 s3, 0xff ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 8 -; GFX9-NEXT: v_mov_b32_e32 v3, 16 +; GFX9-NEXT: v_mov_b32_e32 v3, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff +; GFX9-NEXT: v_mov_b32_e32 v4, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v5, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_or3_b32 v0, v0, v6, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX9-NEXT: v_or3_b32 v0, v0, v7, v5 ; GFX9-NEXT: v_and_or_b32 v0, v0, s2, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v4, v0, s3, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v0, v1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_or3_b32 v2, v4, v3, v2 +; GFX9-NEXT: v_or3_b32 v2, v3, v4, v2 ; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v4i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v3, 16 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v6, s0 ; 
GFX8-NEXT: v_lshlrev_b32_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v5, 16 @@ -1544,7 +1538,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_and_b32_e32 v1, s0, v2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, s1, v1 -; GFX7-NEXT: s_lshl_b32 s1, s0, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: s_not_b32 s1, s1 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1562,7 +1556,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7-NEXT: v_bfe_u32 v3, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 @@ -1577,27 +1571,26 @@ define amdgpu_ps void @insertelement_v_v4i8_v_s(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_and_b32 s2, s2, 3 +; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX10-NEXT: s_movk_i32 s0, 0xff -; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: s_lshl_b32 s1, s2, 3 -; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v1 +; GFX10-NEXT: s_mov_b32 s0, 16 +; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v1, 24, v3 -; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s1, s0, s1 -; GFX10-NEXT: s_not_b32 s1, s1 +; GFX10-NEXT: s_lshl_b32 s0, s1, 3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_or3_b32 v0, v0, v4, v1 +; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, s0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v4, v0, s0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -1725,9 +1718,8 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_and_b32_e32 v1, 3, v3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, s1 +; GFX10-NEXT: v_lshlrev_b32_e64 v5, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v5 ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1735,7 +1727,7 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(<4 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: s_mov_b32 s0, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v3 +; GFX10-NEXT: 
v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: v_or3_b32 v0, v0, v6, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 @@ -1761,28 +1753,25 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v8i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s6, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_or_b32 s0, s6, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 -; GFX9-NEXT: s_or_b32 s2, s2, s7 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s2, s2, s6 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 ; GFX9-NEXT: s_lshl_b32 s2, s3, 24 @@ -1792,30 +1781,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_cselect_b32 s3, s1, s0 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s6 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s6, s5 +; GFX9-NEXT: 
s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s3, s3, s5 ; GFX9-NEXT: s_or_b32 s3, s3, s4 ; GFX9-NEXT: s_cmp_eq_u32 s2, 0 ; GFX9-NEXT: s_cselect_b32 s0, s3, s0 ; GFX9-NEXT: s_cmp_eq_u32 s2, 1 ; GFX9-NEXT: s_cselect_b32 s1, s3, s1 -; GFX9-NEXT: s_bfe_u32 s5, s0, s8 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s2, s0, 24 -; GFX9-NEXT: s_and_b32 s4, s0, s6 +; GFX9-NEXT: s_and_b32 s4, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s4, s0 ; GFX9-NEXT: s_lshl_b32 s2, s2, 24 -; GFX9-NEXT: s_bfe_u32 s4, s1, s8 +; GFX9-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s3, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s2, s1, s6 +; GFX9-NEXT: s_and_b32 s2, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s2, s2, s4 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s2, s1 @@ -1829,28 +1818,25 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v8i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s8, 0x80008 -; GFX8-NEXT: s_movk_i32 s6, 0xff ; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s8 -; GFX8-NEXT: s_and_b32 s7, s0, s6 -; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_or_b32 s7, s7, s9 -; GFX8-NEXT: s_mov_b32 s9, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s7, s0 +; GFX8-NEXT: s_or_b32 s0, s6, s0 
; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s7, s1, s8 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 -; GFX8-NEXT: s_or_b32 s2, s2, s7 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -1860,30 +1846,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s6, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s3, s3, s5 ; GFX8-NEXT: s_or_b32 s3, s3, s4 ; GFX8-NEXT: s_cmp_eq_u32 s2, 0 ; GFX8-NEXT: s_cselect_b32 s0, s3, s0 ; GFX8-NEXT: s_cmp_eq_u32 s2, 1 ; GFX8-NEXT: s_cselect_b32 s1, s3, s1 -; GFX8-NEXT: s_bfe_u32 s5, s0, s8 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_and_b32 s4, s0, s6 +; GFX8-NEXT: s_and_b32 s4, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s9 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s4, s1, s8 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s9 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: 
s_or_b32 s1, s2, s1 @@ -1897,26 +1883,23 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-LABEL: insertelement_s_v8i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s8, 0x80008 -; GFX7-NEXT: s_movk_i32 s6, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s8 -; GFX7-NEXT: s_and_b32 s7, s0, s6 -; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_or_b32 s7, s7, s9 -; GFX7-NEXT: s_mov_b32 s9, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s9 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s7, s0 +; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s7, s1, s8 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s6 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s9 -; GFX7-NEXT: s_or_b32 s2, s2, s7 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -1926,30 +1909,30 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_cselect_b32 s3, s1, s0 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, s6 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s6, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s3, s3, s5 ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_cmp_eq_u32 s2, 0 ; GFX7-NEXT: s_cselect_b32 s4, s3, s0 ; GFX7-NEXT: s_cmp_eq_u32 s2, 1 ; 
GFX7-NEXT: s_cselect_b32 s3, s3, s1 -; GFX7-NEXT: s_bfe_u32 s10, s4, s8 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s4, 24 -; GFX7-NEXT: s_and_b32 s7, s4, s6 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s4, s4, s9 -; GFX7-NEXT: s_or_b32 s7, s7, s10 +; GFX7-NEXT: s_and_b32 s6, s4, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s4, s4, 16 -; GFX7-NEXT: s_or_b32 s4, s7, s4 +; GFX7-NEXT: s_or_b32 s4, s6, s4 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_or_b32 s2, s4, s2 -; GFX7-NEXT: s_and_b32 s4, s3, s6 -; GFX7-NEXT: s_bfe_u32 s6, s3, s8 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s3, 24 +; GFX7-NEXT: s_or_b32 s2, s4, s2 +; GFX7-NEXT: s_and_b32 s4, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s9 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s4, s4, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s4, s3 @@ -1966,66 +1949,63 @@ define amdgpu_ps void @insertelement_s_v8i8_s_s(<8 x i8> addrspace(4)* inreg %pt ; GFX10-LABEL: insertelement_s_v8i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s6, 0x80010 -; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_lshr_b32 s2, s5, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s11, s0, s3 -; GFX10-NEXT: s_bfe_u32 s13, s1, s3 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s10, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_and_b32 s12, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshl_b32 s11, s11, 8 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_bfe_u32 s8, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s10, s1, 0x80008 +; GFX10-NEXT: 
s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s7, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s8, s8, 8 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s10, s10, s11 -; GFX10-NEXT: s_or_b32 s11, s12, s13 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: s_or_b32 s1, s11, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s8, s1, s0 -; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s2 -; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s9, s2, s5 -; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s8, s9 -; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s7, 0 -; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_bfe_u32 s7, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_and_b32 s5, s0, s2 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s2, s1, s2 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s5, s5, s7 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s7, s7, s8 +; GFX10-NEXT: s_or_b32 s8, s9, s10 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; 
GFX10-NEXT: s_or_b32 s1, s8, s1 ; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s3, s1, s0 +; GFX10-NEXT: s_and_b32 s5, s5, 3 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff +; GFX10-NEXT: s_lshl_b32 s5, s5, 3 +; GFX10-NEXT: s_lshl_b32 s6, 0xff, s5 +; GFX10-NEXT: s_lshl_b32 s4, s4, s5 +; GFX10-NEXT: s_andn2_b32 s3, s3, s6 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_cmp_eq_u32 s2, 0 +; GFX10-NEXT: s_cselect_b32 s0, s3, s0 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_bfe_u32 s4, s0, 0x80008 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s3, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s3, s3, s4 +; GFX10-NEXT: s_lshl_b32 s4, s7, 8 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s3, s6, s4 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s3, s1 +; GFX10-NEXT: s_lshl_b32 s3, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -2045,47 +2025,48 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: s_movk_i32 s4, 0xff ; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s4 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s4, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_not_b32 s3, s3 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 -; GFX9-NEXT: 
v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_mov_b32_e32 v5, 16 +; GFX9-NEXT: v_mov_b32_e32 v7, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v6, 16 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v7, v8, s3, v7 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 
+; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX9-NEXT: v_and_or_b32 v1, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v6, v7, s3, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v8 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v6 -; GFX9-NEXT: v_or3_b32 v1, v1, v5, v4 +; GFX9-NEXT: v_or3_b32 v1, v1, v6, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; @@ -2094,15 +2075,14 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: 
v_mov_b32_e32 v5, 16 -; GFX8-NEXT: s_lshr_b32 s1, s3, 2 -; GFX8-NEXT: s_and_b32 s3, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s3 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 -; GFX8-NEXT: s_lshl_b32 s2, s2, s3 +; GFX8-NEXT: s_and_b32 s1, s3, 3 +; GFX8-NEXT: s_lshr_b32 s0, s3, 2 +; GFX8-NEXT: s_and_b32 s2, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v6, 8 ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 @@ -2123,9 +2103,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v8 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s1, v4 ; GFX8-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2151,63 +2131,64 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s1, s3, 3 -; GFX7-NEXT: s_lshr_b32 s0, s3, 2 -; GFX7-NEXT: s_and_b32 s2, s2, s6 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: s_lshl_b32 s2, s2, s1 -; GFX7-NEXT: s_lshl_b32 s1, s6, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; 
GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 -; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v0 -; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s6, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s3, 2 +; GFX7-NEXT: s_and_b32 s3, s3, 3 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s3, s3, 3 
+; GFX7-NEXT: s_lshl_b32 s2, s2, s3 +; GFX7-NEXT: s_lshl_b32 s3, 0xff, s3 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s3, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, s0, v0 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v7, s0, v1 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX7-NEXT: v_or_b32_e32 v3, s2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 
+; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -2216,8 +2197,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: s_and_b32 s2, s2, s4 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -2225,9 +2205,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4 ; GFX10-NEXT: s_lshr_b32 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 @@ -2236,7 +2216,7 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_or3_b32 v1, v1, v7, v3 ; GFX10-NEXT: s_lshl_b32 s1, s1, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s0, 0 -; GFX10-NEXT: s_lshl_b32 s3, s4, s1 +; GFX10-NEXT: s_lshl_b32 s3, 0xff, s1 ; GFX10-NEXT: s_lshl_b32 s1, s2, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s2, s3 @@ -2251,9 +2231,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_s(<8 x i8> addrspace(1)* %ptr, i8 ; 
GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, s4, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, s4, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2271,28 +2251,27 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v8i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s7, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s2, 8 +; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s8, s0 +; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 -; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 -; GFX9-NEXT: s_or_b32 
s5, s5, s8 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s7, s7, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s5, s5, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 ; GFX9-NEXT: s_lshl_b32 s5, s6, 24 @@ -2302,59 +2281,55 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_cselect_b32 s6, s1, s0 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s8, s7, s4 -; GFX9-NEXT: s_andn2_b32 s6, s6, s8 +; GFX9-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX9-NEXT: s_andn2_b32 s6, s6, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: v_lshl_or_b32 v2, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v3, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 -; GFX9-NEXT: s_mov_b32 s3, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v0, v2, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX9-NEXT: 
v_or3_b32 v0, v5, v0, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v2, v3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v4 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_s_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s6, s0 +; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 -; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 -; GFX8-NEXT: s_or_b32 s2, s2, s6 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s5, s5, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 @@ -2365,7 +2340,7 @@ 
define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s5, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s3, s3, s4 ; GFX8-NEXT: v_or_b32_e32 v2, s3, v0 @@ -2397,27 +2372,25 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-LABEL: insertelement_s_v8i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s6, s0 +; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 -; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 -; GFX7-NEXT: s_or_b32 s2, s2, s6 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s5, s5, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 @@ -2428,34 +2401,34 @@ 
define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s5, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s3, s3, s4 -; GFX7-NEXT: v_or_b32_e32 v2, s3, v0 +; GFX7-NEXT: v_or_b32_e32 v3, s3, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v4, s5, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX7-NEXT: v_bfe_u32 v3, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v2, v1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v3 +; 
GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -2466,57 +2439,54 @@ define amdgpu_ps void @insertelement_s_v8i8_v_s(<8 x i8> addrspace(4)* inreg %pt ; GFX10-LABEL: insertelement_s_v8i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_lshr_b32 s6, s4, 2 -; GFX10-NEXT: v_and_b32_e32 v2, s2, v0 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_lshr_b32 s2, s4, 2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s10, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s7, s0, 24 -; GFX10-NEXT: s_lshr_b32 s8, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_and_b32 s11, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s5, s10, 8 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s0, 24 +; GFX10-NEXT: s_lshr_b32 s5, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_and_b32 s8, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s5, s9, s5 -; GFX10-NEXT: s_or_b32 s3, s11, s3 -; GFX10-NEXT: s_lshl_b32 s7, s7, 24 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s5, s0 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s8 -; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_or_b32 s7, s8, s9 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; 
GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_or_b32 s0, s6, s0 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_or_b32 s1, s1, s5 +; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_lshl_b32 s5, s2, s4 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s4 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v2, v2, s4, s3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 @@ -2533,27 +2503,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v8i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s9, 0x80008 -; GFX9-NEXT: s_movk_i32 s7, 0xff ; GFX9-NEXT: 
v_lshrrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s7, 0xff +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s10, s0, s9 -; GFX9-NEXT: s_and_b32 s8, s0, s7 -; GFX9-NEXT: s_lshl_b32 s10, s10, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s10 -; GFX9-NEXT: s_mov_b32 s10, 0x80010 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s9, s9, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s9 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s5 -; GFX9-NEXT: s_and_b32 s5, s1, s7 +; GFX9-NEXT: s_and_b32 s5, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s10 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s5, s1 @@ -2561,9 +2530,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_or_b32 s1, s1, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s7 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v0, s4 ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s7 @@ -2575,16 +2543,17 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s7, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s7, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2596,27 +2565,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s7, 0x80008 -; GFX8-NEXT: s_movk_i32 s5, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s5, 0xff +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s0, s7 -; GFX8-NEXT: s_and_b32 s6, s0, s5 -; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_or_b32 s6, s6, s8 -; GFX8-NEXT: s_mov_b32 s8, 0x80010 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s8 +; GFX8-NEXT: s_and_b32 s6, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s7, s7, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s6, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s6, s1, s7 +; GFX8-NEXT: 
s_bfe_u32 s6, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s5 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 @@ -2624,9 +2592,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s2, s4, s5 +; GFX8-NEXT: s_and_b32 s2, s4, 0xff ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v0, s2 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s5 @@ -2662,27 +2629,26 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-LABEL: insertelement_s_v8i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s7, 0x80008 -; GFX7-NEXT: s_movk_i32 s5, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: s_movk_i32 s5, 0xff +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s8, s0, s7 -; GFX7-NEXT: s_and_b32 s6, s0, s5 -; GFX7-NEXT: s_lshl_b32 s8, s8, 8 -; GFX7-NEXT: s_or_b32 s6, s6, s8 -; GFX7-NEXT: s_mov_b32 s8, 0x80010 +; GFX7-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s6, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s7, s7, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s6, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s6, s1, s7 +; GFX7-NEXT: s_bfe_u32 s6, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 24 ; GFX7-NEXT: s_or_b32 
s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s5 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s6 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 @@ -2690,9 +2656,8 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 ; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s2, s4, s5 +; GFX7-NEXT: s_and_b32 s2, s4, 0xff ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX7-NEXT: v_lshl_b32_e32 v3, s2, v0 ; GFX7-NEXT: v_lshl_b32_e32 v0, s5, v0 @@ -2716,7 +2681,7 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s5, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2734,38 +2699,35 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s5, 0x80010 -; GFX10-NEXT: s_and_b32 s4, s4, s2 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s4 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v1, 0xff ; GFX10-NEXT: v_xor_b32_e32 v4, -1, v0 ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s5 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s5 -; GFX10-NEXT: s_lshl_b32 s5, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s4, s7, s5 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -2781,9 +2743,9 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; 
GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -2801,33 +2763,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-LABEL: insertelement_s_v8i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s8, 0x80008 -; GFX9-NEXT: s_movk_i32 s6, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s6, 0xff +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s9, s0, s8 -; GFX9-NEXT: s_and_b32 s7, s0, s6 -; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_or_b32 s7, s7, s9 -; GFX9-NEXT: s_mov_b32 s9, 0x80010 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s9 +; GFX9-NEXT: s_and_b32 s7, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s8, s8, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s7, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s7, s1, s8 +; GFX9-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s6 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s9 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s7 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2842,16 +2802,17 @@ 
define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX9-NEXT: s_mov_b32 s2, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] ; GFX9-NEXT: s_mov_b32 s3, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_or_b32 v4, v0, s6, v4 +; GFX9-NEXT: v_and_or_b32 v5, v0, v4, v5 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v4, v0, v2 +; GFX9-NEXT: v_or3_b32 v0, v5, v0, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s6, v2 +; GFX9-NEXT: v_and_or_b32 v4, v1, v4, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v3 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2863,33 +2824,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX8-LABEL: insertelement_s_v8i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s6, 0x80008 -; GFX8-NEXT: s_movk_i32 s4, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s7, s0, s6 -; GFX8-NEXT: s_and_b32 s5, s0, s4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_or_b32 s5, s5, s7 -; GFX8-NEXT: s_mov_b32 s7, 0x80010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s2, s0, 24 -; 
GFX8-NEXT: s_bfe_u32 s0, s0, s7 +; GFX8-NEXT: s_and_b32 s5, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s6, s6, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s5, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, 24 -; GFX8-NEXT: s_bfe_u32 s5, s1, s6 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s3, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s2, s1, s4 +; GFX8-NEXT: s_and_b32 s2, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s7 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s2, s2, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: s_lshl_b32 s2, s3, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s2 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 @@ -2928,33 +2887,31 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-LABEL: insertelement_s_v8i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s6, 0x80008 ; GFX7-NEXT: s_movk_i32 s4, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 2, v1 ; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s7, s0, s6 -; GFX7-NEXT: s_and_b32 s5, s0, s4 -; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_or_b32 s5, s5, s7 -; GFX7-NEXT: s_mov_b32 s7, 0x80010 +; GFX7-NEXT: s_bfe_u32 s6, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s2, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s7 +; GFX7-NEXT: s_and_b32 s5, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s6, s6, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s5, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, 24 -; GFX7-NEXT: s_bfe_u32 s5, s1, s6 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s3, s1, 
24 ; GFX7-NEXT: s_or_b32 s0, s0, s2 -; GFX7-NEXT: s_and_b32 s2, s1, s4 +; GFX7-NEXT: s_and_b32 s2, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s7 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s2, s1 ; GFX7-NEXT: s_lshl_b32 s2, s3, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s2 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 3, v1 ; GFX7-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 ; GFX7-NEXT: v_mov_b32_e32 v4, s1 @@ -2982,7 +2939,7 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX7-NEXT: v_bfe_u32 v4, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 @@ -2999,38 +2956,35 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-LABEL: insertelement_s_v8i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 ; GFX10-NEXT: v_and_b32_e32 v2, 3, v1 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: s_mov_b32 s4, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v2 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, s2 +; GFX10-NEXT: v_lshlrev_b32_e64 v0, v2, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s8, s0, s3 -; GFX10-NEXT: s_bfe_u32 s3, s1, s3 -; GFX10-NEXT: s_lshr_b32 s6, s1, 24 -; GFX10-NEXT: s_and_b32 s9, s1, s2 -; GFX10-NEXT: s_bfe_u32 s1, s1, s4 -; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s3, 
s1, 24 +; GFX10-NEXT: s_and_b32 s6, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s3, s9, s3 -; GFX10-NEXT: s_lshl_b32 s6, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s3, s1 -; GFX10-NEXT: s_lshr_b32 s5, s0, 24 -; GFX10-NEXT: s_and_b32 s7, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s0, s4 -; GFX10-NEXT: s_lshl_b32 s4, s8, 8 -; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_or_b32 s6, s6, s7 +; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 24 +; GFX10-NEXT: s_or_b32 s1, s6, s1 +; GFX10-NEXT: s_lshr_b32 s2, s0, 24 +; GFX10-NEXT: s_and_b32 s4, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 8 +; GFX10-NEXT: s_or_b32 s1, s1, s3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s4, s7, s4 +; GFX10-NEXT: s_or_b32 s3, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_lshl_b32 s3, s5, 24 -; GFX10-NEXT: s_or_b32 s0, s4, s0 -; GFX10-NEXT: s_or_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s2, s2, 24 +; GFX10-NEXT: s_or_b32 s0, s3, s0 +; GFX10-NEXT: s_or_b32 s0, s0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 @@ -3046,9 +3000,9 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(<8 x i8> addrspace(4)* inreg %pt ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v0, s2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v0, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v1, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3069,102 +3023,103 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 ; GFX9-NEXT: s_movk_i32 s3, 0xff -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v8, v2, s2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s3 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX9-NEXT: v_lshlrev_b32_e64 v9, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v11 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v13 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v12 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_or3_b32 v0, v0, v12, v9 -; GFX9-NEXT: v_or3_b32 v1, v1, v14, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v7 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v14 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_or3_b32 v0, v0, v13, v10 +; GFX9-NEXT: v_or3_b32 v1, v1, v15, v11 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v9 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 2, v2 +; GFX8-NEXT: v_mov_b32_e32 v6, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: s_and_b32 s1, s2, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e64 v10, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_lshlrev_b32_e64 v11, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mov_b32_e32 v7, 8 -; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v9, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v6, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v12 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v12 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v14 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v5 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v8 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v9 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[0:1] @@ -3176,63 +3131,64 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s3 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_and_b32 s1, s2, 0xff ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_lshl_b32_e32 v4, s0, v2 -; GFX7-NEXT: v_lshl_b32_e32 v2, s3, v2 
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX7-NEXT: v_lshl_b32_e32 v5, s1, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v3 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v3 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v8, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v10, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v0 +; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v9, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v7, v7, v8 -; GFX7-NEXT: v_or_b32_e32 v8, v9, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_or_b32_e32 v0, v7, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v2, v6, v2 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, 
v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3242,7 +3198,6 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v3, 3, v2 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 @@ -3253,12 +3208,12 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: 
v_lshlrev_b32_sdwa v8, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v7 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, s3 -; GFX10-NEXT: s_and_b32 s0, s2, s3 +; GFX10-NEXT: v_lshlrev_b32_e64 v6, v3, 0xff +; GFX10-NEXT: s_and_b32 s0, s2, 0xff ; GFX10-NEXT: v_or3_b32 v0, v0, v8, v4 ; GFX10-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX10-NEXT: v_or3_b32 v1, v1, v9, v5 @@ -3276,9 +3231,9 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v5, v0, s3, v5 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v8, v1, s3, v2 +; GFX10-NEXT: v_and_or_b32 v8, 0xff, v1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3303,60 +3258,60 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX9-NEXT: s_and_b32 s2, s2, 3 ; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s3, s2 +; GFX9-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX9-NEXT: s_not_b32 s2, s2 -; GFX9-NEXT: v_mov_b32_e32 v5, 8 -; GFX9-NEXT: 
v_mov_b32_e32 v6, 16 +; GFX9-NEXT: v_mov_b32_e32 v6, 8 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v10 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_or3_b32 v0, v0, v10, v7 -; GFX9-NEXT: v_or3_b32 v1, v1, v12, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc -; GFX9-NEXT: v_and_or_b32 v2, v7, s2, v2 +; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v12 +; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_or3_b32 v0, v0, v11, v8 +; GFX9-NEXT: v_or3_b32 v1, v1, v13, v9 +; GFX9-NEXT: v_cndmask_b32_e32 v8, v0, v1, vcc +; GFX9-NEXT: v_and_or_b32 v2, v8, s2, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 0 ; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s3, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v10, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NEXT: v_and_or_b32 v1, v1, s3, v5 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v7 -; GFX9-NEXT: v_or3_b32 v0, v0, v9, v2 -; GFX9-NEXT: v_or3_b32 v1, v1, v6, v5 +; GFX9-NEXT: v_and_or_b32 v1, v1, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v8 +; GFX9-NEXT: v_or3_b32 v0, v0, v10, v2 +; GFX9-NEXT: v_or3_b32 v1, v1, v7, v5 ; GFX9-NEXT: global_store_dwordx2 v[3:4], v[0:1], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v8i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GFX8-NEXT: s_lshr_b32 s1, s2, 2 -; GFX8-NEXT: s_and_b32 s2, s2, 3 +; GFX8-NEXT: s_and_b32 s1, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v5, 8 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 +; GFX8-NEXT: s_lshl_b32 s1, s1, 3 ; GFX8-NEXT: v_mov_b32_e32 v6, 16 -; GFX8-NEXT: 
v_mov_b32_e32 v9, s2 +; GFX8-NEXT: v_mov_b32_e32 v9, s1 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 -; GFX8-NEXT: s_not_b32 s0, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 2 +; GFX8-NEXT: s_lshl_b32 s1, 0xff, s1 +; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 +; GFX8-NEXT: s_not_b32 s1, s1 ; GFX8-NEXT: v_mov_b32_e32 v7, 8 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 @@ -3377,9 +3332,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v9 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc -; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX8-NEXT: v_and_b32_e32 v5, s1, v5 ; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3405,63 +3360,64 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 -; GFX7-NEXT: s_movk_i32 s3, 0xff -; GFX7-NEXT: s_and_b32 s1, s2, 3 -; GFX7-NEXT: s_lshr_b32 s0, s2, 2 -; GFX7-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX7-NEXT: s_lshl_b32 s1, s1, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v2 -; GFX7-NEXT: s_lshl_b32 s1, s3, s1 -; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1 -; GFX7-NEXT: s_not_b32 s1, s1 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v3, 0xff +; GFX7-NEXT: s_lshr_b32 s1, s2, 2 +; GFX7-NEXT: s_and_b32 s2, s2, 3 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v3 +; GFX7-NEXT: s_lshl_b32 s2, s2, 3 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v2, s2, v2 +; GFX7-NEXT: s_lshl_b32 s2, 0xff, s2 +; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s1, 1 +; GFX7-NEXT: s_not_b32 s2, s2 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v5, s3, v0 +; GFX7-NEXT: v_bfe_u32 v7, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v6, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v7, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v8, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX7-NEXT: v_or_b32_e32 v6, v7, v8 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v7, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc -; GFX7-NEXT: v_and_b32_e32 v3, s1, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, 0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v6, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX7-NEXT: v_bfe_u32 v5, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_bfe_u32 v7, v1, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v3, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v4, s3, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v5, v0, v3 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v6, s3, v1 +; GFX7-NEXT: v_and_b32_e32 v3, v1, v3 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX7-NEXT: v_or_b32_e32 v5, v6, v7 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v0, v4, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -3470,7 +3426,6 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 @@ -3478,9 +3433,9 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; 
GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v4 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v6 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5 ; GFX10-NEXT: s_lshr_b32 s1, s2, 2 ; GFX10-NEXT: s_and_b32 s0, s2, 3 @@ -3489,7 +3444,7 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_or3_b32 v1, v1, v8, v4 ; GFX10-NEXT: s_lshl_b32 s0, s0, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s0, s3, s0 +; GFX10-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc_lo ; GFX10-NEXT: s_not_b32 s0, s0 ; GFX10-NEXT: v_and_or_b32 v2, v3, s0, v2 @@ -3505,8 +3460,8 @@ define amdgpu_ps void @insertelement_v_v8i8_v_s(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v2, v0, s3, v5 -; GFX10-NEXT: v_and_or_b32 v3, v1, s3, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 @@ -3700,48 +3655,46 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(<8 x i8> addrspace(1)* %ptr, i8 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_and_b32_e32 v4, 3, v3 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff -; GFX10-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-NEXT: v_lshrrev_b32_e32 
v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s2, v7 -; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s2, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v8, v4, v5 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v6 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v8 +; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xff ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 -; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6 +; GFX10-NEXT: v_or3_b32 v0, v0, v9, v5 ; GFX10-NEXT: v_mov_b32_e32 v3, 8 -; GFX10-NEXT: v_or3_b32 v1, v1, v11, v7 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v8 -; 
GFX10-NEXT: v_cndmask_b32_e32 v6, v0, v1, vcc_lo -; GFX10-NEXT: v_and_or_b32 v2, v6, v4, v2 +; GFX10-NEXT: v_or3_b32 v1, v1, v10, v6 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc_lo +; GFX10-NEXT: v_and_or_b32 v2, v5, v4, v2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v2, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v0, v5, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v5, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_or3_b32 v2, v2, v8, v4 -; GFX10-NEXT: v_or3_b32 v3, v3, v9, v5 +; GFX10-NEXT: v_or3_b32 v2, v2, v7, v4 +; GFX10-NEXT: v_or3_b32 v3, v3, v8, v5 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(1)* %ptr @@ -3754,47 +3707,44 
@@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-LABEL: insertelement_s_v16i8_s_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s10, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s10, s10, s11 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s11, s0 +; GFX9-NEXT: s_or_b32 s0, s10, s0 ; GFX9-NEXT: s_lshl_b32 s6, s6, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s6 -; GFX9-NEXT: s_and_b32 s6, s1, s10 -; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 -; GFX9-NEXT: s_or_b32 s6, s6, s11 +; GFX9-NEXT: s_and_b32 s6, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s10, s10, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s6, s6, s10 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s6, s1 ; GFX9-NEXT: s_lshl_b32 s6, s7, 24 -; GFX9-NEXT: s_bfe_u32 s7, s2, s12 +; GFX9-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s6 -; GFX9-NEXT: s_and_b32 s6, s2, s10 +; GFX9-NEXT: s_and_b32 s6, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s6, s2 ; GFX9-NEXT: s_lshl_b32 s6, s8, 24 -; GFX9-NEXT: s_bfe_u32 s7, s3, s12 +; 
GFX9-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s6 -; GFX9-NEXT: s_and_b32 s6, s3, s10 +; GFX9-NEXT: s_and_b32 s6, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s6, s6, s7 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s6, s3 @@ -3809,9 +3759,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_cselect_b32 s7, s3, s7 ; GFX9-NEXT: s_and_b32 s5, s5, 3 ; GFX9-NEXT: s_lshl_b32 s5, s5, 3 -; GFX9-NEXT: s_and_b32 s4, s4, s10 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: s_lshl_b32 s4, s4, s5 -; GFX9-NEXT: s_lshl_b32 s5, s10, s5 +; GFX9-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX9-NEXT: s_andn2_b32 s5, s7, s5 ; GFX9-NEXT: s_or_b32 s4, s5, s4 ; GFX9-NEXT: s_cmp_eq_u32 s6, 0 @@ -3822,41 +3772,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: s_bfe_u32 s9, s0, s12 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_and_b32 s8, s0, s10 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff ; GFX9-NEXT: s_lshl_b32 s9, s9, 8 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX9-NEXT: s_or_b32 s8, s8, s9 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s8, s1, s12 +; GFX9-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s4 -; GFX9-NEXT: s_and_b32 s4, s1, s10 +; GFX9-NEXT: s_and_b32 s4, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s8 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s4, s1 ; GFX9-NEXT: s_lshl_b32 s4, s5, 24 -; 
GFX9-NEXT: s_bfe_u32 s5, s2, s12 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s4 -; GFX9-NEXT: s_and_b32 s4, s2, s10 +; GFX9-NEXT: s_and_b32 s4, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s13 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s4, s2 ; GFX9-NEXT: s_lshl_b32 s4, s6, 24 -; GFX9-NEXT: s_bfe_u32 s5, s3, s12 +; GFX9-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s3, s10 +; GFX9-NEXT: s_and_b32 s4, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s13 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s4, s4, s5 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s4, s3 @@ -3872,47 +3822,44 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-LABEL: insertelement_s_v16i8_s_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s12, 0x80008 -; GFX8-NEXT: s_movk_i32 s10, 0xff ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s13, s0, s12 -; GFX8-NEXT: s_and_b32 s11, s0, s10 -; GFX8-NEXT: s_lshl_b32 s13, s13, 8 -; GFX8-NEXT: s_or_b32 s11, s11, s13 -; GFX8-NEXT: s_mov_b32 s13, 0x80010 +; GFX8-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_and_b32 s10, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s11, s11, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s10, s10, s11 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s11, s0 +; GFX8-NEXT: s_or_b32 s0, s10, s0 ; GFX8-NEXT: s_lshl_b32 s6, s6, 24 -; GFX8-NEXT: s_bfe_u32 s11, s1, s12 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_or_b32 s0, 
s0, s6 -; GFX8-NEXT: s_and_b32 s6, s1, s10 -; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 -; GFX8-NEXT: s_or_b32 s6, s6, s11 +; GFX8-NEXT: s_and_b32 s6, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s6, s6, s10 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s6, s1 ; GFX8-NEXT: s_lshl_b32 s6, s7, 24 -; GFX8-NEXT: s_bfe_u32 s7, s2, s12 +; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s6 -; GFX8-NEXT: s_and_b32 s6, s2, s10 +; GFX8-NEXT: s_and_b32 s6, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s6, s2 ; GFX8-NEXT: s_lshl_b32 s6, s8, 24 -; GFX8-NEXT: s_bfe_u32 s7, s3, s12 +; GFX8-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s9, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s6 -; GFX8-NEXT: s_and_b32 s6, s3, s10 +; GFX8-NEXT: s_and_b32 s6, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s7, s7, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s6, s6, s7 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s6, s3 @@ -3927,9 +3874,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_cselect_b32 s7, s3, s7 ; GFX8-NEXT: s_and_b32 s5, s5, 3 ; GFX8-NEXT: s_lshl_b32 s5, s5, 3 -; GFX8-NEXT: s_and_b32 s4, s4, s10 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s4, s4, s5 -; GFX8-NEXT: s_lshl_b32 s5, s10, s5 +; GFX8-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX8-NEXT: s_andn2_b32 s5, s7, s5 ; GFX8-NEXT: s_or_b32 s4, s5, s4 ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 @@ -3940,41 +3887,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_cselect_b32 s2, s4, s2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 
-; GFX8-NEXT: s_bfe_u32 s9, s0, s12 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s10 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s13 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s8, s1, s12 +; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s4, s1, s10 +; GFX8-NEXT: s_and_b32 s4, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s13 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s5, 24 -; GFX8-NEXT: s_bfe_u32 s5, s2, s12 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s4 -; GFX8-NEXT: s_and_b32 s4, s2, s10 +; GFX8-NEXT: s_and_b32 s4, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s13 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 24 -; GFX8-NEXT: s_bfe_u32 s5, s3, s12 +; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s4 -; GFX8-NEXT: s_and_b32 s4, s3, s10 +; GFX8-NEXT: s_and_b32 s4, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s13 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s4, s4, s5 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s4, s3 @@ -3990,45 +3937,42 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-LABEL: insertelement_s_v16i8_s_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; 
GFX7-NEXT: s_mov_b32 s12, 0x80008 -; GFX7-NEXT: s_movk_i32 s10, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s13, s0, s12 -; GFX7-NEXT: s_and_b32 s11, s0, s10 -; GFX7-NEXT: s_lshl_b32 s13, s13, 8 -; GFX7-NEXT: s_or_b32 s11, s11, s13 -; GFX7-NEXT: s_mov_b32 s13, 0x80010 +; GFX7-NEXT: s_bfe_u32 s11, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s13 +; GFX7-NEXT: s_and_b32 s10, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s11, s0 +; GFX7-NEXT: s_or_b32 s0, s10, s0 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s11, s1, s12 +; GFX7-NEXT: s_bfe_u32 s10, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s6 -; GFX7-NEXT: s_and_b32 s6, s1, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s13 -; GFX7-NEXT: s_or_b32 s6, s6, s11 +; GFX7-NEXT: s_and_b32 s6, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s6, s6, s10 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s6, s1 ; GFX7-NEXT: s_lshl_b32 s6, s7, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s6 -; GFX7-NEXT: s_and_b32 s6, s3, s10 +; GFX7-NEXT: s_and_b32 s6, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: 
s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s6, s3 @@ -4043,9 +3987,9 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_cselect_b32 s7, s3, s7 ; GFX7-NEXT: s_and_b32 s5, s5, 3 ; GFX7-NEXT: s_lshl_b32 s5, s5, 3 -; GFX7-NEXT: s_and_b32 s4, s4, s10 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: s_lshl_b32 s4, s4, s5 -; GFX7-NEXT: s_lshl_b32 s5, s10, s5 +; GFX7-NEXT: s_lshl_b32 s5, 0xff, s5 ; GFX7-NEXT: s_andn2_b32 s5, s7, s5 ; GFX7-NEXT: s_or_b32 s4, s5, s4 ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 @@ -4056,41 +4000,41 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: s_bfe_u32 s14, s5, s12 +; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s5, 24 -; GFX7-NEXT: s_and_b32 s11, s5, s10 -; GFX7-NEXT: s_lshl_b32 s14, s14, 8 -; GFX7-NEXT: s_bfe_u32 s5, s5, s13 -; GFX7-NEXT: s_or_b32 s11, s11, s14 +; GFX7-NEXT: s_and_b32 s10, s5, 0xff +; GFX7-NEXT: s_lshl_b32 s11, s11, 8 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x80010 +; GFX7-NEXT: s_or_b32 s10, s10, s11 ; GFX7-NEXT: s_lshl_b32 s5, s5, 16 -; GFX7-NEXT: s_or_b32 s5, s11, s5 +; GFX7-NEXT: s_or_b32 s5, s10, s5 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s11, s7, s12 +; GFX7-NEXT: s_bfe_u32 s10, s7, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s7, 24 ; GFX7-NEXT: s_or_b32 s4, s5, s4 -; GFX7-NEXT: s_and_b32 s5, s7, s10 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_bfe_u32 s7, s7, s13 -; GFX7-NEXT: s_or_b32 s5, s5, s11 +; GFX7-NEXT: s_and_b32 s5, s7, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s7, s7, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s10 ; GFX7-NEXT: s_lshl_b32 s7, s7, 16 ; GFX7-NEXT: s_or_b32 s5, s5, s7 ; GFX7-NEXT: s_lshl_b32 s6, s6, 24 -; GFX7-NEXT: s_bfe_u32 s7, s2, s12 +; GFX7-NEXT: s_bfe_u32 s7, s2, 
0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s2, 24 ; GFX7-NEXT: s_or_b32 s5, s5, s6 -; GFX7-NEXT: s_and_b32 s6, s2, s10 +; GFX7-NEXT: s_and_b32 s6, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s13 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s6, s6, s7 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s6, s2 ; GFX7-NEXT: s_lshl_b32 s6, s8, 24 -; GFX7-NEXT: s_bfe_u32 s7, s3, s12 +; GFX7-NEXT: s_bfe_u32 s7, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s9, s3, 24 ; GFX7-NEXT: s_or_b32 s6, s2, s6 -; GFX7-NEXT: s_and_b32 s2, s3, s10 +; GFX7-NEXT: s_and_b32 s2, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s7, s7, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s13 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s2, s2, s7 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s2, s2, s3 @@ -4109,112 +4053,109 @@ define amdgpu_ps void @insertelement_s_v16i8_s_s(<16 x i8> addrspace(4)* inreg % ; GFX10-LABEL: insertelement_s_v16i8_s_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s6, 0xff -; GFX10-NEXT: s_mov_b32 s8, 0x80010 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s14, s0, s7 -; GFX10-NEXT: s_lshr_b32 s9, s0, 24 -; GFX10-NEXT: s_and_b32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_bfe_u32 s16, s1, s7 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s10, s1, 24 -; GFX10-NEXT: s_and_b32 s15, s1, s6 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s13, s13, s14 -; GFX10-NEXT: s_bfe_u32 s18, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 +; GFX10-NEXT: s_bfe_u32 s11, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s13, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s7, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s0, 0xff +; GFX10-NEXT: s_and_b32 s12, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, 
s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 +; GFX10-NEXT: s_lshr_b32 s6, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s14, s15, s16 -; GFX10-NEXT: s_or_b32 s0, s13, s0 -; GFX10-NEXT: s_lshr_b32 s11, s2, 24 -; GFX10-NEXT: s_and_b32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 24 -; GFX10-NEXT: s_or_b32 s1, s14, s1 -; GFX10-NEXT: s_or_b32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s9, s18, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s9, s17, s9 +; GFX10-NEXT: s_or_b32 s10, s10, s11 +; GFX10-NEXT: s_or_b32 s11, s12, s13 +; GFX10-NEXT: s_bfe_u32 s15, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s7, s7, 24 +; GFX10-NEXT: s_or_b32 s1, s11, s1 +; GFX10-NEXT: s_lshr_b32 s8, s2, 24 +; GFX10-NEXT: s_and_b32 s14, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_lshl_b32 s15, s15, 8 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s7 +; GFX10-NEXT: s_bfe_u32 s7, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s9, s3, 24 +; GFX10-NEXT: s_or_b32 s12, s14, s15 +; GFX10-NEXT: s_or_b32 s0, s0, s6 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s1, s1, s10 -; GFX10-NEXT: s_bfe_u32 s10, s3, s7 -; GFX10-NEXT: s_lshr_b32 s12, s3, 24 -; GFX10-NEXT: s_or_b32 s2, s9, s2 -; GFX10-NEXT: s_lshl_b32 s9, s11, 24 -; GFX10-NEXT: s_and_b32 s11, s3, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_lshl_b32 s6, s8, 24 +; GFX10-NEXT: s_and_b32 s8, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s12, s2 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s2, s9 -; GFX10-NEXT: s_or_b32 s3, s10, s3 -; GFX10-NEXT: s_lshl_b32 s9, s12, 24 -; GFX10-NEXT: s_lshr_b32 s10, s5, 2 -; 
GFX10-NEXT: s_or_b32 s3, s3, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 -; GFX10-NEXT: s_cselect_b32 s9, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 -; GFX10-NEXT: s_cselect_b32 s9, s2, s9 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 -; GFX10-NEXT: s_cselect_b32 s9, s3, s9 +; GFX10-NEXT: s_or_b32 s2, s2, s6 +; GFX10-NEXT: s_or_b32 s3, s7, s3 +; GFX10-NEXT: s_lshl_b32 s6, s9, 24 +; GFX10-NEXT: s_lshr_b32 s7, s5, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: s_cselect_b32 s6, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s6, s2, s6 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s5, s5, 3 -; GFX10-NEXT: s_and_b32 s4, s4, s6 +; GFX10-NEXT: s_and_b32 s4, s4, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 3 -; GFX10-NEXT: s_lshl_b32 s11, s6, s5 +; GFX10-NEXT: s_lshl_b32 s8, 0xff, s5 ; GFX10-NEXT: s_lshl_b32 s4, s4, s5 -; GFX10-NEXT: s_andn2_b32 s5, s9, s11 +; GFX10-NEXT: s_andn2_b32 s5, s6, s8 ; GFX10-NEXT: s_or_b32 s4, s5, s4 -; GFX10-NEXT: s_cmp_eq_u32 s10, 0 +; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 -; GFX10-NEXT: s_cmp_eq_u32 s10, 1 +; GFX10-NEXT: s_cmp_eq_u32 s7, 1 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 -; GFX10-NEXT: s_cmp_eq_u32 s10, 2 +; GFX10-NEXT: s_cmp_eq_u32 s7, 2 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 -; GFX10-NEXT: s_cmp_eq_u32 s10, 3 +; GFX10-NEXT: s_cmp_eq_u32 s7, 3 ; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: s_bfe_u32 s10, s0, s7 +; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s8 -; GFX10-NEXT: s_or_b32 s10, s11, s10 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s8, s7 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_or_b32 s0, s10, s0 -; GFX10-NEXT: 
s_bfe_u32 s10, s1, s7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_and_b32 s12, s1, s6 -; GFX10-NEXT: s_lshl_b32 s10, s10, 8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s8 -; GFX10-NEXT: s_or_b32 s10, s12, s10 +; GFX10-NEXT: s_and_b32 s9, s1, 0xff +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_or_b32 s7, s9, s7 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s0, s0, s4 ; GFX10-NEXT: s_lshl_b32 s4, s5, 24 -; GFX10-NEXT: s_bfe_u32 s5, s2, s7 -; GFX10-NEXT: s_lshr_b32 s9, s2, 24 -; GFX10-NEXT: s_or_b32 s1, s10, s1 -; GFX10-NEXT: s_and_b32 s10, s2, s6 +; GFX10-NEXT: s_bfe_u32 s5, s2, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_or_b32 s1, s7, s1 +; GFX10-NEXT: s_and_b32 s7, s2, 0xff ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s5, s10, s5 +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_or_b32 s5, s7, s5 ; GFX10-NEXT: s_lshl_b32 s2, s2, 16 ; GFX10-NEXT: s_or_b32 s1, s1, s4 -; GFX10-NEXT: s_bfe_u32 s4, s3, s7 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 +; GFX10-NEXT: s_bfe_u32 s4, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 ; GFX10-NEXT: s_or_b32 s2, s5, s2 -; GFX10-NEXT: s_and_b32 s5, s3, s6 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX10-NEXT: s_or_b32 s4, s5, s4 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_lshl_b32 s5, s9, 24 +; GFX10-NEXT: s_lshl_b32 s5, s6, 24 ; GFX10-NEXT: s_or_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s11, 24 +; GFX10-NEXT: s_lshl_b32 s4, s8, 24 ; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 @@ -4235,19 +4176,18 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off ; GFX9-NEXT: s_mov_b32 
s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v6, 8 -; GFX9-NEXT: v_mov_b32_e32 v7, 16 -; GFX9-NEXT: s_lshr_b32 s4, s3, 2 +; GFX9-NEXT: s_movk_i32 s4, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_mov_b32_e32 v6, 0xff +; GFX9-NEXT: v_mov_b32_e32 v8, 16 +; GFX9-NEXT: s_lshr_b32 s5, s3, 2 ; GFX9-NEXT: s_and_b32 s3, s3, 3 -; GFX9-NEXT: s_and_b32 s2, s2, s6 +; GFX9-NEXT: s_and_b32 s2, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s3, s3, 3 -; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 +; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX9-NEXT: s_lshl_b32 s2, s2, s3 -; GFX9-NEXT: s_lshl_b32 s3, s6, s3 -; GFX9-NEXT: s_not_b32 s5, s3 -; GFX9-NEXT: v_mov_b32_e32 v8, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: s_lshl_b32 s3, 0xff, s3 +; GFX9-NEXT: s_not_b32 s6, s3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4258,57 +4198,59 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v13 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v15 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v13 +; GFX9-NEXT: v_and_or_b32 v1, v1, s4, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v6, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v17 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 ; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v3, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v13, v3, v6, v19 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 ; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, 2 +; GFX9-NEXT: v_mov_b32_e32 v15, s2 ; GFX9-NEXT: v_or3_b32 v3, v13, v3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, 3 ; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v3, s[2:3] -; GFX9-NEXT: v_and_or_b32 v8, v9, s5, v8 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v0 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v6, v2 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v0, v0, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX9-NEXT: v_and_or_b32 v1, v1, s6, v14 +; GFX9-NEXT: v_and_or_b32 v9, v9, s6, v15 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s5, 0 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v3 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; 
GFX9-NEXT: v_lshlrev_b32_sdwa v8, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v0, v0, v6, v13 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_and_or_b32 v2, v2, s6, v16 +; GFX9-NEXT: v_and_or_b32 v1, v1, v6, v15 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v6 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v0, v13, v8 -; GFX9-NEXT: v_or3_b32 v1, v1, v15, v9 -; GFX9-NEXT: v_or3_b32 v2, v2, v17, v10 -; GFX9-NEXT: v_or3_b32 v3, v3, v7, v6 +; GFX9-NEXT: v_and_or_b32 v2, v2, v6, v17 +; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 +; GFX9-NEXT: v_and_or_b32 v3, v3, v6, v7 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v12 +; GFX9-NEXT: v_or3_b32 v0, v0, v14, v9 +; GFX9-NEXT: v_or3_b32 v1, v1, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v2, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v3, v8, v6 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; @@ -4319,14 +4261,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX8-NEXT: v_mov_b32_e32 v7, 16 ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v9, 16 -; GFX8-NEXT: s_and_b32 s1, s3, 3 -; GFX8-NEXT: s_movk_i32 s0, 0xff +; GFX8-NEXT: s_and_b32 s0, s3, 3 ; GFX8-NEXT: s_lshr_b32 s4, s3, 2 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 -; GFX8-NEXT: s_and_b32 s2, s2, s0 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 +; GFX8-NEXT: s_lshl_b32 s5, s1, s0 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX8-NEXT: s_lshl_b32 s5, s2, s1 ; GFX8-NEXT: s_not_b32 s6, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 @@ -4408,108 +4349,109 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], 
s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: s_and_b32 s0, s3, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff +; GFX7-NEXT: s_and_b32 s1, s3, 3 ; GFX7-NEXT: s_lshr_b32 s4, s3, 2 -; GFX7-NEXT: s_and_b32 s1, s2, s6 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: s_lshl_b32 s5, s1, s0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: s_and_b32 s2, s2, 0xff +; GFX7-NEXT: s_lshl_b32 s1, s1, 3 +; GFX7-NEXT: s_lshl_b32 s5, s2, s1 +; GFX7-NEXT: s_lshl_b32 s1, 0xff, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s7, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: s_not_b32 s6, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v9, s0, v0 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_and_b32_e32 v11, s0, v1 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v15, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v16, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: 
v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v14, s6, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v15, v3, v4 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v11, v14, v15 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_or_b32_e32 v3, v11, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc -; GFX7-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v3, s[2:3] -; GFX7-NEXT: v_and_b32_e32 v4, s7, v4 -; GFX7-NEXT: v_or_b32_e32 v4, s5, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v3, v12, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, 
v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[2:3] +; GFX7-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX7-NEXT: v_or_b32_e32 v5, s5, v5 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 -; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v2, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v2 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_bfe_u32 v14, v2, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v13, v2, v4 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v11, 
v13, v14 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s6, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v6 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -4520,11 +4462,10 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v4, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s4, 0xff ; GFX10-NEXT: v_mov_b32_e32 v5, 16 -; GFX10-NEXT: s_lshr_b32 s5, s3, 2 -; GFX10-NEXT: s_and_b32 s2, s2, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 +; GFX10-NEXT: s_lshr_b32 s4, s3, 2 +; GFX10-NEXT: s_and_b32 s2, s2, 0xff +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0 ; GFX10-NEXT: 
v_lshrrev_b32_e32 v7, 24, v1 @@ -4534,34 +4475,34 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v0, v11, v6 ; GFX10-NEXT: v_or3_b32 v1, v1, v13, v7 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v16 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v9 ; GFX10-NEXT: v_or3_b32 v2, v2, v15, v8 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 ; GFX10-NEXT: s_and_b32 s1, s3, 3 ; GFX10-NEXT: v_or3_b32 v3, v3, v10, v6 ; GFX10-NEXT: s_lshl_b32 s3, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v7, v2, s0 -; GFX10-NEXT: s_lshl_b32 s6, s4, s3 +; GFX10-NEXT: s_lshl_b32 s5, 0xff, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; 
GFX10-NEXT: s_not_b32 s3, s6 +; GFX10-NEXT: s_not_b32 s3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v3, s1 ; GFX10-NEXT: v_and_or_b32 v6, v6, s3, s2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s5, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 @@ -4578,13 +4519,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v10 +; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v1, v1, s4, v12 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v1, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v2, v2, s4, v14 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v3, v3, s4, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -4604,47 +4545,46 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-LABEL: insertelement_s_v16i8_v_s: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s11, 0xff -; GFX9-NEXT: v_and_b32_e32 v0, s11, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX9-NEXT: s_mov_b32 s5, 8 +; GFX9-NEXT: s_mov_b32 s6, 16 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s12, s0, s11 -; 
GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s12, s12, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 -; GFX9-NEXT: s_or_b32 s0, s12, s0 +; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s7, s7, 24 -; GFX9-NEXT: s_bfe_u32 s12, s1, s13 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s8, s1, 24 ; GFX9-NEXT: s_or_b32 s0, s0, s7 -; GFX9-NEXT: s_and_b32 s7, s1, s11 -; GFX9-NEXT: s_lshl_b32 s12, s12, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 -; GFX9-NEXT: s_or_b32 s7, s7, s12 +; GFX9-NEXT: s_and_b32 s7, s1, 0xff +; GFX9-NEXT: s_lshl_b32 s11, s11, 8 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX9-NEXT: s_or_b32 s7, s7, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s1, s7, s1 ; GFX9-NEXT: s_lshl_b32 s7, s8, 24 -; GFX9-NEXT: s_bfe_u32 s8, s2, s13 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s2, 24 ; GFX9-NEXT: s_or_b32 s1, s1, s7 -; GFX9-NEXT: s_and_b32 s7, s2, s11 +; GFX9-NEXT: s_and_b32 s7, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s2, s2, s14 +; GFX9-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s2, s2, 16 ; GFX9-NEXT: s_or_b32 s2, s7, s2 ; GFX9-NEXT: s_lshl_b32 s7, s9, 24 -; GFX9-NEXT: s_bfe_u32 s8, s3, s13 +; GFX9-NEXT: s_bfe_u32 s8, s3, 0x80008 ; GFX9-NEXT: s_lshr_b32 s10, s3, 24 ; GFX9-NEXT: s_or_b32 s2, s2, s7 -; GFX9-NEXT: s_and_b32 s7, s3, s11 +; GFX9-NEXT: s_and_b32 s7, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s8, s8, 8 -; GFX9-NEXT: s_bfe_u32 s3, s3, s14 +; GFX9-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX9-NEXT: s_or_b32 s7, s7, s8 ; GFX9-NEXT: s_lshl_b32 s3, s3, 16 ; GFX9-NEXT: s_or_b32 s3, s7, s3 @@ -4659,48 +4599,47 @@ define amdgpu_ps void 
@insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: s_cselect_b32 s8, s3, s8 ; GFX9-NEXT: s_and_b32 s4, s4, 3 ; GFX9-NEXT: s_lshl_b32 s4, s4, 3 -; GFX9-NEXT: s_lshl_b32 s9, s11, s4 +; GFX9-NEXT: s_lshl_b32 s9, 0xff, s4 ; GFX9-NEXT: s_andn2_b32 s8, s8, s9 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: v_lshl_or_b32 v4, v0, s4, v1 +; GFX9-NEXT: v_lshl_or_b32 v5, v0, s4, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s7, 3 -; GFX9-NEXT: s_mov_b32 s6, 16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v8, v0, s11, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v9, v0, v4, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s11, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v5, v1, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s11, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v5, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v5, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v4, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v4, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -4710,47 +4649,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-LABEL: insertelement_s_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s11, 0x80008 -; GFX8-NEXT: s_movk_i32 s9, 0xff ; GFX8-NEXT: v_mov_b32_e32 v8, 8 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s12, s0, s11 -; GFX8-NEXT: s_and_b32 s10, s0, s9 -; GFX8-NEXT: s_lshl_b32 s12, s12, 8 -; GFX8-NEXT: s_or_b32 s10, s10, s12 -; GFX8-NEXT: s_mov_b32 s12, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 -; GFX8-NEXT: s_or_b32 s0, s10, s0 +; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 -; GFX8-NEXT: s_bfe_u32 s10, s1, s11 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s5 -; GFX8-NEXT: s_and_b32 s5, s1, s9 -; GFX8-NEXT: s_lshl_b32 s10, s10, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s12 -; GFX8-NEXT: s_or_b32 s5, s5, s10 +; GFX8-NEXT: s_and_b32 s5, s1, 0xff +; GFX8-NEXT: s_lshl_b32 s9, s9, 8 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX8-NEXT: s_or_b32 s5, s5, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s1, s5, s1 ; GFX8-NEXT: s_lshl_b32 s5, s6, 24 -; GFX8-NEXT: s_bfe_u32 s6, s2, s11 +; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_or_b32 s1, s1, s5 -; GFX8-NEXT: s_and_b32 s5, s2, s9 +; GFX8-NEXT: s_and_b32 s5, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s2, s2, s12 +; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80010 ; 
GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 ; GFX8-NEXT: s_or_b32 s2, s5, s2 ; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_bfe_u32 s6, s3, s11 +; GFX8-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX8-NEXT: s_lshr_b32 s8, s3, 24 ; GFX8-NEXT: s_or_b32 s2, s2, s5 -; GFX8-NEXT: s_and_b32 s5, s3, s9 +; GFX8-NEXT: s_and_b32 s5, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: s_bfe_u32 s3, s3, s12 +; GFX8-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX8-NEXT: s_or_b32 s5, s5, s6 ; GFX8-NEXT: s_lshl_b32 s3, s3, 16 ; GFX8-NEXT: s_or_b32 s3, s5, s3 @@ -4766,7 +4702,7 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX8-NEXT: s_and_b32 s4, s4, 3 ; GFX8-NEXT: s_lshl_b32 s4, s4, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 -; GFX8-NEXT: s_lshl_b32 s4, s9, s4 +; GFX8-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX8-NEXT: s_andn2_b32 s4, s6, s4 ; GFX8-NEXT: v_or_b32_e32 v4, s4, v0 @@ -4820,46 +4756,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-LABEL: insertelement_s_v16i8_v_s: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s11, 0x80008 -; GFX7-NEXT: s_movk_i32 s9, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, s9, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s12, s0, s11 -; GFX7-NEXT: s_and_b32 s10, s0, s9 -; GFX7-NEXT: s_lshl_b32 s12, s12, 8 -; GFX7-NEXT: s_or_b32 s10, s10, s12 -; GFX7-NEXT: s_mov_b32 s12, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s12 +; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 -; GFX7-NEXT: s_or_b32 s0, s10, s0 +; GFX7-NEXT: s_or_b32 s0, 
s9, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 -; GFX7-NEXT: s_bfe_u32 s10, s1, s11 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 ; GFX7-NEXT: s_or_b32 s0, s0, s5 -; GFX7-NEXT: s_and_b32 s5, s1, s9 -; GFX7-NEXT: s_lshl_b32 s10, s10, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s12 -; GFX7-NEXT: s_or_b32 s5, s5, s10 +; GFX7-NEXT: s_and_b32 s5, s1, 0xff +; GFX7-NEXT: s_lshl_b32 s9, s9, 8 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX7-NEXT: s_or_b32 s5, s5, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s1, s5, s1 ; GFX7-NEXT: s_lshl_b32 s5, s6, 24 -; GFX7-NEXT: s_bfe_u32 s6, s2, s11 +; GFX7-NEXT: s_bfe_u32 s6, s2, 0x80008 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_or_b32 s1, s1, s5 -; GFX7-NEXT: s_and_b32 s5, s2, s9 +; GFX7-NEXT: s_and_b32 s5, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s2, s2, s12 +; GFX7-NEXT: s_bfe_u32 s2, s2, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s2, s2, 16 ; GFX7-NEXT: s_or_b32 s2, s5, s2 ; GFX7-NEXT: s_lshl_b32 s5, s7, 24 -; GFX7-NEXT: s_bfe_u32 s6, s3, s11 +; GFX7-NEXT: s_bfe_u32 s6, s3, 0x80008 ; GFX7-NEXT: s_lshr_b32 s8, s3, 24 ; GFX7-NEXT: s_or_b32 s2, s2, s5 -; GFX7-NEXT: s_and_b32 s5, s3, s9 +; GFX7-NEXT: s_and_b32 s5, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s6, s6, 8 -; GFX7-NEXT: s_bfe_u32 s3, s3, s12 +; GFX7-NEXT: s_bfe_u32 s3, s3, 0x80010 ; GFX7-NEXT: s_or_b32 s5, s5, s6 ; GFX7-NEXT: s_lshl_b32 s3, s3, 16 ; GFX7-NEXT: s_or_b32 s3, s5, s3 @@ -4875,60 +4809,60 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: s_and_b32 s4, s4, 3 ; GFX7-NEXT: s_lshl_b32 s4, s4, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s4, v0 -; GFX7-NEXT: s_lshl_b32 s4, s9, s4 +; GFX7-NEXT: s_lshl_b32 s4, 0xff, s4 ; GFX7-NEXT: s_andn2_b32 s4, s6, s4 -; GFX7-NEXT: v_or_b32_e32 v4, s4, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s4, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s9, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v1, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v2 -; 
GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_or_b32_e32 v1, v5, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, v2, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v5, v5, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v7 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s9, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -4939,81 +4873,78 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-LABEL: insertelement_s_v16i8_v_s: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 -; GFX10-NEXT: v_and_b32_e32 v0, s5, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; 
GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s9, s0, 0xff +; GFX10-NEXT: s_and_b32 s11, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s10, s10, 8 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_bfe_u32 s6, s3, s6 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_lshr_b32 s11, s3, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s0, s0, s8 -; GFX10-NEXT: s_lshl_b32 s8, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_or_b32 s1, s1, s9 -; GFX10-NEXT: s_and_b32 s9, s3, s5 -; GFX10-NEXT: s_lshl_b32 s6, s6, 8 -; GFX10-NEXT: s_bfe_u32 s3, s3, s7 -; GFX10-NEXT: s_or_b32 s8, s16, s8 +; GFX10-NEXT: s_or_b32 s9, s9, s10 +; GFX10-NEXT: s_or_b32 s10, s11, s12 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 +; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s10, s1 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s9, s0 +; GFX10-NEXT: s_or_b32 s1, s1, s6 +; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008 +; GFX10-NEXT: s_lshr_b32 s8, s3, 24 +; GFX10-NEXT: s_or_b32 s11, s13, s14 +; GFX10-NEXT: s_or_b32 s0, s0, s5 ; GFX10-NEXT: 
s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s6, s9, s6 +; GFX10-NEXT: s_lshl_b32 s5, s7, 24 +; GFX10-NEXT: s_and_b32 s7, s3, 0xff +; GFX10-NEXT: s_lshl_b32 s6, s6, 8 +; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s2, s11, s2 +; GFX10-NEXT: s_or_b32 s6, s7, s6 ; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: s_or_b32 s2, s8, s2 -; GFX10-NEXT: s_lshl_b32 s8, s10, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_or_b32 s3, s6, s3 -; GFX10-NEXT: s_lshl_b32 s6, s11, 24 -; GFX10-NEXT: s_lshr_b32 s7, s4, 2 -; GFX10-NEXT: s_or_b32 s2, s2, s8 -; GFX10-NEXT: s_or_b32 s3, s3, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 1 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 0 -; GFX10-NEXT: s_cselect_b32 s6, s1, s0 -; GFX10-NEXT: s_cmp_eq_u32 s7, 2 -; GFX10-NEXT: s_cselect_b32 s6, s2, s6 -; GFX10-NEXT: s_cmp_eq_u32 s7, 3 -; GFX10-NEXT: s_cselect_b32 s6, s3, s6 +; GFX10-NEXT: s_lshl_b32 s5, s8, 24 +; GFX10-NEXT: s_lshr_b32 s6, s4, 2 +; GFX10-NEXT: s_or_b32 s3, s3, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 0 +; GFX10-NEXT: s_cselect_b32 s5, s1, s0 +; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: s_cselect_b32 s5, s2, s5 +; GFX10-NEXT: s_cmp_eq_u32 s6, 3 +; GFX10-NEXT: s_cselect_b32 s5, s3, s5 ; GFX10-NEXT: s_and_b32 s4, s4, 3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 -; GFX10-NEXT: s_lshl_b32 s8, s5, s4 -; GFX10-NEXT: s_andn2_b32 s6, s6, s8 -; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 +; GFX10-NEXT: s_lshl_b32 s7, 0xff, s4 +; GFX10-NEXT: s_andn2_b32 s5, s5, s7 +; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s5 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 1 ; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 2 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s6, 3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 @@ -5021,15 +4952,15 @@ define amdgpu_ps void @insertelement_s_v16i8_v_s(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5050,60 +4981,58 @@ define amdgpu_ps void 
@insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-LABEL: insertelement_s_v16i8_s_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s13, 0x80008 -; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_movk_i32 s12, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s14, s0, s13 -; GFX9-NEXT: s_and_b32 s8, s0, s12 -; GFX9-NEXT: s_lshl_b32 s14, s14, 8 -; GFX9-NEXT: s_or_b32 s8, s8, s14 -; GFX9-NEXT: s_mov_b32 s14, 0x80010 +; GFX9-NEXT: s_bfe_u32 s13, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s14 +; GFX9-NEXT: s_and_b32 s8, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s13, s13, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s8, s8, s13 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16 ; GFX9-NEXT: s_or_b32 s0, s8, s0 ; GFX9-NEXT: s_lshl_b32 s5, s5, 24 ; GFX9-NEXT: s_or_b32 s8, s0, s5 -; GFX9-NEXT: s_bfe_u32 s5, s1, s13 +; GFX9-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_and_b32 s0, s1, s12 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s5, s5, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s14 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s5 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s9, 24 ; GFX9-NEXT: s_or_b32 s9, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 -; GFX9-NEXT: s_and_b32 s0, s2, s12 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s14 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s10, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s10, 24 ; GFX9-NEXT: s_or_b32 s10, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 -; GFX9-NEXT: s_and_b32 s0, 
s3, s12 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s14 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s11, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s11, 24 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v2, s9 -; GFX9-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX9-NEXT: s_or_b32 s11, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX9-NEXT: s_and_b32 s4, s4, s12 +; GFX9-NEXT: s_and_b32 s4, s4, 0xff ; GFX9-NEXT: v_mov_b32_e32 v5, s11 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5111,43 +5040,44 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: v_lshlrev_b32_e64 v0, v0, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v0, -1, v0 -; GFX9-NEXT: v_and_or_b32 v5, v1, v0, v2 +; GFX9-NEXT: v_and_or_b32 v6, v1, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s6, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s7, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s12, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: 
v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s12, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s12, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s12, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 
24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5157,60 +5087,58 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX8-LABEL: insertelement_s_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s13, 0x80008 -; GFX8-NEXT: s_movk_i32 s12, 0xff -; GFX8-NEXT: s_mov_b32 s14, 0x80010 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX8-NEXT: s_movk_i32 s12, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s9, s0, s13 +; GFX8-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s0, 24 -; GFX8-NEXT: s_and_b32 s8, s0, s12 +; GFX8-NEXT: s_and_b32 s8, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s0, s0, s14 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX8-NEXT: s_or_b32 s8, s8, s9 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s8, s0 ; GFX8-NEXT: s_lshl_b32 s5, s5, 24 ; GFX8-NEXT: s_or_b32 s8, s0, s5 -; GFX8-NEXT: s_bfe_u32 s5, s1, s13 +; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s6, s1, 24 -; GFX8-NEXT: s_and_b32 s0, s1, s12 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s14 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s5 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: 
s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s9, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s13 -; GFX8-NEXT: s_and_b32 s0, s2, s12 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s14 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: s_or_b32 s10, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s13 -; GFX8-NEXT: s_and_b32 s0, s3, s12 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s14 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s11, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s11, 24 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; GFX8-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX8-NEXT: s_or_b32 s11, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s10 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX8-NEXT: s_and_b32 s4, s4, s12 +; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: v_mov_b32_e32 v5, s11 ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 @@ -5269,117 +5197,115 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX7-LABEL: insertelement_s_v16i8_s_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s13, 0x80008 -; GFX7-NEXT: s_movk_i32 s12, 0xff -; GFX7-NEXT: s_mov_b32 s14, 0x80010 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 3, v0 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s9, s0, s13 +; GFX7-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s0, 24 -; GFX7-NEXT: s_and_b32 s8, s0, s12 +; GFX7-NEXT: s_and_b32 s8, s0, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s0, s0, s14 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 ; GFX7-NEXT: s_or_b32 s8, s8, s9 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s8, s0 ; GFX7-NEXT: s_lshl_b32 s5, s5, 24 ; GFX7-NEXT: s_or_b32 s8, s0, s5 -; GFX7-NEXT: s_bfe_u32 s5, s1, s13 +; GFX7-NEXT: s_bfe_u32 s5, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s6, s1, 24 -; GFX7-NEXT: s_and_b32 s0, s1, s12 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s5, s5, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s14 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s5 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s9, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s13 -; GFX7-NEXT: s_and_b32 s0, s2, s12 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s14 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: s_or_b32 s10, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s13 -; GFX7-NEXT: s_and_b32 s0, s3, s12 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s14 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s11, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s11, 24 ; GFX7-NEXT: v_mov_b32_e32 v1, s8 ; GFX7-NEXT: v_mov_b32_e32 v2, s9 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 -; 
GFX7-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX7-NEXT: s_or_b32 s11, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s10 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX7-NEXT: s_and_b32 s4, s4, s12 +; GFX7-NEXT: s_and_b32 s4, s4, 0xff ; GFX7-NEXT: v_mov_b32_e32 v5, s11 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v4 ; GFX7-NEXT: v_lshl_b32_e32 v2, s4, v0 -; GFX7-NEXT: v_lshl_b32_e32 v0, s12, v0 +; GFX7-NEXT: v_lshl_b32_e32 v0, 0xff, v0 ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v5, v0, v2 +; GFX7-NEXT: v_or_b32_e32 v6, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v0, s8 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_mov_b32_e32 v1, s9 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s12, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, s10 ; GFX7-NEXT: v_mov_b32_e32 v3, s11 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, 
s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s12, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5390,54 +5316,51 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-LABEL: 
insertelement_s_v16i8_s_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s6, 0x80008 -; GFX10-NEXT: s_movk_i32 s5, 0xff -; GFX10-NEXT: s_mov_b32 s7, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 -; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s5 +; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s13, s0, s6 -; GFX10-NEXT: s_lshr_b32 s8, s0, 24 -; GFX10-NEXT: s_and_b32 s12, s0, s5 -; GFX10-NEXT: s_bfe_u32 s0, s0, s7 -; GFX10-NEXT: s_lshl_b32 s13, s13, 8 -; GFX10-NEXT: s_bfe_u32 s15, s1, s6 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 +; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008 +; GFX10-NEXT: s_lshr_b32 s5, s0, 24 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 +; GFX10-NEXT: s_lshr_b32 s6, s1, 24 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010 +; GFX10-NEXT: s_lshl_b32 s12, s12, 8 +; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s12, s12, s13 -; GFX10-NEXT: s_lshr_b32 s9, s1, 24 -; GFX10-NEXT: s_and_b32 s14, s1, s5 -; GFX10-NEXT: s_bfe_u32 s1, s1, s7 -; GFX10-NEXT: s_lshl_b32 s15, s15, 8 -; GFX10-NEXT: s_bfe_u32 s17, s2, s6 -; GFX10-NEXT: s_lshl_b32 s8, s8, 24 -; GFX10-NEXT: s_or_b32 s0, s12, s0 +; GFX10-NEXT: s_or_b32 s8, s8, s9 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s13, s14, s15 -; GFX10-NEXT: s_lshr_b32 s10, s2, 24 -; GFX10-NEXT: s_and_b32 s16, s2, s5 -; GFX10-NEXT: s_or_b32 s8, s0, s8 -; GFX10-NEXT: s_lshl_b32 s0, s17, 8 -; GFX10-NEXT: s_bfe_u32 s2, s2, s7 -; GFX10-NEXT: s_lshl_b32 s9, s9, 24 -; GFX10-NEXT: s_or_b32 s1, s13, s1 -; GFX10-NEXT: s_or_b32 s0, s16, s0 -; 
GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_or_b32 s9, s1, s9 -; GFX10-NEXT: s_or_b32 s0, s0, s2 -; GFX10-NEXT: s_lshl_b32 s1, s10, 24 -; GFX10-NEXT: s_bfe_u32 s2, s3, s6 +; GFX10-NEXT: s_or_b32 s9, s10, s12 +; GFX10-NEXT: s_lshr_b32 s7, s2, 24 +; GFX10-NEXT: s_and_b32 s13, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s5, s5, 24 +; GFX10-NEXT: s_lshl_b32 s14, s14, 8 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s10, s13, s14 +; GFX10-NEXT: s_or_b32 s8, s0, s5 +; GFX10-NEXT: s_lshl_b32 s0, s2, 16 +; GFX10-NEXT: s_or_b32 s9, s1, s6 +; GFX10-NEXT: s_or_b32 s0, s10, s0 +; GFX10-NEXT: s_lshl_b32 s1, s7, 24 +; GFX10-NEXT: s_bfe_u32 s2, s3, 0x80008 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: s_and_b32 s6, s3, s5 +; GFX10-NEXT: s_and_b32 s5, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s2, s2, 8 ; GFX10-NEXT: s_or_b32 s10, s0, s1 -; GFX10-NEXT: s_bfe_u32 s1, s3, s7 -; GFX10-NEXT: s_or_b32 s0, s6, s2 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_or_b32 s0, s5, s2 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 @@ -5448,7 +5371,7 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: s_or_b32 s11, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: s_and_b32 s2, s4, s5 +; GFX10-NEXT: s_and_b32 s2, s4, 0xff ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -5470,16 +5393,16 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s5, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v2, s5, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s5, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5500,54 +5423,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-LABEL: insertelement_s_v16i8_v_v: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX9-NEXT: s_mov_b32 s12, 0x80008 -; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX9-NEXT: s_movk_i32 s10, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s13, s0, s12 -; GFX9-NEXT: s_and_b32 s11, s0, s10 -; GFX9-NEXT: s_lshl_b32 s13, s13, 8 -; GFX9-NEXT: s_or_b32 s11, s11, s13 -; GFX9-NEXT: s_mov_b32 s13, 0x80010 +; GFX9-NEXT: s_bfe_u32 s12, s0, 0x80008 ; GFX9-NEXT: s_lshr_b32 s4, s0, 24 -; GFX9-NEXT: s_bfe_u32 s0, s0, s13 +; GFX9-NEXT: s_and_b32 s11, s0, 0xff +; GFX9-NEXT: s_lshl_b32 s12, s12, 8 +; GFX9-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX9-NEXT: s_or_b32 s11, s11, s12 ; GFX9-NEXT: s_lshl_b32 s0, 
s0, 16 ; GFX9-NEXT: s_or_b32 s0, s11, s0 ; GFX9-NEXT: s_lshl_b32 s4, s4, 24 -; GFX9-NEXT: s_bfe_u32 s11, s1, s12 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x80008 ; GFX9-NEXT: s_lshr_b32 s5, s1, 24 ; GFX9-NEXT: s_or_b32 s4, s0, s4 -; GFX9-NEXT: s_and_b32 s0, s1, s10 +; GFX9-NEXT: s_and_b32 s0, s1, 0xff ; GFX9-NEXT: s_lshl_b32 s11, s11, 8 -; GFX9-NEXT: s_bfe_u32 s1, s1, s13 +; GFX9-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX9-NEXT: s_or_b32 s0, s0, s11 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s5, 24 ; GFX9-NEXT: s_or_b32 s5, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s12 -; GFX9-NEXT: s_and_b32 s0, s2, s10 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s2, s13 +; GFX9-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX9-NEXT: s_lshr_b32 s6, s2, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s6, 24 ; GFX9-NEXT: s_or_b32 s6, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s12 -; GFX9-NEXT: s_and_b32 s0, s3, s10 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX9-NEXT: s_and_b32 s0, s3, 0xff ; GFX9-NEXT: s_lshl_b32 s1, s1, 8 ; GFX9-NEXT: s_or_b32 s0, s0, s1 -; GFX9-NEXT: s_bfe_u32 s1, s3, s13 +; GFX9-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX9-NEXT: s_lshr_b32 s7, s3, 24 ; GFX9-NEXT: s_lshl_b32 s1, s1, 16 ; GFX9-NEXT: s_or_b32 s0, s0, s1 ; GFX9-NEXT: s_lshl_b32 s1, s7, 24 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX9-NEXT: s_or_b32 s7, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5560,43 +5481,44 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 +; GFX9-NEXT: v_and_or_b32 v6, 
v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s8, 8 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: s_mov_b32 s9, 16 +; GFX9-NEXT: v_mov_b32_e32 v5, 0xff ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 +; GFX9-NEXT: v_lshlrev_b32_sdwa v9, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_and_or_b32 v9, v0, v5, v9 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_or3_b32 v0, v8, v0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_or3_b32 v0, v9, v0, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v4, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v4, v1, s10, v4 +; GFX9-NEXT: v_and_or_b32 v4, v1, v5, v4 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, s9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NEXT: v_or3_b32 v1, v4, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v4, 8 -; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v8, 16 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 
24, v3 -; GFX9-NEXT: v_and_or_b32 v5, v2, s10, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_or3_b32 v2, v5, v2, v6 -; GFX9-NEXT: v_and_or_b32 v6, v3, s10, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_or3_b32 v1, v4, v1, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 8 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v9, 16 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_and_or_b32 v6, v2, v5, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_or3_b32 v2, v6, v2, v7 +; GFX9-NEXT: v_and_or_b32 v6, v3, v5, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_e32 v7, 24, v8 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v3, v6, v3, v7 @@ -5606,54 +5528,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX8-LABEL: insertelement_s_v16i8_v_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX8-NEXT: s_mov_b32 s10, 0x80008 -; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 +; GFX8-NEXT: s_movk_i32 s8, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s11, s0, s10 -; GFX8-NEXT: s_and_b32 s9, s0, s8 
-; GFX8-NEXT: s_lshl_b32 s11, s11, 8 -; GFX8-NEXT: s_or_b32 s9, s9, s11 -; GFX8-NEXT: s_mov_b32 s11, 0x80010 +; GFX8-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 -; GFX8-NEXT: s_bfe_u32 s0, s0, s11 +; GFX8-NEXT: s_and_b32 s9, s0, 0xff +; GFX8-NEXT: s_lshl_b32 s10, s10, 8 +; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX8-NEXT: s_or_b32 s9, s9, s10 ; GFX8-NEXT: s_lshl_b32 s0, s0, 16 ; GFX8-NEXT: s_or_b32 s0, s9, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 24 -; GFX8-NEXT: s_bfe_u32 s9, s1, s10 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX8-NEXT: s_lshr_b32 s5, s1, 24 ; GFX8-NEXT: s_or_b32 s4, s0, s4 -; GFX8-NEXT: s_and_b32 s0, s1, s8 +; GFX8-NEXT: s_and_b32 s0, s1, 0xff ; GFX8-NEXT: s_lshl_b32 s9, s9, 8 -; GFX8-NEXT: s_bfe_u32 s1, s1, s11 +; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX8-NEXT: s_or_b32 s0, s0, s9 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s5, 24 ; GFX8-NEXT: s_or_b32 s5, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s10 -; GFX8-NEXT: s_and_b32 s0, s2, s8 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s2, s11 +; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX8-NEXT: s_lshr_b32 s6, s2, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s6, 24 ; GFX8-NEXT: s_or_b32 s6, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s10 -; GFX8-NEXT: s_and_b32 s0, s3, s8 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX8-NEXT: s_and_b32 s0, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_bfe_u32 s1, s3, s11 +; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX8-NEXT: s_lshr_b32 s7, s3, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s7, 24 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX8-NEXT: s_or_b32 s7, s0, s1 ; 
GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5717,54 +5637,52 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-LABEL: insertelement_s_v16i8_v_v: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX7-NEXT: s_mov_b32 s10, 0x80008 -; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 2, v1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s11, s0, s10 -; GFX7-NEXT: s_and_b32 s9, s0, s8 -; GFX7-NEXT: s_lshl_b32 s11, s11, 8 -; GFX7-NEXT: s_or_b32 s9, s9, s11 -; GFX7-NEXT: s_mov_b32 s11, 0x80010 +; GFX7-NEXT: s_bfe_u32 s10, s0, 0x80008 ; GFX7-NEXT: s_lshr_b32 s4, s0, 24 -; GFX7-NEXT: s_bfe_u32 s0, s0, s11 +; GFX7-NEXT: s_and_b32 s9, s0, 0xff +; GFX7-NEXT: s_lshl_b32 s10, s10, 8 +; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX7-NEXT: s_or_b32 s9, s9, s10 ; GFX7-NEXT: s_lshl_b32 s0, s0, 16 ; GFX7-NEXT: s_or_b32 s0, s9, s0 ; GFX7-NEXT: s_lshl_b32 s4, s4, 24 -; GFX7-NEXT: s_bfe_u32 s9, s1, s10 +; GFX7-NEXT: s_bfe_u32 s9, s1, 0x80008 ; GFX7-NEXT: s_lshr_b32 s5, s1, 24 ; GFX7-NEXT: s_or_b32 s4, s0, s4 -; GFX7-NEXT: s_and_b32 s0, s1, s8 +; GFX7-NEXT: s_and_b32 s0, s1, 0xff ; GFX7-NEXT: s_lshl_b32 s9, s9, 8 -; GFX7-NEXT: s_bfe_u32 s1, s1, s11 +; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80010 ; GFX7-NEXT: s_or_b32 s0, s0, s9 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s5, 24 ; GFX7-NEXT: s_or_b32 s5, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s10 -; GFX7-NEXT: s_and_b32 s0, s2, s8 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s2, s11 +; GFX7-NEXT: s_bfe_u32 s1, s2, 0x80010 ; GFX7-NEXT: s_lshr_b32 s6, s2, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: 
s_lshl_b32 s1, s6, 24 ; GFX7-NEXT: s_or_b32 s6, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s10 -; GFX7-NEXT: s_and_b32 s0, s3, s8 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80008 +; GFX7-NEXT: s_and_b32 s0, s3, 0xff ; GFX7-NEXT: s_lshl_b32 s1, s1, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s1, s3, s11 +; GFX7-NEXT: s_bfe_u32 s1, s3, 0x80010 ; GFX7-NEXT: s_lshr_b32 s7, s3, 24 ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 ; GFX7-NEXT: s_or_b32 s0, s0, s1 ; GFX7-NEXT: s_lshl_b32 s1, s7, 24 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX7-NEXT: s_or_b32 s7, s0, s1 ; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc @@ -5779,55 +5697,56 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[2:3] ; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_or_b32_e32 v6, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v5, 0xff +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX7-NEXT: v_and_b32_e32 v8, s8, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX7-NEXT: 
v_bfe_u32 v8, v1, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 +; GFX7-NEXT: v_and_b32_e32 v4, v1, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v8 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v5 -; GFX7-NEXT: v_bfe_u32 v5, v2, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 +; GFX7-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 +; GFX7-NEXT: v_and_b32_e32 v4, v2, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX7-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v6 -; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX7-NEXT: v_and_b32_e32 v4, s8, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v3, v5 +; GFX7-NEXT: v_bfe_u32 v5, v3, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: 
v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: s_mov_b32 s2, -1 @@ -5838,65 +5757,62 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-LABEL: insertelement_s_v16i8_v_v: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 -; GFX10-NEXT: s_mov_b32 s7, 0x80008 -; GFX10-NEXT: s_movk_i32 s8, 0xff -; GFX10-NEXT: s_mov_b32 s9, 0x80010 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 2, v1 ; GFX10-NEXT: v_and_b32_e32 v1, 3, v1 ; GFX10-NEXT: v_mov_b32_e32 v10, 8 ; GFX10-NEXT: v_mov_b32_e32 v12, 16 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v1 -; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, s8 +; GFX10-NEXT: v_lshlrev_b32_e64 v3, v1, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_xor_b32_e32 v1, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_bfe_u32 s12, s0, s7 +; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008 ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_and_b32 s11, s0, s8 -; GFX10-NEXT: s_bfe_u32 s0, s0, s9 -; GFX10-NEXT: s_lshl_b32 s12, s12, 8 -; GFX10-NEXT: s_lshl_b32 s0, s0, 16 -; GFX10-NEXT: s_or_b32 s11, s11, s12 -; GFX10-NEXT: s_bfe_u32 s16, s2, s7 -; GFX10-NEXT: s_lshl_b32 s4, s4, 24 -; GFX10-NEXT: s_or_b32 s0, s11, s0 -; GFX10-NEXT: s_bfe_u32 s14, s1, s7 -; GFX10-NEXT: s_and_b32 s15, s2, s8 -; GFX10-NEXT: s_lshl_b32 s16, s16, 8 -; GFX10-NEXT: s_or_b32 s4, s0, s4 -; GFX10-NEXT: s_bfe_u32 s0, s2, s9 +; GFX10-NEXT: s_and_b32 s8, s0, 0xff +; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010 +; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s9, s9, 8 ; GFX10-NEXT: s_lshr_b32 s5, s1, 24 -; GFX10-NEXT: s_and_b32 s13, s1, s8 -; GFX10-NEXT: s_bfe_u32 s1, s1, s9 -; GFX10-NEXT: s_lshl_b32 s14, s14, 8 -; GFX10-NEXT: s_lshr_b32 s6, s2, 24 -; GFX10-NEXT: s_or_b32 s2, s15, s16 +; GFX10-NEXT: s_and_b32 s10, s1, 0xff +; GFX10-NEXT: s_bfe_u32 
s1, s1, 0x80010 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 +; GFX10-NEXT: s_lshl_b32 s11, s11, 8 +; GFX10-NEXT: s_or_b32 s8, s8, s9 +; GFX10-NEXT: s_lshl_b32 s4, s4, 24 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 -; GFX10-NEXT: s_or_b32 s12, s13, s14 -; GFX10-NEXT: s_or_b32 s0, s2, s0 -; GFX10-NEXT: s_lshl_b32 s2, s6, 24 -; GFX10-NEXT: s_or_b32 s1, s12, s1 -; GFX10-NEXT: s_lshl_b32 s5, s5, 24 -; GFX10-NEXT: s_or_b32 s6, s0, s2 -; GFX10-NEXT: s_bfe_u32 s0, s3, s7 -; GFX10-NEXT: s_or_b32 s5, s1, s5 -; GFX10-NEXT: s_and_b32 s1, s3, s8 +; GFX10-NEXT: s_or_b32 s9, s10, s11 +; GFX10-NEXT: s_or_b32 s0, s8, s0 +; GFX10-NEXT: s_or_b32 s1, s9, s1 +; GFX10-NEXT: s_or_b32 s4, s0, s4 +; GFX10-NEXT: s_lshl_b32 s0, s5, 24 +; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008 +; GFX10-NEXT: s_or_b32 s5, s1, s0 +; GFX10-NEXT: s_bfe_u32 s0, s3, 0x80008 +; GFX10-NEXT: s_and_b32 s1, s3, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v2, s5 ; GFX10-NEXT: s_or_b32 s0, s1, s0 -; GFX10-NEXT: s_bfe_u32 s1, s3, s9 -; GFX10-NEXT: s_lshr_b32 s10, s3, 24 +; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010 +; GFX10-NEXT: s_lshr_b32 s6, s2, 24 +; GFX10-NEXT: s_and_b32 s12, s2, 0xff +; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010 +; GFX10-NEXT: s_lshl_b32 s13, s13, 8 ; GFX10-NEXT: s_lshl_b32 s1, s1, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_or_b32 s10, s12, s13 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s4, v2, vcc_lo ; GFX10-NEXT: s_or_b32 s1, s0, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v4 -; GFX10-NEXT: s_lshl_b32 s2, s10, 24 +; GFX10-NEXT: s_lshr_b32 s7, s3, 24 +; GFX10-NEXT: s_or_b32 s2, s10, s2 +; GFX10-NEXT: s_lshl_b32 s6, s6, 24 ; GFX10-NEXT: s_mov_b32 s3, 8 +; GFX10-NEXT: s_or_b32 s6, s2, s6 +; GFX10-NEXT: s_lshl_b32 s2, s7, 24 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: s_or_b32 s7, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 ; GFX10-NEXT: 
v_and_or_b32 v5, v2, v1, v0 @@ -5917,16 +5833,16 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(<16 x i8> addrspace(4)* inreg % ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v0, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 -; GFX10-NEXT: v_and_or_b32 v9, v1, s8, v9 +; GFX10-NEXT: v_and_or_b32 v9, 0xff, v1, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX10-NEXT: v_and_or_b32 v11, v2, s8, v11 +; GFX10-NEXT: v_and_or_b32 v11, 0xff, v2, v11 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v12, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v3, s8, v10 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v12, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 @@ -5948,171 +5864,172 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 +; GFX9-NEXT: v_mov_b32_e32 v1, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_waitcnt vmcnt(0) 
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 -; GFX9-NEXT: s_and_b32 s0, s2, s6 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 0xff ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: v_lshlrev_b32_e64 v18, v2, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e64 v17, v2, s0 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s6 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, v2, v0 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 ; GFX9-NEXT: v_xor_b32_e32 v2, 
-1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, v2, v17 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, v2, v18 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, 
v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_s_v: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, 8 
-; GFX8-NEXT: v_mov_b32_e32 v1, 16 -; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v10, 16 -; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: s_and_b32 s1, s2, s0 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_mov_b32_e32 v8, 8 +; GFX8-NEXT: v_mov_b32_e32 v7, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, 0xff +; GFX8-NEXT: v_mov_b32_e32 v9, 16 +; GFX8-NEXT: s_and_b32 s0, s2, 0xff ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v5 -; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_or_b32_sdwa v4, v5, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v15, 2, v2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; 
GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v14, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 2, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v6 -; GFX8-NEXT: v_or_b32_sdwa v5, v6, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v6, v10, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v4 +; GFX8-NEXT: v_lshlrev_b32_sdwa v7, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 24, v13 -; GFX8-NEXT: v_or_b32_e32 v1, v4, v18 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v12 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 -; GFX8-NEXT: v_lshlrev_b32_e64 v17, v2, s1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 24, v14 -; GFX8-NEXT: v_or_b32_e32 v4, v5, v6 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v0, vcc -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v15 -; GFX8-NEXT: v_or_b32_e32 v4, v4, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v1, 
s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v15 -; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] -; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v4, 24, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v10, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v1, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_sdwa v2, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_e32 
v4, 24, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 24, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v6 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v6, v6, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e64 v18, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v15 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v11, 24, v12 +; GFX8-NEXT: v_or_b32_e32 v7, v14, v17 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX8-NEXT: v_lshlrev_b32_e32 v12, 24, v13 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v19 +; GFX8-NEXT: v_or_b32_e32 v3, v7, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v2, v1, vcc +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v16 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v16 +; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v6, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v18 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[2:3] +; GFX8-NEXT: v_lshlrev_b32_sdwa v12, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v14, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v16, v8, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 24, v0 +; GFX8-NEXT: v_lshlrev_b32_sdwa v13, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v15, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v17, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_sdwa v2, v2, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX8-NEXT: v_lshlrev_b32_e32 v9, 24, v11 -; GFX8-NEXT: v_or_b32_e32 v3, v3, v13 -; GFX8-NEXT: v_or_b32_e32 v11, v0, v15 -; GFX8-NEXT: v_or_b32_e32 v12, v1, v17 -; GFX8-NEXT: v_or_b32_e32 v10, v2, v10 -; GFX8-NEXT: v_or_b32_e32 v0, v3, v4 -; GFX8-NEXT: v_or_b32_e32 v1, v11, v5 -; GFX8-NEXT: v_or_b32_e32 v2, v12, v6 -; GFX8-NEXT: v_or_b32_e32 v3, v10, v9 -; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3] +; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v11 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 +; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 +; GFX8-NEXT: v_or_b32_e32 v9, v0, v9 +; GFX8-NEXT: 
v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v10 +; GFX8-NEXT: v_or_b32_e32 v3, v9, v8 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; ; GFX7-LABEL: insertelement_v_v16i8_s_v: @@ -6121,108 +6038,109 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_lshrrev_b32_e32 v17, 2, v2 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff +; GFX7-NEXT: v_lshrrev_b32_e32 v18, 2, v2 ; GFX7-NEXT: v_and_b32_e32 v2, 3, v2 -; GFX7-NEXT: s_and_b32 s0, s2, s6 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2 -; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 -; GFX7-NEXT: v_lshl_b32_e32 v18, s0, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v17 -; GFX7-NEXT: v_lshl_b32_e32 v2, s6, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v17 -; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 
8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: s_and_b32 s0, s2, 0xff +; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 ; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 +; GFX7-NEXT: v_lshl_b32_e32 v19, s0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 ; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, 
vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v18 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, v2, v7 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v18 +; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v2, v5, v2 -; GFX7-NEXT: v_or_b32_e32 v2, v2, v18 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v19 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v1, 8, 8 ; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 ; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v1 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 -; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 -; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 +; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, 
v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v9, v1 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6233,7 +6151,6 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v2 ; GFX10-NEXT: v_mov_b32_e32 v7, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 2, v2 @@ -6248,25 +6165,25 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, 
v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v16 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 ; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8 ; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9 ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v18 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v18 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v11 ; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10 ; GFX10-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v2 -; GFX10-NEXT: s_and_b32 s1, s2, s3 -; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, s3 +; GFX10-NEXT: s_and_b32 s1, s2, 0xff +; GFX10-NEXT: v_lshlrev_b32_e64 v10, v0, 0xff ; GFX10-NEXT: v_or3_b32 v6, v6, v12, v8 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v8, v9, v5, s0 @@ -6291,13 +6208,13 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, s3, v10 +; GFX10-NEXT: 
v_and_or_b32 v2, 0xff, v2, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v14, v0, s3, v1 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6319,95 +6236,95 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off ; GFX9-NEXT: s_mov_b32 s0, 8 ; GFX9-NEXT: s_mov_b32 s1, 16 -; GFX9-NEXT: s_movk_i32 s6, 0xff -; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_mov_b32_e32 v1, 16 +; GFX9-NEXT: s_movk_i32 s3, 0xff +; GFX9-NEXT: v_mov_b32_e32 v1, 8 +; GFX9-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NEXT: v_mov_b32_e32 v7, 16 ; GFX9-NEXT: s_lshr_b32 s4, s2, 2 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX9-NEXT: s_lshl_b32 s2, s6, s2 -; GFX9-NEXT: s_not_b32 s5, s2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v16, s1, v4 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v3, v3, s6, v13 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v15 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX9-NEXT: v_lshlrev_b32_sdwa v12, s0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: s_and_b32 s0, s2, 3 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v3, v3, s3, v12 +; GFX9-NEXT: v_and_or_b32 v12, v4, s3, v14 +; GFX9-NEXT: s_lshl_b32 s0, s0, 3 +; GFX9-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_and_or_b32 v14, v5, v0, v16 +; GFX9-NEXT: v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v17 
+; GFX9-NEXT: v_or3_b32 v3, v3, v13, v8 +; GFX9-NEXT: v_or3_b32 v8, v12, v15, v9 +; GFX9-NEXT: v_lshlrev_b32_sdwa v19, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v6, v6, v0, v18 +; GFX9-NEXT: s_not_b32 s5, s0 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v3, v3, v14, v9 -; GFX9-NEXT: v_or3_b32 v4, v4, v16, v10 -; GFX9-NEXT: v_and_or_b32 v13, v6, s6, v19 -; GFX9-NEXT: v_lshlrev_b32_sdwa v6, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX9-NEXT: v_or3_b32 v5, v5, v18, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc +; GFX9-NEXT: v_or3_b32 v9, v14, v17, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v10, v3, v8, vcc ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX9-NEXT: v_or3_b32 v6, v13, v6, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v5, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[2:3] -; GFX9-NEXT: v_and_or_b32 v2, v9, s5, v2 +; GFX9-NEXT: v_or3_b32 v6, v6, v19, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v9, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v6, s[2:3] +; GFX9-NEXT: v_and_or_b32 v2, v10, s5, v2 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX9-NEXT: v_lshrrev_b32_e32 v9, 24, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v5 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v12, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: 
v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX9-NEXT: v_and_or_b32 v1, v3, s6, v12 -; GFX9-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX9-NEXT: v_and_or_b32 v4, v4, s6, v14 -; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v9 -; GFX9-NEXT: v_and_or_b32 v5, v5, s6, v16 -; GFX9-NEXT: v_lshlrev_b32_e32 v9, 24, v10 -; GFX9-NEXT: v_and_or_b32 v10, v2, s6, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 24, v8 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v9 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v13, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v7, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_lshlrev_b32_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_2 +; GFX9-NEXT: v_and_or_b32 v3, v3, v0, v13 +; GFX9-NEXT: v_lshlrev_b32_e32 v6, 24, v6 +; GFX9-NEXT: v_and_or_b32 v8, v8, v0, v15 +; GFX9-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NEXT: v_and_or_b32 v9, v9, v0, v17 ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX9-NEXT: v_or3_b32 v0, v1, v13, v3 -; GFX9-NEXT: v_or3_b32 v1, v4, v15, v6 -; GFX9-NEXT: v_or3_b32 v2, v5, v17, v9 -; GFX9-NEXT: v_or3_b32 v3, v10, v18, v11 -; GFX9-NEXT: global_store_dwordx4 v[7:8], v[0:3], off +; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_or3_b32 v0, v3, v14, v6 +; GFX9-NEXT: v_or3_b32 v1, v8, v16, v10 +; GFX9-NEXT: v_or3_b32 v2, v9, v18, v11 +; GFX9-NEXT: v_or3_b32 v3, v13, v7, v12 +; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_endpgm ; ; GFX8-LABEL: insertelement_v_v16i8_v_s: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dwordx4 v[3:6], v[0:1] -; GFX8-NEXT: s_and_b32 s1, s2, 3 +; GFX8-NEXT: s_and_b32 s0, s2, 3 ; GFX8-NEXT: v_mov_b32_e32 v0, 8 -; GFX8-NEXT: s_lshl_b32 s1, s1, 3 +; GFX8-NEXT: s_lshl_b32 s0, s0, 3 ; GFX8-NEXT: v_mov_b32_e32 v1, 16 ; GFX8-NEXT: v_mov_b32_e32 v9, 8 -; GFX8-NEXT: v_mov_b32_e32 v11, s1 +; GFX8-NEXT: v_mov_b32_e32 v11, s0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v11, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: s_lshr_b32 s4, s2, 2 -; GFX8-NEXT: s_lshl_b32 s0, s0, s1 +; GFX8-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 ; GFX8-NEXT: s_not_b32 s5, s0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 @@ -6490,108 +6407,109 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX7-NEXT: s_mov_b32 s11, 0xf000 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 ; GFX7-NEXT: buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64 -; GFX7-NEXT: s_movk_i32 s6, 0xff -; GFX7-NEXT: v_and_b32_e32 v0, 
s6, v2 -; GFX7-NEXT: s_and_b32 s0, s2, 3 +; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: v_mov_b32_e32 v7, 0xff ; GFX7-NEXT: s_lshr_b32 s4, s2, 2 -; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX7-NEXT: s_lshl_b32 s0, s6, s0 +; GFX7-NEXT: v_and_b32_e32 v2, v2, v7 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s4, 1 -; GFX7-NEXT: s_not_b32 s5, s0 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_bfe_u32 v10, v3, 8, 8 -; GFX7-NEXT: v_bfe_u32 v12, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v3 -; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v4 -; GFX7-NEXT: v_and_b32_e32 v9, s6, v3 +; GFX7-NEXT: v_bfe_u32 v11, v3, 8, 8 +; GFX7-NEXT: v_bfe_u32 v13, v4, 8, 8 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v3 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 24, v4 +; GFX7-NEXT: v_and_b32_e32 v10, s0, v3 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v11, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v12, s0, v4 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 -; GFX7-NEXT: v_bfe_u32 v14, v5, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 -; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v5 -; GFX7-NEXT: v_and_b32_e32 v13, s6, v5 +; GFX7-NEXT: v_bfe_u32 v15, v5, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v5 +; GFX7-NEXT: v_and_b32_e32 v14, v5, v7 ; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8 -; GFX7-NEXT: v_bfe_u32 v16, v6, 8, 8 +; GFX7-NEXT: v_bfe_u32 v17, v6, 8, 8 +; GFX7-NEXT: s_and_b32 s0, s2, 3 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 -; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 -; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 -; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v6 -; GFX7-NEXT: v_and_b32_e32 v15, s6, v6 +; GFX7-NEXT: v_lshlrev_b32_e32 v15, 8, v15 +; GFX7-NEXT: 
v_or_b32_e32 v10, v10, v11 +; GFX7-NEXT: v_or_b32_e32 v11, v12, v13 +; GFX7-NEXT: v_lshrrev_b32_e32 v9, 24, v6 +; GFX7-NEXT: v_and_b32_e32 v16, v6, v7 ; GFX7-NEXT: v_bfe_u32 v6, v6, 16, 8 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX7-NEXT: v_lshlrev_b32_e32 v16, 8, v16 -; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 -; GFX7-NEXT: v_or_b32_e32 v3, v9, v3 -; GFX7-NEXT: v_or_b32_e32 v4, v10, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: v_or_b32_e32 v12, v15, v16 -; GFX7-NEXT: v_or_b32_e32 v5, v11, v5 -; GFX7-NEXT: v_or_b32_e32 v1, v3, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v4, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v17, 8, v17 +; GFX7-NEXT: v_or_b32_e32 v12, v14, v15 +; GFX7-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v11, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, s0, v2 +; GFX7-NEXT: s_lshl_b32 s0, 0xff, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX7-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX7-NEXT: v_or_b32_e32 v3, v5, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v2, vcc -; GFX7-NEXT: v_or_b32_e32 v4, v6, v8 +; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX7-NEXT: v_or_b32_e32 v13, v16, v17 +; GFX7-NEXT: v_or_b32_e32 v5, v12, v5 +; GFX7-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX7-NEXT: v_or_b32_e32 v1, v4, v1 +; GFX7-NEXT: s_not_b32 s5, s0 +; GFX7-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX7-NEXT: v_or_b32_e32 v6, v13, v6 +; GFX7-NEXT: v_or_b32_e32 v3, v5, v8 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, 2 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v3, s[0:1] +; GFX7-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, 3 ; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v4, s[2:3] ; GFX7-NEXT: v_and_b32_e32 v5, s5, v5 -; GFX7-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX7-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX7-NEXT: 
v_cmp_eq_u32_e64 s[4:5], s4, 0 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[0:1] -; GFX7-NEXT: v_bfe_u32 v9, v1, 8, 8 -; GFX7-NEXT: v_bfe_u32 v11, v2, 8, 8 -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[2:3] -; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v1 -; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v2 -; GFX7-NEXT: v_and_b32_e32 v8, s6, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX7-NEXT: v_bfe_u32 v10, v0, 8, 8 +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[0:1] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v2, s[2:3] +; GFX7-NEXT: v_lshrrev_b32_e32 v2, 24, v0 +; GFX7-NEXT: v_and_b32_e32 v9, v0, v7 +; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8 +; GFX7-NEXT: v_bfe_u32 v12, v1, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v10 +; GFX7-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX7-NEXT: v_and_b32_e32 v11, v1, v7 ; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v10, s6, v2 -; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8 -; GFX7-NEXT: v_bfe_u32 v13, v3, 8, 8 -; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9 -; GFX7-NEXT: v_lshlrev_b32_e32 v11, 8, v11 +; GFX7-NEXT: v_bfe_u32 v14, v3, 8, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v12 +; GFX7-NEXT: v_or_b32_e32 v9, v9, v10 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX7-NEXT: v_and_b32_e32 v12, s6, v3 +; GFX7-NEXT: v_and_b32_e32 v13, v3, v7 ; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 8 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 24, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_lshlrev_b32_e32 v13, 8, v13 -; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 -; GFX7-NEXT: v_or_b32_e32 v9, v10, v11 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v14 +; GFX7-NEXT: v_or_b32_e32 v10, v11, v12 +; GFX7-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX7-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_or_b32_e32 v10, v12, v13 -; GFX7-NEXT: v_or_b32_e32 v1, v8, v1 -; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 -; GFX7-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX7-NEXT: v_or_b32_e32 v1, v2, v5 -; GFX7-NEXT: v_or_b32_e32 v2, v10, v3 +; GFX7-NEXT: v_or_b32_e32 v11, v13, v14 +; GFX7-NEXT: v_or_b32_e32 v1, v10, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX7-NEXT: v_or_b32_e32 v2, v11, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 24, v6 ; GFX7-NEXT: v_bfe_u32 v5, v4, 8, 8 -; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v4 +; GFX7-NEXT: v_lshrrev_b32_e32 v8, 24, v4 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v3 -; GFX7-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX7-NEXT: v_and_b32_e32 v3, v4, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 ; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v5 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 -; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v7 +; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_endpgm @@ -6602,10 +6520,9 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s3, 0xff ; GFX10-NEXT: v_mov_b32_e32 v1, 16 -; GFX10-NEXT: s_lshr_b32 s4, s2, 2 -; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s4, 1 +; GFX10-NEXT: s_lshr_b32 s3, s2, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v4 @@ -6615,34 +6532,34 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v3, v3, s3, v11 +; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v11 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v4, v4, s3, v13 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v6 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v5, v5, s3, v15 +; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_or3_b32 v3, v3, v12, v7 ; GFX10-NEXT: v_or3_b32 v4, v4, v14, v8 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v6, v6, s3, v17 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v10 ; GFX10-NEXT: v_or3_b32 v5, v5, v16, v9 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v3, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s4, 2 +; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 2 ; GFX10-NEXT: s_and_b32 s1, s2, 3 ; GFX10-NEXT: v_or3_b32 v6, v6, v11, v7 ; GFX10-NEXT: s_lshl_b32 s2, s1, 3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 3 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v8, v5, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: s_lshl_b32 s2, s3, s2 +; GFX10-NEXT: s_lshl_b32 s2, 0xff, s2 ; GFX10-NEXT: s_not_b32 s2, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v6, s1 ; GFX10-NEXT: v_and_or_b32 v2, v7, s2, v2 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s3, 0 ; GFX10-NEXT: 
v_cndmask_b32_e32 v4, v4, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v2, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v2, s0 @@ -6659,13 +6576,13 @@ define amdgpu_ps void @insertelement_v_v16i8_v_s(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v1, v3, s3, v10 +; GFX10-NEXT: v_and_or_b32 v1, 0xff, v3, v10 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX10-NEXT: v_and_or_b32 v6, v4, s3, v12 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v4, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v10, v5, s3, v14 +; GFX10-NEXT: v_and_or_b32 v10, 0xff, v5, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 -; GFX10-NEXT: v_and_or_b32 v12, v2, s3, v0 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -6969,50 +6886,48 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dwordx4 v[4:7], v[0:1], off ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: v_mov_b32_e32 v8, 8 +; GFX10-NEXT: v_mov_b32_e32 v1, 8 ; GFX10-NEXT: s_mov_b32 s1, 16 -; GFX10-NEXT: s_movk_i32 s2, 0xff ; GFX10-NEXT: v_and_b32_e32 v0, 3, v3 -; GFX10-NEXT: v_mov_b32_e32 v1, 0xff -; GFX10-NEXT: v_mov_b32_e32 v9, 16 +; GFX10-NEXT: v_mov_b32_e32 v8, 16 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 2, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, 
s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v17, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v4, v4, s2, v14 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v13 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 +; GFX10-NEXT: v_and_or_b32 v5, v5, 0xff, v15 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 -; GFX10-NEXT: v_and_or_b32 v5, v5, s2, v16 +; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v7 +; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_and_or_b32 v6, 0xff, v6, v17 ; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11 -; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v7 -; GFX10-NEXT: v_lshlrev_b32_sdwa v19, v9, v6 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v20, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_and_or_b32 v6, v6, v1, v18 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, 24, v12 -; GFX10-NEXT: v_or3_b32 v4, v4, v15, v10 -; GFX10-NEXT: v_or3_b32 v5, v5, v17, v11 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v7, v7, v1, v20 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v13 -; GFX10-NEXT: v_or3_b32 v6, v6, v19, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v4, v5, vcc_lo +; GFX10-NEXT: v_or3_b32 v4, v4, v14, v9 +; GFX10-NEXT: v_or3_b32 v5, v5, v16, v10 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v7, 0xff, v7, v19 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v12 +; GFX10-NEXT: v_or3_b32 v6, v6, v18, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v10, v4, v5, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v3 -; GFX10-NEXT: v_lshlrev_b32_e32 v12, v0, v1 -; GFX10-NEXT: v_or3_b32 v7, v7, v14, v10 +; GFX10-NEXT: v_lshlrev_b32_e64 v11, v0, 0xff +; GFX10-NEXT: v_or3_b32 v7, v7, v13, v9 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v11, v6, s0 -; GFX10-NEXT: v_xor_b32_e32 v2, -1, v12 -; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v3 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v7, s1 -; GFX10-NEXT: v_and_or_b32 v0, v10, v2, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v10, v6, s0 +; GFX10-NEXT: v_xor_b32_e32 v2, -1, v11 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v7, s1 +; GFX10-NEXT: v_and_or_b32 v0, v9, v2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, v0, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, v6, v0, s0 @@ -7020,29 +6935,29 @@ define amdgpu_ps 
void @insertelement_v_v16i8_v_v(<16 x i8> addrspace(1)* %ptr, i ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v0 -; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v2, v2, v1, v11 -; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v5 -; GFX10-NEXT: v_and_or_b32 v3, v3, v1, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v0 +; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v12, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; 
GFX10-NEXT: v_lshlrev_b32_sdwa v15, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_lshlrev_b32_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v10 +; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v5 +; GFX10-NEXT: v_and_or_b32 v3, 0xff, v3, v12 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX10-NEXT: v_and_or_b32 v13, v4, v1, v15 +; GFX10-NEXT: v_and_or_b32 v12, 0xff, v4, v14 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7 -; GFX10-NEXT: v_and_or_b32 v8, v0, v1, v8 -; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX10-NEXT: v_and_or_b32 v14, 0xff, v0, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_or3_b32 v0, v2, v12, v11 -; GFX10-NEXT: v_or3_b32 v1, v3, v14, v6 -; GFX10-NEXT: v_or3_b32 v2, v13, v16, v7 -; GFX10-NEXT: v_or3_b32 v3, v8, v9, v10 +; GFX10-NEXT: v_or3_b32 v0, v2, v11, v10 +; GFX10-NEXT: v_or3_b32 v1, v3, v13, v6 +; GFX10-NEXT: v_or3_b32 v2, v12, v15, v7 +; GFX10-NEXT: v_or3_b32 v3, v14, v8, v9 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm %vec = load <16 x i8>, <16 x i8> addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll index 114bfc0aa515..ef3d6f6b479e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -1261,10 +1261,9 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)* ; GFX10-LABEL: test_div_scale_f32_undef_val_val: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0 +; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 
0x41000000, s0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll index 8803508a7ecd..5135a791b970 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.a16.ll @@ -505,15 +505,14 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16 ; ; GFX10-LABEL: atomic_add_i32_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -549,15 +548,14 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1 ; ; GFX10-LABEL: atomic_add_i32_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -632,15 +630,14 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 
%data, ; ; GFX10-LABEL: atomic_add_i32_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -676,15 +673,14 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, ; ; GFX10-LABEL: atomic_add_i32_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -720,14 +716,13 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d ; ; GFX10-LABEL: atomic_add_i32_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1279,15 +1274,14 @@ define amdgpu_ps <2 
x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data ; ; GFX10-LABEL: atomic_add_i64_3d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1323,15 +1317,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da ; ; GFX10-LABEL: atomic_add_i64_cube: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1406,15 +1399,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64 ; ; GFX10-LABEL: atomic_add_i64_2darray: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; 
GFX10-NEXT: s_mov_b32 s5, s7 @@ -1450,15 +1442,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 % ; ; GFX10-LABEL: atomic_add_i64_2dmsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -1494,14 +1485,13 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc, ; ; GFX10-LABEL: atomic_add_i64_2darraymsaa: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 -; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v5 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, v5 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll index 686658f24cad..1bb1cc6db8f5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.a16.dim.ll @@ -86,7 +86,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: 
v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -100,8 +99,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -143,7 +142,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -157,8 +155,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -252,7 +250,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; 
GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -266,8 +263,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -309,7 +306,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -323,8 +319,8 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -366,7 +362,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -380,8 
+375,8 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -423,7 +418,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -437,8 +431,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -481,7 +475,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -495,9 +488,9 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, 
<4 x i32> i ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -540,7 +533,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 ; GFX10NSA-NEXT: s_mov_b32 s4, s6 @@ -554,9 +546,9 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10NSA-NEXT: s_mov_b32 s7, s9 ; GFX10NSA-NEXT: s_mov_b32 s9, s11 ; GFX10NSA-NEXT: s_mov_b32 s11, s13 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v5, s12 -; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3 -; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 +; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12 ; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX10NSA-NEXT: s_waitcnt vmcnt(0) @@ -592,7 +584,6 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10NSA-LABEL: gather4_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -601,8 +592,8 
@@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12 +; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 @@ -643,7 +634,6 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10NSA-LABEL: gather4_c_l_2d: ; GFX10NSA: ; %bb.0: ; %main_body -; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10NSA-NEXT: s_mov_b32 s0, s2 ; GFX10NSA-NEXT: s_mov_b32 s2, s4 @@ -652,8 +642,8 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10NSA-NEXT: s_mov_b32 s8, s10 ; GFX10NSA-NEXT: s_mov_b32 s10, s12 ; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2 -; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12 +; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12 ; GFX10NSA-NEXT: s_mov_b32 s1, s3 ; GFX10NSA-NEXT: s_mov_b32 s3, s5 ; GFX10NSA-NEXT: s_mov_b32 s5, s7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll index 88b3244df5fa..3201adf3bd8d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll @@ -564,15 +564,14 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) { ; GFX10-NEXT: s_mov_b32 s5, s7 ; GFX10-NEXT: s_mov_b32 s6, s8 ; GFX10-NEXT: s_mov_b32 s7, s9 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16 ; 
GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 ; GFX10-NEXT: ; return to shader part epilog %v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) ret <3 x half> %v diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll index cb596f2021e4..e5108c53440e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -24,14 +24,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, ; ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s2, s4 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,17 +78,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 
; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 @@ -148,17 +146,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre ; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: s_mov_b32 s0, s2 +; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: v_mov_b32_e32 v6, v5 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3 -; GFX10-NEXT: s_mov_b32 s1, s3 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s4, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll index dcc3137545e9..533c449de47e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -24,15 +24,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, ; ; GFX10-LABEL: load_3d_v4f32_xyzw: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 ; GFX10-NEXT: s_mov_b32 s4, s6 ; GFX10-NEXT: 
s_mov_b32 s6, s8 ; GFX10-NEXT: s_lshl_b32 s8, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -79,7 +78,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -90,8 +88,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32 ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 @@ -148,7 +146,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: s_mov_b32 s0, s2 ; GFX10-NEXT: s_mov_b32 s2, s4 @@ -159,8 +156,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc, ; GFX10-NEXT: v_mov_b32_e32 v7, v5 ; GFX10-NEXT: v_mov_b32_e32 v8, v5 ; GFX10-NEXT: v_mov_b32_e32 v9, v5 -; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX10-NEXT: s_mov_b32 s1, s3 ; GFX10-NEXT: s_mov_b32 s3, s5 ; GFX10-NEXT: s_mov_b32 s5, s7 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll index 7705bb2392ed..ad20d9d5e343 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -4,10 +4,9 @@ define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -19,11 +18,10 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -37,14 +35,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; GFX10-NEXT: v_mov_b32_e32 v10, v3 -; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -56,10 +53,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_d_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -71,11 +67,10 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: ; return to shader part epilog @@ -87,10 +82,9 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -102,11 +96,10 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -118,10 +111,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_d_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; 
GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -136,11 +128,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -152,10 +143,9 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -167,11 +157,10 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10-NEXT: 
v_and_or_b32 v1, v2, v6, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -183,10 +172,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_c_cd_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -198,11 +186,10 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -214,10 +201,9 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: 
v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -229,11 +215,10 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -245,10 +230,9 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) { ; GFX10-LABEL: sample_c_cd_cl_1d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,11 +247,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; 
GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -284,11 +267,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -305,11 +287,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-NEXT: v_mov_b32_e32 v11, v4 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10-NEXT: 
v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 15755f4455cb..80bc1114de88 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -49,15 +49,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5 -; GCN-NEXT: v_and_b32_e32 v10, s4, v7 -; GCN-NEXT: v_and_b32_e32 v8, s4, v8 +; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v7 +; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9 -; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 +; GCN-NEXT: v_and_or_b32 v5, v5, 0xffff, v9 +; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 ; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog @@ -101,15 +100,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { ; GCN-LABEL: image_bvh64_intersect_ray_a16: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s4, 0xffff ; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; 
GCN-NEXT: v_and_b32_e32 v11, s4, v8 -; GCN-NEXT: v_and_b32_e32 v9, s4, v9 +; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16 -; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10 -; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11 +; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GCN-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 ; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog @@ -202,21 +200,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v13, v0 ; GFX1030-NEXT: v_mov_b32_e32 v14, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7 ; GFX1030-NEXT: v_mov_b32_e32 v15, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, s0, v8 +; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v17, v4 ; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo -; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1 +; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1 ; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1030-NEXT: v_readfirstlane_b32 s5, v10 @@ -246,16 +243,15 @@ define amdgpu_ps <4 x float> 
@image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7 -; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7 +; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo ; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 ; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16 -; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13 -; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14 +; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14 ; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v9 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v10 @@ -371,21 +367,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) { ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_mov_b32 s0, 0xffff ; GFX1030-NEXT: v_mov_b32_e32 v14, v0 ; GFX1030-NEXT: v_mov_b32_e32 v15, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 +; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8 ; GFX1030-NEXT: v_mov_b32_e32 v16, v2 -; GFX1030-NEXT: v_and_b32_e32 v2, s0, v9 +; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; GFX1030-NEXT: v_mov_b32_e32 v17, v3 ; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX1030-NEXT: v_mov_b32_e32 v18, v4 ; GFX1030-NEXT: v_mov_b32_e32 v19, v5 ; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16 -; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0 -; GFX1030-NEXT: v_and_or_b32 v21, v7, 
s0, v1 +; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0 +; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1 ; GFX1030-NEXT: s_mov_b32 s1, exec_lo ; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 @@ -417,20 +412,19 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node ; ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_mov_b32 s0, 0xffff ; GFX1013-NEXT: v_mov_b32_e32 v16, v10 ; GFX1013-NEXT: v_mov_b32_e32 v17, v11 ; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6 -; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8 -; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8 +; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9 ; GFX1013-NEXT: v_mov_b32_e32 v18, v12 ; GFX1013-NEXT: v_mov_b32_e32 v19, v13 ; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10 ; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11 ; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16 ; GFX1013-NEXT: s_mov_b32 s1, exec_lo -; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10 -; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11 +; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10 +; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v11 ; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1 ; GFX1013-NEXT: v_readfirstlane_b32 s4, v16 ; GFX1013-NEXT: v_readfirstlane_b32 s5, v17 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index fa0d86214e65..723b9d8140db 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -752,14 +752,13 @@ define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* ; GFX6-LABEL: bfe_8_bfe_8: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2 -; GFX6-NEXT: s_mov_b32 s4, 0x80000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0 ; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x0 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_bfe_i32 s3, s3, s4 -; GFX6-NEXT: s_bfe_i32 s3, s3, s4 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 +; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000 ; GFX6-NEXT: v_mov_b32_e32 v0, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll index 866bae4b3400..6a6b12108620 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll @@ -66,17 +66,16 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll index ffcc4ed7d38f..6b28f32ba073 100644 
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll @@ -66,17 +66,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: s_movk_i32 s5, 0xff ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1 -; GFX10-NEXT: v_and_b32_e32 v1, s5, v2 -; GFX10-NEXT: v_and_b32_e32 v2, s5, v3 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2 +; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-NEXT: v_and_b32_e32 v5, s5, v6 -; GFX10-NEXT: v_and_b32_e32 v6, s5, v7 +; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6 +; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3 +; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index c2f624f3a2ee..107a95da9d16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -115,9 +115,8 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] %result = lshr i24 %value, %amount @@ -631,9 +630,8 @@ define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) { define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) { ; GFX6-LABEL: lshr_i16_sv: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -659,9 +657,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) { define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) { ; GFX6-LABEL: lshr_i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s1 -; GFX6-NEXT: v_and_b32_e32 v0, s1, v0 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 ; GFX6-NEXT: ; return to shader part epilog ; @@ -757,9 +754,8 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s3 ; GFX6-NEXT: s_lshr_b32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 @@ -768,14 +764,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou ; ; GFX8-LABEL: s_lshr_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s1 -; GFX8-NEXT: s_lshr_b32 s1, s2, s4 +; GFX8-NEXT: s_lshr_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 
-; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -808,10 +803,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 @@ -844,10 +839,10 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) ; GFX6-LABEL: lshr_v2i16_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s2, 0xffff -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -944,13 +939,12 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s8 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s5 -; GFX6-NEXT: s_and_b32 s3, s3, s8 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s2, s2, s8 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s7 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s6 @@ -961,36 +955,34 @@ define amdgpu_ps <2 
x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX8-LABEL: s_lshr_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s2 -; GFX8-NEXT: s_lshr_b32 s2, s4, s7 +; GFX8-NEXT: s_lshr_b32 s2, s4, s6 ; GFX8-NEXT: s_lshr_b32 s1, s1, s3 -; GFX8-NEXT: s_lshr_b32 s3, s5, s8 +; GFX8-NEXT: s_lshr_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v4i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s5, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s5 -; GFX9-NEXT: s_lshr_b32 s6, s2, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s5, s2, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s2 -; GFX9-NEXT: s_lshr_b32 s2, s4, s6 +; GFX9-NEXT: s_lshr_b32 s2, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX9-NEXT: s_lshr_b32 s2, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s5 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s3 ; GFX9-NEXT: s_lshr_b32 s2, s2, s4 @@ -999,17 +991,16 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg ; ; GFX10-LABEL: s_lshr_v4i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s4, 0xffff -; GFX10-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s6, s2, 16 +; GFX10-NEXT: s_lshr_b32 
s4, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s2, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s2 -; GFX10-NEXT: s_lshr_b32 s2, s5, s6 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s4 -; GFX10-NEXT: s_lshr_b32 s4, s3, 16 +; GFX10-NEXT: s_lshr_b32 s2, s4, s5 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s3 -; GFX10-NEXT: s_lshr_b32 s3, s5, s4 +; GFX10-NEXT: s_lshr_b32 s3, s4, s5 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: ; return to shader part epilog @@ -1124,21 +1115,20 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_lshr_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff -; GFX6-NEXT: s_and_b32 s1, s1, s16 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshr_b32 s1, s1, s9 -; GFX6-NEXT: s_and_b32 s3, s3, s16 +; GFX6-NEXT: s_and_b32 s3, s3, 0xffff ; GFX6-NEXT: s_lshr_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s2, s2, s16 +; GFX6-NEXT: s_and_b32 s2, s2, 0xffff ; GFX6-NEXT: s_lshr_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s5, s5, s16 -; GFX6-NEXT: s_and_b32 s7, s7, s16 +; GFX6-NEXT: s_and_b32 s5, s5, 0xffff +; GFX6-NEXT: s_and_b32 s7, s7, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshr_b32 s2, s2, s10 -; GFX6-NEXT: s_and_b32 s4, s4, s16 +; GFX6-NEXT: s_and_b32 s4, s4, 0xffff ; GFX6-NEXT: s_lshr_b32 s5, s5, s13 -; GFX6-NEXT: s_and_b32 s6, s6, s16 +; GFX6-NEXT: s_and_b32 s6, s6, 0xffff ; GFX6-NEXT: s_lshr_b32 s7, s7, s15 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s3, 16 @@ -1153,64 +1143,62 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX8-LABEL: s_lshr_v8i16: ; GFX8: ; 
%bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshr_b32 s0, s0, s4 -; GFX8-NEXT: s_lshr_b32 s4, s8, s13 +; GFX8-NEXT: s_lshr_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshr_b32 s1, s1, s5 -; GFX8-NEXT: s_lshr_b32 s5, s9, s14 +; GFX8-NEXT: s_lshr_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshr_b32 s2, s2, s6 -; GFX8-NEXT: s_lshr_b32 s6, s10, s15 +; GFX8-NEXT: s_lshr_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshr_b32 s3, s3, s7 -; GFX8-NEXT: s_lshr_b32 s7, s11, s16 +; GFX8-NEXT: s_lshr_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_lshr_v8i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s9, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s0, 16 -; GFX9-NEXT: s_and_b32 s0, s0, s9 -; 
GFX9-NEXT: s_lshr_b32 s10, s4, 16 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_lshr_b32 s9, s4, 16 ; GFX9-NEXT: s_lshr_b32 s0, s0, s4 -; GFX9-NEXT: s_lshr_b32 s4, s8, s10 +; GFX9-NEXT: s_lshr_b32 s4, s8, s9 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX9-NEXT: s_lshr_b32 s4, s1, 16 -; GFX9-NEXT: s_and_b32 s1, s1, s9 +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_lshr_b32 s8, s5, 16 ; GFX9-NEXT: s_lshr_b32 s1, s1, s5 ; GFX9-NEXT: s_lshr_b32 s4, s4, s8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16 -; GFX9-NEXT: s_and_b32 s2, s2, s9 +; GFX9-NEXT: s_and_b32 s2, s2, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s6, 16 ; GFX9-NEXT: s_lshr_b32 s2, s2, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 ; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 ; GFX9-NEXT: s_lshr_b32 s4, s3, 16 -; GFX9-NEXT: s_and_b32 s3, s3, s9 +; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: s_lshr_b32 s5, s7, 16 ; GFX9-NEXT: s_lshr_b32 s3, s3, s7 ; GFX9-NEXT: s_lshr_b32 s4, s4, s5 @@ -1219,26 +1207,25 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg ; ; GFX10-LABEL: s_lshr_v8i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s8, 0xffff -; GFX10-NEXT: s_lshr_b32 s9, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s8 -; GFX10-NEXT: s_lshr_b32 s10, s4, 16 +; GFX10-NEXT: s_lshr_b32 s8, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s4, 16 ; GFX10-NEXT: s_lshr_b32 s0, s0, s4 -; GFX10-NEXT: s_lshr_b32 s4, s9, s10 -; GFX10-NEXT: s_lshr_b32 s9, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s8 -; GFX10-NEXT: s_lshr_b32 s10, s5, 16 +; GFX10-NEXT: s_lshr_b32 s4, s8, s9 +; GFX10-NEXT: s_lshr_b32 s8, s1, 16 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff +; GFX10-NEXT: s_lshr_b32 s9, s5, 16 ; GFX10-NEXT: s_lshr_b32 s1, s1, s5 -; GFX10-NEXT: s_lshr_b32 s5, s9, s10 +; GFX10-NEXT: s_lshr_b32 s5, s8, s9 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16 -; GFX10-NEXT: s_and_b32 s2, 
s2, s8 +; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: s_lshr_b32 s5, s6, 16 ; GFX10-NEXT: s_lshr_b32 s2, s2, s6 ; GFX10-NEXT: s_lshr_b32 s4, s4, s5 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16 -; GFX10-NEXT: s_and_b32 s3, s3, s8 +; GFX10-NEXT: s_and_b32 s3, s3, 0xffff ; GFX10-NEXT: s_lshr_b32 s6, s7, 16 ; GFX10-NEXT: s_lshr_b32 s3, s3, s7 ; GFX10-NEXT: s_lshr_b32 s5, s5, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index ccf6e6be39be..e5586787ac5b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -12,25 +12,22 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) { ; ; GFX8-LABEL: s_mul_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den @@ -78,29 +75,26 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre ; ; GFX8-LABEL: s_mul_i16_zeroext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; 
GFX8-NEXT: s_mul_i32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_zeroext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: s_and_b32 s0, s0, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_zeroext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 -; GFX10-NEXT: s_and_b32 s0, s0, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: ; return to shader part epilog %result = mul i16 %num, %den ret i16 %result @@ -146,27 +140,24 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre ; ; GFX8-LABEL: s_mul_i16_signext: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s2, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_mul_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_mul_i16_signext: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s2 -; GFX9-NEXT: s_and_b32 s1, s1, s2 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff +; GFX9-NEXT: s_and_b32 s1, s1, 0xffff ; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: s_sext_i32_i16 s0, s0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: s_mul_i16_signext: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_mov_b32 s2, 0xffff -; GFX10-NEXT: s_and_b32 s0, s0, s2 -; GFX10-NEXT: s_and_b32 s1, s1, s2 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-NEXT: 
s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: s_mul_i32 s0, s0, s1 ; GFX10-NEXT: s_sext_i32_i16 s0, s0 ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll index bec739bd0bf0..e325c8b9af02 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll @@ -429,13 +429,12 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) { define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_commute: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s1, s0 ; GFX6-NEXT: ; return to shader part epilog @@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> 
inreg %src0, <2 x i16> inre define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) { ; GFX6-LABEL: s_orn2_v2i16_multi_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s1, s4, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_or_b32 s0, s0, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_xor_b32 s1, s1, -1 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s2, s2, s1 -; GFX6-NEXT: s_or_b32 s0, s0, s2 -; GFX6-NEXT: s_lshl_b32 s2, s5, 16 -; GFX6-NEXT: s_and_b32 s3, s4, s1 -; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_lshl_b32 s3, s7, 16 -; GFX6-NEXT: s_and_b32 s1, s6, s1 -; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: s_xor_b32 s1, s1, -1 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_or_b32 s1, s2, s1 +; GFX6-NEXT: s_lshl_b32 s1, s5, 16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff +; GFX6-NEXT: s_or_b32 s1, s1, s2 +; GFX6-NEXT: s_lshl_b32 s2, s7, 16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_xor_b32 s2, s2, -1 +; GFX6-NEXT: s_or_b32 s0, s0, s2 +; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use: @@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg 
%src0, <4 x i16> inreg %src1) ; GFX6-LABEL: s_orn2_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre ; GFX6-LABEL: s_orn2_v4i16_commute: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 ; GFX6-LABEL: 
s_orn2_v4i16_multi_use: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_mov_b32 s3, 0xffff -; GFX6-NEXT: s_and_b32 s1, s2, s3 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s3 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s4, s6, s3 -; GFX6-NEXT: s_or_b32 s2, s2, s4 -; GFX6-NEXT: s_lshl_b32 s4, s9, 16 -; GFX6-NEXT: s_and_b32 s3, s8, s3 -; GFX6-NEXT: s_or_b32 s3, s4, s3 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff +; GFX6-NEXT: s_or_b32 s2, s2, s3 +; GFX6-NEXT: s_lshl_b32 s3, s9, 16 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff +; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_mov_b32 s4, -1 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] @@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4 define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) { ; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s14, 0xffff ; GFX6-NEXT: s_lshl_b32 s0, s3, 16 -; GFX6-NEXT: s_and_b32 s1, s2, s14 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s5, 16 -; GFX6-NEXT: s_and_b32 s2, s4, s14 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s7, 16 -; GFX6-NEXT: s_and_b32 s3, s6, s14 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s9, 16 -; GFX6-NEXT: s_and_b32 s4, s8, s14 +; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: s_lshl_b32 s4, s11, 16 -; GFX6-NEXT: s_and_b32 s5, s10, s14 +; GFX6-NEXT: s_and_b32 s5, s10, 0xffff ; GFX6-NEXT: s_or_b32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s5, s13, 16 -; GFX6-NEXT: s_and_b32 s6, s12, s14 +; GFX6-NEXT: 
s_and_b32 s6, s12, 0xffff ; GFX6-NEXT: s_or_b32 s5, s5, s6 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_mov_b32 s7, s6 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll index 641602657005..0a2414d397cb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/roundeven.ll @@ -401,13 +401,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_rndne_f16_e32 v3, v0 +; GFX10-NEXT: v_rndne_f16_e32 v2, v0 ; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_rndne_f16_e32 v4, v1 +; GFX10-NEXT: v_rndne_f16_e32 v3, v1 ; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX10-NEXT: v_and_or_b32 v0, v3, v2, v0 -; GFX10-NEXT: v_and_or_b32 v1, v4, v2, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x) ret <4 x half> %roundeven @@ -610,8 +609,8 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: s_brev_b32 s6, 1 -; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_and_b32_e32 v5, s6, v1 +; GFX6-NEXT: s_mov_b32 s7, 0x43300000 ; GFX6-NEXT: v_mov_b32_e32 v4, 0 ; GFX6-NEXT: v_or_b32_e32 v5, s7, v5 ; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll index db72cf406c9c..9bf8d106cf01 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -320,12 +320,11 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; 
GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s1, s7, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s1, s5, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s4 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 
s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, 0 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: s_max_i32 s9, s7, s8 -; GFX8-NEXT: s_min_i32 s7, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, 0 +; GFX8-NEXT: s_max_i32 s7, s5, s6 +; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s7, s6, s7 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s7, s7 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s9, s5, s9 -; GFX8-NEXT: s_max_i32 s1, s7, s1 +; GFX8-NEXT: s_sub_i32 s7, 0x7fff, s7 +; GFX8-NEXT: s_max_i32 s1, s5, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s7, s9 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s5, s7 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_min_i32 
s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s7 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s2, s3, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 @@ -413,10 +407,9 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-LABEL: s_saddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: 
s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,35 +630,33 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: 
v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_add_i16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_add_i16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: 
s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s1, s11, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s10 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s1, s9, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s8 ; GFX6-NEXT: s_add_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s2, s10, s2 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s2, s8, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s3, s6, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s5 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_min_i32 s6, s3, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 -; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s6, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, 
s4 +; GFX6-NEXT: s_min_i32 s4, s4, s5 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_add_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,48 +735,46 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, 0 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_max_i32 s13, s11, s12 -; GFX8-NEXT: s_min_i32 s11, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, 0 +; GFX8-NEXT: s_max_i32 s11, s9, s10 +; GFX8-NEXT: s_min_i32 s9, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s11, s10, s11 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_sub_i32 s9, 0xffff8000, s9 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s13, s9, s13 -; GFX8-NEXT: s_max_i32 s1, s11, s1 +; GFX8-NEXT: s_sub_i32 s11, 0x7fff, s11 +; GFX8-NEXT: s_max_i32 s1, s9, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s11, s13 -; GFX8-NEXT: s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s9, s11 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: 
s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s5, s11 +; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s3, s5, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s6 @@ -798,35 +782,34 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s5, 0xffff8000, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s9, s6 -; GFX8-NEXT: s_max_i32 s4, s5, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s4, s5, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_add_i32 s3, s3, s4 -; 
GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s5, s6 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_add_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: 
s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: 
s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_add_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1269,19 +1249,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, 0 -; GFX6-NEXT: s_sub_i32 s2, s4, s2 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: 
s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX6-NEXT: s_max_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s2, s3, s2 ; GFX6-NEXT: s_add_i32 s1, s1, s2 @@ -1289,19 +1267,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX8-LABEL: s_saddsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_min_i32 s7, s0, 0 -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, 0 -; GFX8-NEXT: s_sub_i32 s7, s5, s7 -; GFX8-NEXT: s_sub_i32 s6, s4, s6 -; GFX8-NEXT: s_max_i32 s2, s7, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s6 +; GFX8-NEXT: s_min_i32 s5, s0, 0 +; GFX8-NEXT: s_max_i32 s4, s0, 0 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s2, s5, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s4 +; GFX8-NEXT: s_min_i32 s4, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, 0 -; GFX8-NEXT: s_sub_i32 s2, s4, s2 -; GFX8-NEXT: s_min_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s4, s5, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX8-NEXT: s_sub_i32 s2, 0x7fffffff, s2 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s2, s3, s2 ; GFX8-NEXT: s_add_i32 s1, s1, s2 @@ -1408,26 +1384,24 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_min_i32 s9, s0, 0 -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, 0 -; GFX6-NEXT: s_sub_i32 s9, s7, s9 -; GFX6-NEXT: s_sub_i32 s8, s6, s8 -; GFX6-NEXT: s_max_i32 s3, s9, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s8 -; GFX6-NEXT: s_min_i32 s8, s1, 0 +; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_max_i32 s6, s0, 0 +; GFX6-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX6-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX6-NEXT: s_max_i32 s3, s7, s3 +; GFX6-NEXT: s_min_i32 
s3, s3, s6 +; GFX6-NEXT: s_min_i32 s6, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s8, s7, s8 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 -; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_min_i32 s4, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, 0 -; GFX6-NEXT: s_sub_i32 s4, s7, s4 -; GFX6-NEXT: s_sub_i32 s3, s6, s3 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s3, s4, s3 ; GFX6-NEXT: s_add_i32 s2, s2, s3 @@ -1435,26 +1409,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX8-LABEL: s_saddsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_min_i32 s9, s0, 0 -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, 0 -; GFX8-NEXT: s_sub_i32 s9, s7, s9 -; GFX8-NEXT: s_sub_i32 s8, s6, s8 -; GFX8-NEXT: s_max_i32 s3, s9, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s8 -; GFX8-NEXT: s_min_i32 s8, s1, 0 +; GFX8-NEXT: s_min_i32 s7, s0, 0 +; GFX8-NEXT: s_max_i32 s6, s0, 0 +; GFX8-NEXT: s_sub_i32 s7, 0x80000000, s7 +; GFX8-NEXT: s_sub_i32 s6, 0x7fffffff, s6 +; GFX8-NEXT: s_max_i32 s3, s7, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s6 +; GFX8-NEXT: s_min_i32 s6, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, 0 -; GFX8-NEXT: s_sub_i32 s8, s7, s8 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 -; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_min_i32 s4, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, 0 -; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_sub_i32 s3, s6, s3 +; GFX8-NEXT: s_sub_i32 s4, 0x80000000, s4 +; 
GFX8-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s3, s4, s3 ; GFX8-NEXT: s_add_i32 s2, s2, s3 @@ -1582,33 +1554,31 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { define amdgpu_ps <4 x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 -; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s5, s8, s5 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_min_i32 s5, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, 0 -; GFX6-NEXT: s_sub_i32 s5, s9, s5 -; GFX6-NEXT: s_sub_i32 s4, s8, s4 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s4, s5, s4 ; GFX6-NEXT: s_add_i32 s3, s3, s4 @@ -1616,33 +1586,31 @@ define amdgpu_ps <4 
x i32> @s_saddsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX8-LABEL: s_saddsat_v4i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_min_i32 s11, s0, 0 -; GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, 0 -; GFX8-NEXT: s_sub_i32 s11, s9, s11 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 -; GFX8-NEXT: s_max_i32 s4, s11, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s10 -; GFX8-NEXT: s_min_i32 s10, s1, 0 +; GFX8-NEXT: s_min_i32 s9, s0, 0 +; GFX8-NEXT: s_max_i32 s8, s0, 0 +; GFX8-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX8-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX8-NEXT: s_max_i32 s4, s9, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s8 +; GFX8-NEXT: s_min_i32 s8, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, 0 -; GFX8-NEXT: s_sub_i32 s10, s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 -; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX8-NEXT: s_max_i32 s5, s8, s5 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_min_i32 s5, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, 0 -; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_sub_i32 s4, s8, s4 +; GFX8-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX8-NEXT: s_sub_i32 s4, 0x7fffffff, s4 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s4, s5, s4 ; GFX8-NEXT: s_add_i32 s3, s3, s4 @@ -1704,21 +1672,20 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s5, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: 
v_min_i32_e32 v6, 0, v3 ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v13, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1748,21 +1715,20 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s5, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, s4, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v7 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v3 ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_min_i32_e32 v6, 0, v4 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, 0, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v13, v6 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v11, v5 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_max_i32_e32 v6, v6, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v6, v5 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 @@ -1795,40 +1761,38 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { define amdgpu_ps <5 x i32> 
@s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_min_i32 s13, s0, 0 -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, 0 -; GFX6-NEXT: s_sub_i32 s13, s11, s13 -; GFX6-NEXT: s_sub_i32 s12, s10, s12 -; GFX6-NEXT: s_max_i32 s5, s13, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s12 -; GFX6-NEXT: s_min_i32 s12, s1, 0 +; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_max_i32 s10, s0, 0 +; GFX6-NEXT: s_sub_i32 s11, 0x80000000, s11 +; GFX6-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX6-NEXT: s_max_i32 s5, s11, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s10 +; GFX6-NEXT: s_min_i32 s10, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s12, s11, s12 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 -; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s6, s10, s6 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s7 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s8 ; GFX6-NEXT: s_min_i32 s5, s6, s5 ; GFX6-NEXT: s_min_i32 s6, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, 0 -; GFX6-NEXT: s_sub_i32 s6, s11, s6 -; GFX6-NEXT: s_sub_i32 s5, s10, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s6, s6, s9 ; GFX6-NEXT: s_min_i32 
s5, s6, s5 ; GFX6-NEXT: s_add_i32 s4, s4, s5 @@ -1836,40 +1800,38 @@ define amdgpu_ps <5 x i32> @s_saddsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX8-LABEL: s_saddsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_min_i32 s13, s0, 0 -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, 0 -; GFX8-NEXT: s_sub_i32 s13, s11, s13 -; GFX8-NEXT: s_sub_i32 s12, s10, s12 -; GFX8-NEXT: s_max_i32 s5, s13, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_min_i32 s12, s1, 0 +; GFX8-NEXT: s_min_i32 s11, s0, 0 +; GFX8-NEXT: s_max_i32 s10, s0, 0 +; GFX8-NEXT: s_sub_i32 s11, 0x80000000, s11 +; GFX8-NEXT: s_sub_i32 s10, 0x7fffffff, s10 +; GFX8-NEXT: s_max_i32 s5, s11, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: s_min_i32 s10, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, 0 -; GFX8-NEXT: s_sub_i32 s12, s11, s12 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 -; GFX8-NEXT: s_max_i32 s6, s12, s6 +; GFX8-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX8-NEXT: s_max_i32 s6, s10, s6 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s8 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_min_i32 s6, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, 0 -; GFX8-NEXT: s_sub_i32 s6, s11, s6 -; GFX8-NEXT: s_sub_i32 s5, s10, s5 +; GFX8-NEXT: s_sub_i32 s6, 0x80000000, s6 +; 
GFX8-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_min_i32 s5, s6, s5 ; GFX8-NEXT: s_add_i32 s4, s4, s5 @@ -2211,117 +2173,115 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_saddsat_v16i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_min_i32 s35, s0, 0 -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, 0 -; GFX6-NEXT: s_sub_i32 s35, s33, s35 -; GFX6-NEXT: s_sub_i32 s34, s32, s34 -; GFX6-NEXT: s_max_i32 s16, s35, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s34 -; GFX6-NEXT: s_min_i32 s34, s1, 0 +; GFX6-NEXT: s_min_i32 s33, s0, 0 +; GFX6-NEXT: s_max_i32 s32, s0, 0 +; GFX6-NEXT: s_sub_i32 s33, 0x80000000, s33 +; GFX6-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX6-NEXT: s_max_i32 s16, s33, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s32 +; GFX6-NEXT: s_min_i32 s32, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, 0 -; GFX6-NEXT: s_sub_i32 s34, s33, s34 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 -; GFX6-NEXT: s_max_i32 s17, s34, s17 +; GFX6-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s17, s32, s17 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s2, 0 ; GFX6-NEXT: s_add_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s18 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s3, 0 ; GFX6-NEXT: s_add_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s19 ; GFX6-NEXT: s_min_i32 
s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s4, 0 ; GFX6-NEXT: s_add_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s20 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s5, 0 ; GFX6-NEXT: s_add_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s21 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s6, 0 ; GFX6-NEXT: s_add_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s22 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s7, 0 ; GFX6-NEXT: s_add_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s23 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s8, 0 ; GFX6-NEXT: s_add_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s24 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s9, 0 ; GFX6-NEXT: s_add_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; 
GFX6-NEXT: s_max_i32 s17, s17, s25 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s10, 0 ; GFX6-NEXT: s_add_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s26 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s11, 0 ; GFX6-NEXT: s_add_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s27 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s12, 0 ; GFX6-NEXT: s_add_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s28 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s13, 0 ; GFX6-NEXT: s_add_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s29 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s14, 0 ; GFX6-NEXT: s_add_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s30 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_min_i32 s17, s15, 0 ; GFX6-NEXT: s_add_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, 0 -; GFX6-NEXT: s_sub_i32 s17, s33, s17 -; GFX6-NEXT: s_sub_i32 s16, s32, s16 +; 
GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX6-NEXT: s_max_i32 s17, s17, s31 ; GFX6-NEXT: s_min_i32 s16, s17, s16 ; GFX6-NEXT: s_add_i32 s15, s15, s16 @@ -2329,117 +2289,115 @@ define amdgpu_ps <16 x i32> @s_saddsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX8-LABEL: s_saddsat_v16i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_min_i32 s35, s0, 0 -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, 0 -; GFX8-NEXT: s_sub_i32 s35, s33, s35 -; GFX8-NEXT: s_sub_i32 s34, s32, s34 -; GFX8-NEXT: s_max_i32 s16, s35, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s34 -; GFX8-NEXT: s_min_i32 s34, s1, 0 +; GFX8-NEXT: s_min_i32 s33, s0, 0 +; GFX8-NEXT: s_max_i32 s32, s0, 0 +; GFX8-NEXT: s_sub_i32 s33, 0x80000000, s33 +; GFX8-NEXT: s_sub_i32 s32, 0x7fffffff, s32 +; GFX8-NEXT: s_max_i32 s16, s33, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s32 +; GFX8-NEXT: s_min_i32 s32, s1, 0 ; GFX8-NEXT: s_add_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, 0 -; GFX8-NEXT: s_sub_i32 s34, s33, s34 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 -; GFX8-NEXT: s_max_i32 s17, s34, s17 +; GFX8-NEXT: s_sub_i32 s32, 0x80000000, s32 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX8-NEXT: s_max_i32 s17, s32, s17 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s2, 0 ; GFX8-NEXT: s_add_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s18 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s3, 0 ; GFX8-NEXT: s_add_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s19 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: 
s_min_i32 s17, s4, 0 ; GFX8-NEXT: s_add_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s20 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s5, 0 ; GFX8-NEXT: s_add_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s21 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s6, 0 ; GFX8-NEXT: s_add_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s22 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s7, 0 ; GFX8-NEXT: s_add_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s23 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s8, 0 ; GFX8-NEXT: s_add_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s24 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s9, 0 ; GFX8-NEXT: s_add_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, 
s17, s25 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s10, 0 ; GFX8-NEXT: s_add_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s26 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s11, 0 ; GFX8-NEXT: s_add_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s27 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s12, 0 ; GFX8-NEXT: s_add_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s28 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s13, 0 ; GFX8-NEXT: s_add_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s29 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s14, 0 ; GFX8-NEXT: s_add_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s30 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_min_i32 s17, s15, 0 ; GFX8-NEXT: s_add_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, 0 -; GFX8-NEXT: s_sub_i32 s17, s33, s17 -; GFX8-NEXT: s_sub_i32 s16, s32, s16 +; GFX8-NEXT: s_sub_i32 s17, 
0x80000000, s17 +; GFX8-NEXT: s_sub_i32 s16, 0x7fffffff, s16 ; GFX8-NEXT: s_max_i32 s17, s17, s31 ; GFX8-NEXT: s_min_i32 s16, s17, s16 ; GFX8-NEXT: s_add_i32 s15, s15, s16 @@ -2781,60 +2739,55 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-LABEL: s_saddsat_v2i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_min_i32 s7, s0, 0 +; GFX6-NEXT: s_min_i32 s5, s0, 0 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, 0 -; GFX6-NEXT: s_sub_i32 s7, s5, s7 -; GFX6-NEXT: s_sub_i32 s6, s4, s6 -; GFX6-NEXT: s_max_i32 s2, s7, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s6 +; GFX6-NEXT: s_max_i32 s4, s0, 0 +; GFX6-NEXT: s_sub_i32 s5, 0x80000000, s5 +; GFX6-NEXT: s_sub_i32 s4, 0x7fffffff, s4 +; GFX6-NEXT: s_max_i32 s2, s5, s2 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 +; GFX6-NEXT: s_min_i32 s2, s2, s4 +; GFX6-NEXT: s_min_i32 s4, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, 0 -; GFX6-NEXT: s_sub_i32 s3, s4, s3 -; GFX6-NEXT: s_min_i32 s4, s1, 0 -; GFX6-NEXT: s_sub_i32 s4, s5, s4 +; GFX6-NEXT: s_sub_i32 s4, 0x80000000, s4 +; GFX6-NEXT: s_sub_i32 s3, 0x7fffffff, s3 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s3 ; GFX6-NEXT: s_add_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, 0 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sub_i32 s6, s5, s6 +; GFX8-NEXT: 
s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, 0 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s4, s8 -; GFX8-NEXT: s_max_i32 s1, s6, s1 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 +; GFX8-NEXT: s_max_i32 s1, s4, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sext_i32_i16 s6, s8 +; GFX8-NEXT: s_sext_i32_i16 s4, s6 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_add_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 -; GFX8-NEXT: s_sub_i32 s1, s5, s1 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, 0xffff8000, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s6 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 @@ -2867,22 +2820,20 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: saddsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_min_i32 s5, s0, 0 +; GFX6-NEXT: s_min_i32 s3, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, 0 -; GFX6-NEXT: s_sub_i32 s5, s3, s5 -; GFX6-NEXT: s_sub_i32 s4, s2, s4 -; GFX6-NEXT: v_max_i32_e32 v0, s5, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s4, v0 +; GFX6-NEXT: s_max_i32 s2, s0, 0 +; GFX6-NEXT: s_sub_i32 s3, 0x80000000, s3 +; GFX6-NEXT: s_sub_i32 s2, 0x7fffffff, s2 +; GFX6-NEXT: v_max_i32_e32 v0, s3, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: s_max_i32 s1, s0, 0 -; GFX6-NEXT: s_sub_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s2, s0, 0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s2, s3, s2 +; GFX6-NEXT: s_max_i32 s1, s0, 0 +; GFX6-NEXT: s_sub_i32 s2, 0x80000000, s2 +; GFX6-NEXT: s_sub_i32 s1, 0x7fffffff, s1 ; GFX6-NEXT: v_max_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s0, v1 @@ -2897,25 +2848,23 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX8-LABEL: saddsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, 0 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, 0 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_sub_i32 s4, s3, s4 -; GFX8-NEXT: s_sub_i32 s6, s2, s6 -; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: v_min_i16_e32 v1, s6, v1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: s_sub_i32 s2, s2, s6 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 +; GFX8-NEXT: v_max_i16_e32 v1, s2, v0 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s4, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; 
GFX8-NEXT: v_add_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_add_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3137,31 +3086,29 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-LABEL: s_saddsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_min_i32 s11, s0, 0 +; GFX6-NEXT: s_min_i32 s9, s0, 0 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, 0 -; GFX6-NEXT: s_sub_i32 s11, s9, s11 -; GFX6-NEXT: s_sub_i32 s10, s8, s10 -; GFX6-NEXT: s_max_i32 s4, s11, s4 +; GFX6-NEXT: s_max_i32 s8, s0, 0 +; GFX6-NEXT: s_sub_i32 s9, 0x80000000, s9 +; GFX6-NEXT: s_sub_i32 s8, 0x7fffffff, s8 +; GFX6-NEXT: s_max_i32 s4, s9, s4 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 -; GFX6-NEXT: s_min_i32 s10, s1, 0 +; GFX6-NEXT: s_min_i32 s4, s4, s8 +; GFX6-NEXT: s_min_i32 s8, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, 0 -; GFX6-NEXT: s_sub_i32 s10, s9, s10 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 -; GFX6-NEXT: s_max_i32 s4, s10, s4 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 +; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 ; GFX6-NEXT: s_min_i32 s6, s2, 0 ; GFX6-NEXT: s_max_i32 s5, s2, 0 -; GFX6-NEXT: s_sub_i32 s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s5 @@ -3169,65 +3116,62 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_add_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 ; GFX6-NEXT: s_max_i32 s5, s3, 0 -; GFX6-NEXT: s_sub_i32 
s6, s9, s6 -; GFX6-NEXT: s_sub_i32 s5, s8, s5 +; GFX6-NEXT: s_sub_i32 s6, 0x80000000, s6 +; GFX6-NEXT: s_sub_i32 s5, 0x7fffffff, s5 ; GFX6-NEXT: s_max_i32 s4, s6, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_add_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, 0 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; GFX8-NEXT: s_sub_i32 s10, s9, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, 0 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_min_i32 s8, s8, s9 +; GFX8-NEXT: s_sub_i32 s8, 0xffff8000, s8 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s12, s8, s12 -; GFX8-NEXT: s_max_i32 s2, s10, s2 +; GFX8-NEXT: s_sub_i32 s10, 0x7fff, s10 +; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s10, s12 +; GFX8-NEXT: s_sext_i32_i16 s8, s10 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_add_i32 s0, s0, s2 ; 
GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s10, s8, s10 +; GFX8-NEXT: s_sub_i32 s8, 0x7fff, s8 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sext_i32_i16 s6, s10 +; GFX8-NEXT: s_sext_i32_i16 s6, s8 ; GFX8-NEXT: s_min_i32 s2, s2, s6 ; GFX8-NEXT: s_add_i32 s4, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 -; GFX8-NEXT: s_max_i32 s6, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s6, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s6, s8, s6 +; GFX8-NEXT: s_sub_i32 s6, 0x7fff, s6 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 @@ -3235,12 +3179,12 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s5 -; GFX8-NEXT: s_max_i32 s3, s2, s11 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sub_i32 s2, s9, s2 +; GFX8-NEXT: s_max_i32 s3, s2, s9 +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sub_i32 s2, 0xffff8000, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s8, s3 +; GFX8-NEXT: s_sub_i32 s3, 0x7fff, s3 ; GFX8-NEXT: s_max_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3454,31 +3398,29 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-LABEL: s_saddsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: 
s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_min_i32 s15, s0, 0 +; GFX6-NEXT: s_min_i32 s13, s0, 0 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, 0 -; GFX6-NEXT: s_sub_i32 s15, s13, s15 -; GFX6-NEXT: s_sub_i32 s14, s12, s14 -; GFX6-NEXT: s_max_i32 s6, s15, s6 +; GFX6-NEXT: s_max_i32 s12, s0, 0 +; GFX6-NEXT: s_sub_i32 s13, 0x80000000, s13 +; GFX6-NEXT: s_sub_i32 s12, 0x7fffffff, s12 +; GFX6-NEXT: s_max_i32 s6, s13, s6 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 -; GFX6-NEXT: s_min_i32 s14, s1, 0 +; GFX6-NEXT: s_min_i32 s6, s6, s12 +; GFX6-NEXT: s_min_i32 s12, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, 0 -; GFX6-NEXT: s_sub_i32 s14, s13, s14 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s14, s6 +; GFX6-NEXT: s_sub_i32 s12, 0x80000000, s12 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s7 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 ; GFX6-NEXT: s_min_i32 s8, s2, 0 ; GFX6-NEXT: s_max_i32 s7, s2, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3486,8 +3428,8 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_add_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 ; GFX6-NEXT: s_max_i32 s7, s3, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3495,8 +3437,8 @@ define amdgpu_ps <3 x i32> 
@s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_add_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 ; GFX6-NEXT: s_max_i32 s7, s4, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 ; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s7 @@ -3504,71 +3446,68 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_add_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 ; GFX6-NEXT: s_max_i32 s7, s5, 0 -; GFX6-NEXT: s_sub_i32 s8, s13, s8 -; GFX6-NEXT: s_sub_i32 s7, s12, s7 -; GFX6-NEXT: s_max_i32 s6, s8, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_sub_i32 s8, 0x80000000, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s7, 0x7fffffff, s7 +; GFX6-NEXT: s_max_i32 s6, s8, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s7 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_add_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: 
s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, 0 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sub_i32 s14, s13, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, 0 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_min_i32 s12, s12, s13 +; GFX8-NEXT: s_sub_i32 s12, 0xffff8000, s12 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s16, s12, s16 -; GFX8-NEXT: s_max_i32 s3, s14, s3 +; GFX8-NEXT: s_sub_i32 s14, 0x7fff, s14 +; GFX8-NEXT: s_max_i32 s3, s12, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s14, s16 +; GFX8-NEXT: s_sext_i32_i16 s12, s14 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s14, s12, s14 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sext_i32_i16 s9, s14 +; GFX8-NEXT: s_sext_i32_i16 s9, s12 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_add_i32 s6, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s9, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s9, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 
s9, s12, s9 +; GFX8-NEXT: s_sub_i32 s9, 0x7fff, s9 ; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 @@ -3576,25 +3515,25 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s7 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s7, s7, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s2 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3602,12 +3541,12 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_add_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s8 -; GFX8-NEXT: s_max_i32 s4, s3, s15 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sub_i32 s3, s13, s3 +; GFX8-NEXT: s_max_i32 s4, s3, s13 +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sub_i32 s3, 0xffff8000, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s11 -; GFX8-NEXT: s_sub_i32 s4, s12, s4 +; GFX8-NEXT: 
s_sub_i32 s4, 0x7fff, s4 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3861,31 +3800,29 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-LABEL: s_saddsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_min_i32 s19, s0, 0 +; GFX6-NEXT: s_min_i32 s17, s0, 0 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, 0 -; GFX6-NEXT: s_sub_i32 s19, s17, s19 -; GFX6-NEXT: s_sub_i32 s18, s16, s18 -; GFX6-NEXT: s_max_i32 s8, s19, s8 +; GFX6-NEXT: s_max_i32 s16, s0, 0 +; GFX6-NEXT: s_sub_i32 s17, 0x80000000, s17 +; GFX6-NEXT: s_sub_i32 s16, 0x7fffffff, s16 +; GFX6-NEXT: s_max_i32 s8, s17, s8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 -; GFX6-NEXT: s_min_i32 s18, s1, 0 +; GFX6-NEXT: s_min_i32 s8, s8, s16 +; GFX6-NEXT: s_min_i32 s16, s1, 0 ; GFX6-NEXT: s_add_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, 0 -; GFX6-NEXT: s_sub_i32 s18, s17, s18 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s18, s8 +; GFX6-NEXT: s_sub_i32 s16, 0x80000000, s16 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s16, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_add_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 ; GFX6-NEXT: s_min_i32 s10, s2, 0 ; GFX6-NEXT: s_max_i32 s9, s2, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3893,8 +3830,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_add_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 ; GFX6-NEXT: s_max_i32 s9, s3, 0 -; GFX6-NEXT: 
s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3902,8 +3839,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_add_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 ; GFX6-NEXT: s_max_i32 s9, s4, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3911,8 +3848,8 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_add_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 ; GFX6-NEXT: s_max_i32 s9, s5, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 @@ -3920,86 +3857,83 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_add_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 ; GFX6-NEXT: s_max_i32 s9, s6, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 ; GFX6-NEXT: s_max_i32 s8, s10, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s9 ; GFX6-NEXT: s_min_i32 s10, s7, 0 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_add_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 ; GFX6-NEXT: s_max_i32 s9, s7, 0 -; GFX6-NEXT: s_sub_i32 s10, s17, s10 -; GFX6-NEXT: s_sub_i32 s9, s16, s9 -; GFX6-NEXT: s_max_i32 s8, s10, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s9 -; 
GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_add_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff +; GFX6-NEXT: s_sub_i32 s10, 0x80000000, s10 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s9, 0x7fffffff, s9 +; GFX6-NEXT: s_max_i32 s8, s10, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s9 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_add_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_saddsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, 0 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sub_i32 s18, s17, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, 0 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_min_i32 s16, s16, s17 +; GFX8-NEXT: s_sub_i32 s16, 0xffff8000, s16 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; 
GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s20, s16, s20 -; GFX8-NEXT: s_max_i32 s4, s18, s4 +; GFX8-NEXT: s_sub_i32 s18, 0x7fff, s18 +; GFX8-NEXT: s_max_i32 s4, s16, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s18, s20 +; GFX8-NEXT: s_sext_i32_i16 s16, s18 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_add_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s18, s16, s18 +; GFX8-NEXT: s_sub_i32 s16, 0x7fff, s16 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s12, s18 +; GFX8-NEXT: s_sext_i32_i16 s12, s16 ; GFX8-NEXT: s_min_i32 s4, s4, s12 ; GFX8-NEXT: s_add_i32 s8, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s12, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s12, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s12, s16, s12 +; GFX8-NEXT: s_sub_i32 s12, 0x7fff, s12 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s12 @@ -4007,25 +3941,25 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s9 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, 
s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s9, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s2 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4033,24 +3967,24 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s2, s2, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s10 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s14 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s10, s10, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s3 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; 
GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4058,13 +3992,13 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_add_i32 s3, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s11 -; GFX8-NEXT: s_max_i32 s5, s4, s19 -; GFX8-NEXT: s_min_i32 s4, s4, s19 +; GFX8-NEXT: s_max_i32 s5, s4, s17 +; GFX8-NEXT: s_min_i32 s4, s4, s17 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_sub_i32 s4, s17, s4 +; GFX8-NEXT: s_sub_i32 s4, 0xffff8000, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s15 -; GFX8-NEXT: s_sub_i32 s5, s16, s5 +; GFX8-NEXT: s_sub_i32 s5, 0x7fff, s5 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -4553,13 +4487,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -4571,10 +4504,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], 
vcc -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 @@ -4596,13 +4529,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s10, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4614,10 +4546,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4639,13 +4571,12 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s10, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: 
v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4657,10 +4588,10 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -4679,15 +4610,14 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: s_addc_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b32 s10, 0 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -4697,9 +4627,9 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; 
GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -5453,10 +5383,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_brev_b32 s10, 1 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, s10 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 @@ -5495,7 +5424,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_addc_u32 s5, s4, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, s10 +; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5550,10 +5479,9 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_brev_b32 s10, 1 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, s10 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 @@ -5598,7 +5526,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: s_addc_u32 s5, s4, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, s10 +; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5653,10 +5581,9 @@ define amdgpu_ps <2 x i128> 
@s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_brev_b32 s10, 1 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, s10 +; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 @@ -5701,7 +5628,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: s_addc_u32 s5, s4, 0 ; GFX9-NEXT: s_addc_u32 s6, s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, s10 +; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5747,14 +5674,13 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_and_b32 s1, 1, s1 ; GFX10-NEXT: s_ashr_i32 s2, s17, 31 ; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10-NEXT: s_brev_b32 s11, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 ; GFX10-NEXT: s_mov_b32 s0, 0 ; GFX10-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10-NEXT: s_addc_u32 s1, s2, 0 ; GFX10-NEXT: s_addc_u32 s10, s2, 0 -; GFX10-NEXT: s_addc_u32 s3, s2, s11 +; GFX10-NEXT: s_addc_u32 s3, s2, 0x80000000 ; GFX10-NEXT: s_add_u32 s12, s4, s12 ; GFX10-NEXT: s_addc_u32 s13, s5, s13 ; GFX10-NEXT: s_addc_u32 s18, s6, s14 @@ -5795,7 +5721,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: s_addc_u32 s1, s0, 0 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s11 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, s18 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index af008f0cabde..2599327910eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -309,10 +309,10 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -408,15 +408,15 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: 
v_cmp_le_u32_e32 vcc, s8, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc @@ -450,10 +450,10 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 -; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc @@ -549,15 +549,15 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 -; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 -; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc -; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 +; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] +; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3 -; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index d4378da215ee..f92b02bd7b6e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -1186,9 +1186,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1317,7 +1316,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc @@ -1890,9 +1889,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -2021,7 +2019,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 5d773c3d9c5e..3120da6cfc4c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -1157,121 +1157,120 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32 ; GFX10-LABEL: sdivrem_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_ashr_i32 s0, s12, 31 -; GFX10-NEXT: s_ashr_i32 s2, s14, 31 -; GFX10-NEXT: s_add_i32 s6, s12, s0 -; GFX10-NEXT: s_add_i32 s12, s14, s2 -; GFX10-NEXT: s_xor_b32 s14, s6, s0 ; GFX10-NEXT: s_ashr_i32 s1, s13, 31 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: s_ashr_i32 s2, s14, 31 ; GFX10-NEXT: s_ashr_i32 s3, s15, 31 +; GFX10-NEXT: s_add_i32 s6, s12, s0 ; GFX10-NEXT: s_add_i32 s7, s13, s1 +; GFX10-NEXT: s_add_i32 s12, s14, s2 ; GFX10-NEXT: s_add_i32 s13, s15, s3 +; GFX10-NEXT: s_xor_b32 s14, s6, s0 ; GFX10-NEXT: s_xor_b32 s15, s7, s1 ; GFX10-NEXT: s_xor_b32 s12, s12, s2 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: s_xor_b32 s13, s13, s3 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX10-NEXT: s_xor_b32 s13, s13, s3 -; GFX10-NEXT: s_sub_i32 s6, 0, s14 ; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: s_sub_i32 s6, 0, s14 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: s_sub_i32 s7, 0, s15 ; GFX10-NEXT: s_sub_i32 s19, 0, s12 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GFX10-NEXT: s_ashr_i32 s16, s8, 31 -; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: s_ashr_i32 s18, s10, 31 ; GFX10-NEXT: s_ashr_i32 s17, s9, 31 +; GFX10-NEXT: s_ashr_i32 s18, s10, 31 +; GFX10-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 ; GFX10-NEXT: s_xor_b32 s20, s16, s0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 +; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: s_xor_b32 s21, s17, s1 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0 ; GFX10-NEXT: s_sub_i32 s6, 0, s13 -; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1 ; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2 -; GFX10-NEXT: s_ashr_i32 s19, s11, 31 -; GFX10-NEXT: s_add_i32 s7, s9, s17 ; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3 -; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 +; GFX10-NEXT: s_ashr_i32 s19, s11, 31 ; GFX10-NEXT: s_add_i32 s6, s8, s16 +; GFX10-NEXT: s_add_i32 s7, s9, s17 +; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4 ; GFX10-NEXT: s_add_i32 s8, s10, s18 ; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6 -; GFX10-NEXT: s_xor_b32 s10, s6, s16 -; GFX10-NEXT: s_add_i32 s9, s11, s19 ; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 +; GFX10-NEXT: s_add_i32 s9, s11, s19 +; GFX10-NEXT: s_xor_b32 s10, s6, s16 ; GFX10-NEXT: s_xor_b32 s11, s7, s17 +; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4 ; GFX10-NEXT: s_xor_b32 s8, s8, s18 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5 ; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6 -; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 -; GFX10-NEXT: s_xor_b32 s9, s9, s19 ; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7 +; GFX10-NEXT: s_xor_b32 s9, s9, s19 +; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1 ; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2 +; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; GFX10-NEXT: s_xor_b32 s22, s18, s2 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3 ; 
GFX10-NEXT: v_mul_lo_u32 v4, v0, s14 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15 ; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12 +; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 -; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 ; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 -; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4 ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5 +; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 +; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, 
vcc_lo ; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4 -; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6 -; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s13, v7 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1 ; GFX10-NEXT: s_xor_b32 s0, s19, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1 ; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2 @@ -2817,7 +2816,6 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX8-LABEL: sdivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s10, 0x100010 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_sext_i32_i16 s0, s3 ; GFX8-NEXT: s_ashr_i32 s8, s0, 31 @@ -2825,41 +2823,41 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX8-NEXT: s_xor_b32 s9, s0, s8 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9 ; GFX8-NEXT: s_sub_i32 s6, 0, s9 -; GFX8-NEXT: s_sext_i32_i16 s0, s2 -; GFX8-NEXT: s_bfe_i32 s1, s3, s10 +; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010 +; GFX8-NEXT: s_ashr_i32 s10, s1, 31 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: s_ashr_i32 s3, s0, 31 -; GFX8-NEXT: s_ashr_i32 s11, s1, 31 -; GFX8-NEXT: s_add_i32 s0, s0, s3 +; GFX8-NEXT: s_add_i32 s1, s1, s10 +; GFX8-NEXT: s_xor_b32 s11, s1, s10 +; GFX8-NEXT: s_sext_i32_i16 s0, s2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: 
v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: s_add_i32 s1, s1, s11 -; GFX8-NEXT: s_xor_b32 s0, s0, s3 -; GFX8-NEXT: s_xor_b32 s12, s1, s11 +; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX8-NEXT: s_ashr_i32 s3, s0, 31 +; GFX8-NEXT: s_add_i32 s0, s0, s3 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 -; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s12 +; GFX8-NEXT: s_xor_b32 s0, s0, s3 +; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v2 +; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2 +; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, v0, s9 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2 -; GFX8-NEXT: s_sub_i32 s1, 0, s12 +; GFX8-NEXT: s_sub_i32 s1, 0, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 -; GFX8-NEXT: s_bfe_i32 s1, s2, s10 +; GFX8-NEXT: s_bfe_i32 s1, s2, 0x100010 ; GFX8-NEXT: s_ashr_i32 s2, s1, 31 ; GFX8-NEXT: s_add_i32 s1, s1, s2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -2870,19 +2868,19 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 ; GFX8-NEXT: v_xor_b32_e32 v2, s3, v2 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s12 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s3, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; 
GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s11 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s10 ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 @@ -2907,45 +2905,44 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-LABEL: sdivrem_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s10, 0x100010 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_sext_i32_i16 s0, s7 ; GFX9-NEXT: s_ashr_i32 s8, s0, 31 ; GFX9-NEXT: s_add_i32 s0, s0, s8 ; GFX9-NEXT: s_xor_b32 s9, s0, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX9-NEXT: s_bfe_i32 s1, s7, s10 -; GFX9-NEXT: s_ashr_i32 s7, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s7 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010 +; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s11, s1, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_i32 s1, 0, s9 +; GFX9-NEXT: s_add_i32 s5, s5, s7 +; GFX9-NEXT: s_xor_b32 s5, s5, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: s_sub_i32 s10, 0, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_sext_i32_i16 s0, s6 -; GFX9-NEXT: s_ashr_i32 s12, s0, 31 -; GFX9-NEXT: 
v_mul_lo_u32 v2, s1, v0 +; GFX9-NEXT: s_sext_i32_i16 s4, s6 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0 +; GFX9-NEXT: s_ashr_i32 s10, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s0, s0, s12 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_xor_b32 s13, s0, s12 -; GFX9-NEXT: s_sub_i32 s0, 0, s11 +; GFX9-NEXT: s_add_i32 s4, s4, s10 +; GFX9-NEXT: s_xor_b32 s4, s4, s10 +; GFX9-NEXT: s_sub_i32 s11, 0, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0 -; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: s_bfe_i32 s4, s6, s10 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1 +; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010 +; GFX9-NEXT: s_ashr_i32 s11, s6, 31 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2 -; GFX9-NEXT: s_ashr_i32 s5, s4, 31 -; GFX9-NEXT: s_add_i32 s4, s4, s5 -; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 +; GFX9-NEXT: s_add_i32 s6, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 -; GFX9-NEXT: s_xor_b32 s4, s4, s5 +; GFX9-NEXT: s_xor_b32 s4, s6, s11 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 @@ -2956,28 +2953,28 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s6, s12, s8 +; GFX9-NEXT: s_xor_b32 s6, s10, s8 ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 
-; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: s_xor_b32 s4, s5, s7 -; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 +; GFX9-NEXT: s_xor_b32 s4, s11, s7 +; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 ; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s12, v2 +; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 ; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3 @@ -2990,19 +2987,18 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-LABEL: sdivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s2, 0x100010 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sext_i32_i16 s3, s1 -; GFX10-NEXT: s_bfe_i32 s1, s1, s2 -; GFX10-NEXT: s_ashr_i32 s8, s3, 31 -; GFX10-NEXT: s_ashr_i32 s9, s1, 31 -; GFX10-NEXT: s_add_i32 s3, s3, s8 -; GFX10-NEXT: s_add_i32 s1, s1, s9 -; GFX10-NEXT: s_xor_b32 s3, s3, s8 -; GFX10-NEXT: s_xor_b32 s1, s1, s9 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX10-NEXT: s_sext_i32_i16 s2, s1 +; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010 +; GFX10-NEXT: s_ashr_i32 s3, s2, 31 +; 
GFX10-NEXT: s_ashr_i32 s8, s1, 31 +; GFX10-NEXT: s_add_i32 s2, s2, s3 +; GFX10-NEXT: s_add_i32 s1, s1, s8 +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: s_xor_b32 s1, s1, s8 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s3 +; GFX10-NEXT: s_sub_i32 s6, 0, s2 ; GFX10-NEXT: s_sub_i32 s7, 0, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 @@ -3013,29 +3009,29 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX10-NEXT: s_sext_i32_i16 s6, s0 -; GFX10-NEXT: s_bfe_i32 s0, s0, s2 -; GFX10-NEXT: s_ashr_i32 s2, s6, 31 +; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010 +; GFX10-NEXT: s_ashr_i32 s9, s6, 31 ; GFX10-NEXT: s_ashr_i32 s10, s0, 31 -; GFX10-NEXT: s_add_i32 s6, s6, s2 +; GFX10-NEXT: s_add_i32 s6, s6, s9 ; GFX10-NEXT: s_add_i32 s0, s0, s10 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s6, s6, s2 +; GFX10-NEXT: s_xor_b32 s6, s6, s9 ; GFX10-NEXT: s_xor_b32 s0, s0, s10 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 
@@ -3043,28 +3039,27 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 -; GFX10-NEXT: s_xor_b32 s1, s2, s8 +; GFX10-NEXT: s_xor_b32 s1, s9, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s10, s9 +; GFX10-NEXT: s_xor_b32 s0, s10, s8 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2 +; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2 ; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2 ; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v3 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] @@ -3366,9 +3361,8 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0 ; GFX10-NEXT: 
v_subrev_nc_u32_e32 v1, s7, v1 -; GFX10-NEXT: s_mov_b32 s4, 0x7ffffff -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll index 7e5ecaac8d9c..97a09585b73e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -439,9 +439,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-NEXT: s_brev_b32 s4, -4 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v4, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0x3fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0x3fffffff, v1 ; GFX10-NEXT: v_mov_b32_e32 v5, v3 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3] ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5] @@ -523,9 +522,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_brev_b32 s4, -8 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 -; GFX10-NEXT: v_and_b32_e32 v2, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1 ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] @@ -608,13 +606,12 @@ define i32 @v_shl_i32_zext_i16(i16 %x) { define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) { ; GFX7-LABEL: s_shl_v2i32_zext_v2i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s2, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; 
GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff ; GFX7-NEXT: s_lshr_b32 s1, s0, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s2 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshl_b32 s0, s0, 2 ; GFX7-NEXT: s_lshl_b32 s1, s1, 2 ; GFX7-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index acf525adeeff..15173a7d5cea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -785,25 +785,23 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) { define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v2i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s3 ; GFX6-NEXT: s_lshl_b32 s0, s0, s2 -; GFX6-NEXT: s_and_b32 s1, s1, s4 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s3, 0xffff ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 -; GFX8-NEXT: s_lshr_b32 s4, s1, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 -; GFX8-NEXT: s_lshl_b32 s1, s2, s4 +; GFX8-NEXT: s_lshl_b32 s1, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s3 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -869,10 +867,10 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) { ; GFX6-LABEL: shl_v2i16_vs: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s2, 0xffff -; 
GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0 -; GFX6-NEXT: s_and_b32 s0, s1, s2 +; GFX6-NEXT: s_and_b32 s0, s1, 0xffff +; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1 ; GFX6-NEXT: v_and_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_and_b32_e32 v0, s2, v0 @@ -970,39 +968,37 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) { define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v4i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s5 ; GFX6-NEXT: s_lshl_b32 s0, s0, s4 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, s7 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s6, 0xffff ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 -; GFX8-NEXT: s_lshr_b32 s7, s2, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_lshr_b32 s5, s1, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 -; GFX8-NEXT: s_lshr_b32 s8, s3, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s2 -; GFX8-NEXT: s_lshl_b32 s2, s4, s7 +; GFX8-NEXT: s_lshl_b32 s2, s4, s6 ; GFX8-NEXT: s_lshl_b32 s1, s1, s3 -; GFX8-NEXT: s_lshl_b32 s3, s5, s8 +; GFX8-NEXT: s_lshl_b32 s3, s5, s7 ; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s6 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: 
s_or_b32 s0, s2, s0 ; GFX8-NEXT: s_lshl_b32 s2, s3, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s6 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_or_b32 s1, s2, s1 ; GFX8-NEXT: ; return to shader part epilog ; @@ -1144,67 +1140,65 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) { define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { ; GFX6-LABEL: s_shl_v8i16: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s16, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, s9 ; GFX6-NEXT: s_lshl_b32 s0, s0, s8 -; GFX6-NEXT: s_and_b32 s1, s1, s16 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, s10 ; GFX6-NEXT: s_lshl_b32 s3, s3, s11 -; GFX6-NEXT: s_and_b32 s0, s0, s16 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_lshl_b32 s5, s5, s13 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s16 -; GFX6-NEXT: s_and_b32 s2, s3, s16 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, s12 ; GFX6-NEXT: s_lshl_b32 s7, s7, s15 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s16 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_lshl_b32 s6, s6, s14 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s16 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s16 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s16 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_shl_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s12, 0xffff ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 -; GFX8-NEXT: s_lshr_b32 s13, s4, 16 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff +; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_lshr_b32 s9, s1, 16 -; GFX8-NEXT: 
s_and_b32 s1, s1, s12 -; GFX8-NEXT: s_lshr_b32 s14, s5, 16 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff +; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_lshl_b32 s4, s8, s13 +; GFX8-NEXT: s_lshl_b32 s4, s8, s12 ; GFX8-NEXT: s_lshr_b32 s10, s2, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 -; GFX8-NEXT: s_lshr_b32 s15, s6, 16 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff +; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_lshl_b32 s1, s1, s5 -; GFX8-NEXT: s_lshl_b32 s5, s9, s14 +; GFX8-NEXT: s_lshl_b32 s5, s9, s13 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s12 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s11, s3, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 -; GFX8-NEXT: s_lshr_b32 s16, s7, 16 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff +; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_lshl_b32 s2, s2, s6 -; GFX8-NEXT: s_lshl_b32 s6, s10, s15 +; GFX8-NEXT: s_lshl_b32 s6, s10, s14 ; GFX8-NEXT: s_or_b32 s0, s4, s0 ; GFX8-NEXT: s_lshl_b32 s4, s5, 16 -; GFX8-NEXT: s_and_b32 s1, s1, s12 +; GFX8-NEXT: s_and_b32 s1, s1, 0xffff ; GFX8-NEXT: s_lshl_b32 s3, s3, s7 -; GFX8-NEXT: s_lshl_b32 s7, s11, s16 +; GFX8-NEXT: s_lshl_b32 s7, s11, s15 ; GFX8-NEXT: s_or_b32 s1, s4, s1 ; GFX8-NEXT: s_lshl_b32 s4, s6, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s12 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s2, s4, s2 ; GFX8-NEXT: s_lshl_b32 s4, s7, 16 -; GFX8-NEXT: s_and_b32 s3, s3, s12 +; GFX8-NEXT: s_and_b32 s3, s3, 0xffff ; GFX8-NEXT: s_or_b32 s3, s4, s3 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index a85173ac78df..f4eb1437cce3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -377,13 +377,13 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; 
CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 @@ -508,13 +508,13 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5 +; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 0cddf3e2c86e..33d88ad98528 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -1164,9 +1164,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-LABEL: v_srem_v2i64_pow2k_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_movk_i32 s10, 0x1000 ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; 
GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1294,7 +1293,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x1000, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc @@ -1860,9 +1859,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-LABEL: v_srem_v2i64_oddk_denom: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb ; GISEL-NEXT: s_mov_b32 s6, 0 -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: s_mov_b32 s7, s6 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] @@ -1990,7 +1988,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: s_add_u32 s4, s10, 0 +; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; GISEL-NEXT: s_addc_u32 s5, 0, 0 ; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 38599949d777..95e1c42572dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -320,12 +320,11 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: 
v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp @@ -345,31 +344,28 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshr_b32 s2, s0, 8 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s1, s6, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s4, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_movk_i32 s2, 0xff ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: 
s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -379,32 +375,30 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_bfe_u32 s4, 8, 0x100000 ; GFX8-NEXT: s_lshr_b32 s2, s0, 8 ; GFX8-NEXT: s_lshl_b32 s0, s0, s4 -; GFX8-NEXT: s_sext_i32_i16 s7, s0 -; GFX8-NEXT: s_sext_i32_i16 s8, -1 -; GFX8-NEXT: s_movk_i32 s5, 0x7fff -; GFX8-NEXT: s_max_i32 s9, s7, s8 +; GFX8-NEXT: s_sext_i32_i16 s5, s0 +; GFX8-NEXT: s_sext_i32_i16 s6, -1 +; GFX8-NEXT: s_max_i32 s7, s5, s6 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 -; GFX8-NEXT: s_sub_i32 s9, s9, s5 -; GFX8-NEXT: s_movk_i32 s6, 0x8000 -; GFX8-NEXT: s_min_i32 s7, s7, s8 -; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s7, s7, s6 -; GFX8-NEXT: s_max_i32 s1, s9, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s7, s1 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sext_i32_i16 s5, s5 +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s4 ; GFX8-NEXT: s_lshl_b32 s2, s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 -; GFX8-NEXT: s_max_i32 s7, s3, s8 -; GFX8-NEXT: s_sub_i32 s5, s7, s5 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; GFX8-NEXT: s_max_i32 s5, s3, s6 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s2, s5, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; 
GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -413,10 +407,9 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s4 -; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_ashr_i32 s0, s0, s4 -; GFX8-NEXT: s_and_b32 s1, s1, s2 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, s4 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -424,17 +417,16 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-LABEL: s_ssubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -451,15 +443,14 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: 
s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] @@ -639,35 +630,33 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, 
v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_i16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_i16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_sub_i16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -685,59 +674,56 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshr_b32 s3, s0, 16 ; GFX6-NEXT: s_lshr_b32 s4, s0, 24 ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshr_b32 s5, s1, 8 ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s1, s10, s1 -; GFX6-NEXT: s_min_i32 s1, s1, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 
+; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s1, s8, s1 +; GFX6-NEXT: s_min_i32 s1, s1, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s5, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s10 +; GFX6-NEXT: s_min_i32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 ; GFX6-NEXT: s_lshl_b32 s3, s4, 24 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 -; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_movk_i32 s4, 0xff +; GFX6-NEXT: s_lshl_b32 s4, s7, 24 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_and_b32 s1, s1, s4 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s5, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 +; GFX6-NEXT: 
s_and_b32 s1, s2, 0xff ; GFX6-NEXT: s_ashr_i32 s3, s3, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s3, 0xff ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog @@ -749,35 +735,33 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 ; GFX8-NEXT: s_lshr_b32 s4, s0, 24 ; GFX8-NEXT: s_lshl_b32 s0, s0, s8 -; GFX8-NEXT: s_sext_i32_i16 s11, s0 -; GFX8-NEXT: s_sext_i32_i16 s12, -1 -; GFX8-NEXT: s_movk_i32 s9, 0x7fff -; GFX8-NEXT: s_max_i32 s13, s11, s12 +; GFX8-NEXT: s_sext_i32_i16 s9, s0 +; GFX8-NEXT: s_sext_i32_i16 s10, -1 +; GFX8-NEXT: s_max_i32 s11, s9, s10 ; GFX8-NEXT: s_lshr_b32 s5, s1, 8 ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s13, s13, s9 -; GFX8-NEXT: s_movk_i32 s10, 0x8000 -; GFX8-NEXT: s_min_i32 s11, s11, s12 -; GFX8-NEXT: s_sext_i32_i16 s13, s13 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s11, s11, s10 -; GFX8-NEXT: s_max_i32 s1, s13, s1 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x7fff +; GFX8-NEXT: s_min_i32 s9, s9, s10 ; GFX8-NEXT: s_sext_i32_i16 s11, s11 -; GFX8-NEXT: s_min_i32 s1, s1, s11 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s11, s1 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sext_i32_i16 s9, s9 +; GFX8-NEXT: s_min_i32 s1, s1, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_lshl_b32 s1, s2, s8 ; GFX8-NEXT: s_lshl_b32 s2, s5, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 -; GFX8-NEXT: s_max_i32 s11, s5, s12 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 -; GFX8-NEXT: s_sext_i32_i16 s11, s11 +; GFX8-NEXT: s_max_i32 s9, s5, s10 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 +; GFX8-NEXT: 
s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s2, s11, s2 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s9, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s2, s2, s5 @@ -785,12 +769,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s3, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s12 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -798,35 +782,34 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_sub_i32 s2, s2, s3 ; GFX8-NEXT: s_lshl_b32 s3, s4, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s3 -; GFX8-NEXT: s_max_i32 s6, s5, s12 +; GFX8-NEXT: s_max_i32 s6, s5, s10 ; GFX8-NEXT: s_lshl_b32 s4, s7, s8 -; GFX8-NEXT: s_sub_i32 s6, s6, s9 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_max_i32 s4, s6, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, s8 -; GFX8-NEXT: s_sub_i32 s3, s3, s4 -; GFX8-NEXT: s_movk_i32 s4, 0xff +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: 
s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_ashr_i32 s0, s0, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_and_b32 s1, s1, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_and_b32 s1, s1, 0xff ; GFX8-NEXT: s_ashr_i32 s2, s2, s8 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_sub_i32 s3, s3, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s2, s4 +; GFX8-NEXT: s_and_b32 s1, s2, 0xff ; GFX8-NEXT: s_ashr_i32 s3, s3, s8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: s_and_b32 s1, s3, s4 +; GFX8-NEXT: s_and_b32 s1, s3, 0xff ; GFX8-NEXT: s_lshl_b32 s1, s1, 24 ; GFX8-NEXT: s_or_b32 s0, s0, s1 ; GFX8-NEXT: ; return to shader part epilog @@ -838,27 +821,26 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, 
s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_i16 v0, s0, v0 clamp @@ -885,39 +867,37 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 
s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_i16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_i16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 @@ -1255,19 +1235,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v2i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s4 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: 
s_max_i32 s2, s2, s3 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -1275,19 +1253,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; ; GFX8-LABEL: s_ssubsat_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s4, -2 -; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_sub_i32 s6, s6, s4 -; GFX8-NEXT: s_min_i32 s7, s0, -1 -; GFX8-NEXT: s_sub_i32 s7, s7, s5 -; GFX8-NEXT: s_max_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s2, s2, s7 +; GFX8-NEXT: s_max_i32 s4, s0, -1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s5, s0, -1 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_max_i32 s2, s4, s2 +; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -1394,26 +1370,24 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v3i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s6, -2 -; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_brev_b32 s7, 1 -; GFX6-NEXT: s_sub_i32 s8, s8, s6 -; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, s7 -; GFX6-NEXT: s_max_i32 s3, s8, s3 -; GFX6-NEXT: s_min_i32 s3, s3, s9 +; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX6-NEXT: s_min_i32 s7, s0, -1 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_max_i32 s3, s6, s3 +; GFX6-NEXT: s_min_i32 s3, s3, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 -; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, 
s8, s7 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_min_i32 s6, s1, -1 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s4 -; GFX6-NEXT: s_min_i32 s3, s3, s8 +; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s6 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s5 ; GFX6-NEXT: s_min_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -1421,26 +1395,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; ; GFX8-LABEL: s_ssubsat_v3i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s6, -2 -; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_brev_b32 s7, 1 -; GFX8-NEXT: s_sub_i32 s8, s8, s6 -; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_sub_i32 s9, s9, s7 -; GFX8-NEXT: s_max_i32 s3, s8, s3 -; GFX8-NEXT: s_min_i32 s3, s3, s9 +; GFX8-NEXT: s_max_i32 s6, s0, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX8-NEXT: s_min_i32 s7, s0, -1 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX8-NEXT: s_max_i32 s3, s6, s3 +; GFX8-NEXT: s_min_i32 s3, s3, s7 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 -; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, s7 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_min_i32 s6, s1, -1 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s4 -; GFX8-NEXT: s_min_i32 s3, s3, s8 +; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, s6 +; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX8-NEXT: s_min_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s7 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s2, 
s2, s3 @@ -1568,33 +1540,31 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v4i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_max_i32 s8, s0, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s5 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s6 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s8 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX6-NEXT: s_min_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s7 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 @@ -1602,33 +1572,31 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; ; GFX8-LABEL: s_ssubsat_v4i32: ; GFX8: ; %bb.0: -; 
GFX8-NEXT: s_brev_b32 s8, -2 -; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_brev_b32 s9, 1 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_sub_i32 s11, s11, s9 -; GFX8-NEXT: s_max_i32 s4, s10, s4 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_max_i32 s8, s0, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX8-NEXT: s_min_i32 s9, s0, -1 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX8-NEXT: s_max_i32 s4, s8, s4 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_min_i32 s8, s1, -1 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 -; GFX8-NEXT: s_min_i32 s4, s4, s10 +; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff ; GFX8-NEXT: s_min_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s9 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s7 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 @@ -1694,17 +1662,16 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 
v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v13 +; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1738,17 +1705,16 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v13 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -1781,40 +1747,38 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v5i32: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_brev_b32 s10, -2 -; GFX6-NEXT: s_max_i32 s12, s0, -1 -; GFX6-NEXT: s_brev_b32 s11, 1 -; GFX6-NEXT: s_sub_i32 s12, s12, s10 -; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_sub_i32 s13, s13, s11 -; GFX6-NEXT: s_max_i32 s5, s12, s5 -; GFX6-NEXT: s_min_i32 s5, s5, s13 +; 
GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX6-NEXT: s_min_i32 s11, s0, -1 +; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX6-NEXT: s_max_i32 s5, s10, s5 +; GFX6-NEXT: s_min_i32 s5, s5, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 -; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, s11 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s10, s1, -1 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s6 -; GFX6-NEXT: s_min_i32 s5, s5, s12 +; GFX6-NEXT: s_min_i32 s5, s5, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s8 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s10 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s4, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s11 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s9 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s5 @@ -1822,40 +1786,38 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; ; GFX8-LABEL: s_ssubsat_v5i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s10, -2 -; GFX8-NEXT: s_max_i32 s12, s0, -1 -; GFX8-NEXT: s_brev_b32 s11, 1 -; GFX8-NEXT: s_sub_i32 s12, s12, s10 -; GFX8-NEXT: s_min_i32 s13, s0, -1 -; 
GFX8-NEXT: s_sub_i32 s13, s13, s11 -; GFX8-NEXT: s_max_i32 s5, s12, s5 -; GFX8-NEXT: s_min_i32 s5, s5, s13 +; GFX8-NEXT: s_max_i32 s10, s0, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX8-NEXT: s_min_i32 s11, s0, -1 +; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX8-NEXT: s_max_i32 s5, s10, s5 +; GFX8-NEXT: s_min_i32 s5, s5, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 -; GFX8-NEXT: s_min_i32 s12, s1, -1 -; GFX8-NEXT: s_sub_i32 s12, s12, s11 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_min_i32 s10, s1, -1 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s6 -; GFX8-NEXT: s_min_i32 s5, s5, s12 +; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s2, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s3, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s8 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, s10 +; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX8-NEXT: s_min_i32 s6, s4, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, s11 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s9 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s4, s4, s5 @@ -2197,117 +2159,115 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { ; GFX6-LABEL: s_ssubsat_v16i32: ; GFX6: 
; %bb.0: -; GFX6-NEXT: s_brev_b32 s32, -2 -; GFX6-NEXT: s_max_i32 s34, s0, -1 -; GFX6-NEXT: s_brev_b32 s33, 1 -; GFX6-NEXT: s_sub_i32 s34, s34, s32 -; GFX6-NEXT: s_min_i32 s35, s0, -1 -; GFX6-NEXT: s_sub_i32 s35, s35, s33 -; GFX6-NEXT: s_max_i32 s16, s34, s16 -; GFX6-NEXT: s_min_i32 s16, s16, s35 +; GFX6-NEXT: s_max_i32 s32, s0, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX6-NEXT: s_min_i32 s33, s0, -1 +; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX6-NEXT: s_max_i32 s16, s32, s16 +; GFX6-NEXT: s_min_i32 s16, s16, s33 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 -; GFX6-NEXT: s_min_i32 s34, s1, -1 -; GFX6-NEXT: s_sub_i32 s34, s34, s33 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s32, s1, -1 +; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s17 -; GFX6-NEXT: s_min_i32 s16, s16, s34 +; GFX6-NEXT: s_min_i32 s16, s16, s32 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s2, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s18 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s3, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s19 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s4, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s20 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; 
GFX6-NEXT: s_sub_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s5, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s21 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s6, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s22 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s7, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s23 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s8, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s24 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s9, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s25 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s10, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; 
GFX6-NEXT: s_max_i32 s16, s16, s26 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s11, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s27 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s12, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s28 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s13, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s29 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s14, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s30 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, s32 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX6-NEXT: s_min_i32 s17, s15, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, s33 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s31 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s15, s15, s16 @@ -2315,117 +2275,115 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; ; GFX8-LABEL: s_ssubsat_v16i32: ; 
GFX8: ; %bb.0: -; GFX8-NEXT: s_brev_b32 s32, -2 -; GFX8-NEXT: s_max_i32 s34, s0, -1 -; GFX8-NEXT: s_brev_b32 s33, 1 -; GFX8-NEXT: s_sub_i32 s34, s34, s32 -; GFX8-NEXT: s_min_i32 s35, s0, -1 -; GFX8-NEXT: s_sub_i32 s35, s35, s33 -; GFX8-NEXT: s_max_i32 s16, s34, s16 -; GFX8-NEXT: s_min_i32 s16, s16, s35 +; GFX8-NEXT: s_max_i32 s32, s0, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX8-NEXT: s_min_i32 s33, s0, -1 +; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX8-NEXT: s_max_i32 s16, s32, s16 +; GFX8-NEXT: s_min_i32 s16, s16, s33 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 -; GFX8-NEXT: s_min_i32 s34, s1, -1 -; GFX8-NEXT: s_sub_i32 s34, s34, s33 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_min_i32 s32, s1, -1 +; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s17 -; GFX8-NEXT: s_min_i32 s16, s16, s34 +; GFX8-NEXT: s_min_i32 s16, s16, s32 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s2, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s18 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s3, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s19 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s4, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s20 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; 
GFX8-NEXT: s_sub_i32 s4, s4, s16 ; GFX8-NEXT: s_max_i32 s16, s5, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s5, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s21 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s6, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s22 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s7, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s23 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s8, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s24 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s9, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s25 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s10, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; 
GFX8-NEXT: s_max_i32 s16, s16, s26 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s11, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s27 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s12, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s28 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s13, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s29 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s14, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s30 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, s32 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff ; GFX8-NEXT: s_min_i32 s17, s15, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, s33 +; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s31 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s15, s15, s16 @@ -2767,60 +2725,55 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-LABEL: s_ssubsat_v2i16: ; GFX6: ; 
%bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s4, -2 -; GFX6-NEXT: s_max_i32 s6, s0, -1 +; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_sub_i32 s6, s6, s4 -; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s5 -; GFX6-NEXT: s_max_i32 s2, s6, s2 -; GFX6-NEXT: s_min_i32 s2, s2, s7 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_min_i32 s5, s0, -1 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_max_i32 s2, s4, s2 +; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, s5 +; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_mov_b32 s2, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s0, s0, s2 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v2i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s6, s0 -; GFX8-NEXT: s_sext_i32_i16 s7, -1 -; GFX8-NEXT: s_movk_i32 s4, 0x7fff -; GFX8-NEXT: s_max_i32 s8, s6, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s0 +; GFX8-NEXT: s_sext_i32_i16 s5, -1 +; GFX8-NEXT: s_max_i32 s6, s4, s5 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 -; GFX8-NEXT: s_movk_i32 s5, 0x8000 -; GFX8-NEXT: s_min_i32 s6, s6, s7 -; GFX8-NEXT: s_sext_i32_i16 s8, s8 -; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s6, s6, s5 -; GFX8-NEXT: s_max_i32 s1, s8, s1 -; 
GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s1, s6, s1 +; GFX8-NEXT: s_sext_i32_i16 s1, s1 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_lshr_b32 s2, s0, 16 -; GFX8-NEXT: s_min_i32 s1, s1, s6 +; GFX8-NEXT: s_min_i32 s1, s1, s4 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 -; GFX8-NEXT: s_max_i32 s6, s1, s7 -; GFX8-NEXT: s_sub_i32 s4, s6, s4 -; GFX8-NEXT: s_min_i32 s1, s1, s7 +; GFX8-NEXT: s_max_i32 s4, s1, s5 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, s5 +; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -2853,22 +2806,20 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-LABEL: ssubsat_v2i16_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s2, -2 -; GFX6-NEXT: s_max_i32 s4, s0, -1 +; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_brev_b32 s3, 1 -; GFX6-NEXT: s_sub_i32 s4, s4, s2 -; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s3 -; GFX6-NEXT: v_max_i32_e32 v0, s4, v0 -; GFX6-NEXT: v_min_i32_e32 v0, s5, v0 +; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_min_i32 s3, s0, -1 +; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 +; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s1, s1, s2 +; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, s3 +; GFX6-NEXT: s_sub_i32 
s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -2883,25 +2834,23 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; ; GFX8-LABEL: ssubsat_v2i16_sv: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s4, s0 -; GFX8-NEXT: s_sext_i32_i16 s5, -1 -; GFX8-NEXT: s_movk_i32 s2, 0x7fff -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_movk_i32 s3, 0x8000 -; GFX8-NEXT: s_sub_i32 s6, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 +; GFX8-NEXT: s_sext_i32_i16 s2, s0 +; GFX8-NEXT: s_sext_i32_i16 s3, -1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sub_i32 s4, s4, s3 -; GFX8-NEXT: v_max_i16_e32 v1, s6, v0 -; GFX8-NEXT: v_min_i16_e32 v1, s4, v1 -; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_sub_i32 s2, s6, s2 -; GFX8-NEXT: s_min_i32 s4, s4, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: s_sub_i32 s3, s4, s3 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 +; GFX8-NEXT: v_min_i16_e32 v1, s2, v1 +; GFX8-NEXT: s_sext_i32_i16 s2, s1 +; GFX8-NEXT: s_max_i32 s4, s2, s3 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_min_i16_e32 v0, s3, v0 +; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_sub_u16_e32 v1, s0, v1 ; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -3123,97 +3072,92 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-LABEL: s_ssubsat_v4i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: 
s_brev_b32 s8, -2 -; GFX6-NEXT: s_max_i32 s10, s0, -1 +; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_brev_b32 s9, 1 -; GFX6-NEXT: s_sub_i32 s10, s10, s8 -; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, s9 -; GFX6-NEXT: s_max_i32 s4, s10, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s11 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_min_i32 s9, s0, -1 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_max_i32 s4, s8, s4 +; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 -; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s9 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_min_i32 s8, s1, -1 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s4, s4, s10 +; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s8 +; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, s9 +; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 -; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s3, s3, s4 -; GFX6-NEXT: s_mov_b32 s4, 0xffff +; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, 
s4 +; GFX6-NEXT: s_sub_i32 s3, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s4 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s4 -; GFX6-NEXT: s_and_b32 s2, s3, s4 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s10, s0 -; GFX8-NEXT: s_sext_i32_i16 s11, -1 -; GFX8-NEXT: s_movk_i32 s8, 0x7fff -; GFX8-NEXT: s_max_i32 s12, s10, s11 -; GFX8-NEXT: s_sub_i32 s12, s12, s8 +; GFX8-NEXT: s_sext_i32_i16 s8, s0 +; GFX8-NEXT: s_sext_i32_i16 s9, -1 +; GFX8-NEXT: s_max_i32 s10, s8, s9 +; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 -; GFX8-NEXT: s_movk_i32 s9, 0x8000 -; GFX8-NEXT: s_min_i32 s10, s10, s11 -; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s10, s10, s9 -; GFX8-NEXT: s_max_i32 s2, s12, s2 -; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_max_i32 s2, s10, s2 +; GFX8-NEXT: s_sext_i32_i16 s2, s2 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_lshr_b32 s4, s0, 16 -; GFX8-NEXT: s_min_i32 s2, s2, s10 +; GFX8-NEXT: s_min_i32 s2, s2, s8 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 -; GFX8-NEXT: s_max_i32 s10, s2, s11 -; GFX8-NEXT: s_sub_i32 s10, s10, s8 -; GFX8-NEXT: s_min_i32 s2, s2, s11 -; GFX8-NEXT: s_sext_i32_i16 s10, s10 +; GFX8-NEXT: s_max_i32 s8, s2, s9 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s2, s2, s9 +; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; 
GFX8-NEXT: s_sub_i32 s2, s2, s9 -; GFX8-NEXT: s_max_i32 s6, s10, s6 +; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_max_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_min_i32 s2, s6, s2 ; GFX8-NEXT: s_sub_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 -; GFX8-NEXT: s_max_i32 s6, s4, s11 -; GFX8-NEXT: s_sub_i32 s6, s6, s8 +; GFX8-NEXT: s_max_i32 s6, s4, s9 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s11 +; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3221,12 +3165,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 -; GFX8-NEXT: s_max_i32 s4, s3, s11 -; GFX8-NEXT: s_sub_i32 s4, s4, s8 -; GFX8-NEXT: s_min_i32 s3, s3, s11 +; GFX8-NEXT: s_max_i32 s4, s3, s9 +; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3440,121 +3384,116 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-LABEL: s_ssubsat_v6i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s12, -2 -; GFX6-NEXT: s_max_i32 s14, s0, -1 +; GFX6-NEXT: s_max_i32 s12, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_brev_b32 s13, 1 -; GFX6-NEXT: s_sub_i32 s14, s14, s12 -; GFX6-NEXT: s_min_i32 s15, s0, -1 -; GFX6-NEXT: s_sub_i32 s15, s15, s13 -; GFX6-NEXT: s_max_i32 s6, 
s14, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s15 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff +; GFX6-NEXT: s_min_i32 s13, s0, -1 +; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s12, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s13 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 -; GFX6-NEXT: s_min_i32 s14, s1, -1 -; GFX6-NEXT: s_sub_i32 s14, s14, s13 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_min_i32 s12, s1, -1 +; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s6, s6, s14 +; GFX6-NEXT: s_min_i32 s6, s6, s12 ; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s2, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: 
s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s12 +; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff ; GFX6-NEXT: s_min_i32 s8, s5, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, s13 -; GFX6-NEXT: s_max_i32 s6, s7, s6 -; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, s6 -; GFX6-NEXT: s_mov_b32 s6, 0xffff +; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s6 +; GFX6-NEXT: s_min_i32 s6, s6, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s6 +; GFX6-NEXT: s_sub_i32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s6 -; GFX6-NEXT: s_and_b32 s2, s3, s6 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s6 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s6 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_or_b32 s2, s2, s3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v6i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s14, s0 -; GFX8-NEXT: s_sext_i32_i16 s15, -1 -; GFX8-NEXT: s_movk_i32 s12, 0x7fff -; GFX8-NEXT: s_max_i32 s16, s14, s15 -; GFX8-NEXT: s_sub_i32 s16, s16, s12 +; GFX8-NEXT: s_sext_i32_i16 s12, s0 +; GFX8-NEXT: s_sext_i32_i16 s13, -1 +; GFX8-NEXT: s_max_i32 s14, s12, s13 +; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 -; GFX8-NEXT: s_movk_i32 s13, 0x8000 -; GFX8-NEXT: s_min_i32 s14, s14, s15 -; GFX8-NEXT: s_sext_i32_i16 s16, s16 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s14, 
s14, s13 -; GFX8-NEXT: s_max_i32 s3, s16, s3 -; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_min_i32 s12, s12, s13 ; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000 +; GFX8-NEXT: s_max_i32 s3, s14, s3 +; GFX8-NEXT: s_sext_i32_i16 s3, s3 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_lshr_b32 s6, s0, 16 -; GFX8-NEXT: s_min_i32 s3, s3, s14 +; GFX8-NEXT: s_min_i32 s3, s3, s12 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 -; GFX8-NEXT: s_max_i32 s14, s3, s15 -; GFX8-NEXT: s_sub_i32 s14, s14, s12 -; GFX8-NEXT: s_min_i32 s3, s3, s15 -; GFX8-NEXT: s_sext_i32_i16 s14, s14 +; GFX8-NEXT: s_max_i32 s12, s3, s13 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_min_i32 s3, s3, s13 +; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s3, s3, s13 -; GFX8-NEXT: s_max_i32 s9, s14, s9 +; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_max_i32 s9, s12, s9 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_min_i32 s3, s9, s3 ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s1 -; GFX8-NEXT: s_max_i32 s9, s6, s15 -; GFX8-NEXT: s_sub_i32 s9, s9, s12 +; GFX8-NEXT: s_max_i32 s9, s6, s13 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3562,25 +3501,25 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_min_i32 s4, s4, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s7 -; GFX8-NEXT: s_max_i32 s6, s4, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s4, 
s4, s15 +; GFX8-NEXT: s_max_i32 s6, s4, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s4, s13 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s2 ; GFX8-NEXT: s_sub_i32 s4, s7, s4 -; GFX8-NEXT: s_max_i32 s7, s6, s15 -; GFX8-NEXT: s_sub_i32 s7, s7, s12 +; GFX8-NEXT: s_max_i32 s7, s6, s13 +; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 -; GFX8-NEXT: s_min_i32 s6, s6, s15 +; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3588,12 +3527,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 -; GFX8-NEXT: s_max_i32 s6, s5, s15 -; GFX8-NEXT: s_sub_i32 s6, s6, s12 -; GFX8-NEXT: s_min_i32 s5, s5, s15 +; GFX8-NEXT: s_max_i32 s6, s5, s13 +; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_sub_i32 s5, s5, s13 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_bfe_u32 s3, s3, 0x100000 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3847,145 +3786,140 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-LABEL: s_ssubsat_v8i16: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: s_brev_b32 s16, -2 -; GFX6-NEXT: s_max_i32 s18, s0, -1 +; GFX6-NEXT: s_max_i32 s16, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 
-; GFX6-NEXT: s_brev_b32 s17, 1 -; GFX6-NEXT: s_sub_i32 s18, s18, s16 -; GFX6-NEXT: s_min_i32 s19, s0, -1 -; GFX6-NEXT: s_sub_i32 s19, s19, s17 -; GFX6-NEXT: s_max_i32 s8, s18, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s19 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_min_i32 s17, s0, -1 +; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s16, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 -; GFX6-NEXT: s_min_i32 s18, s1, -1 -; GFX6-NEXT: s_sub_i32 s18, s18, s17 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_min_i32 s16, s1, -1 +; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_min_i32 s8, s8, s18 +; GFX6-NEXT: s_min_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; 
GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s6, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s7, -1 +; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, s16 +; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff ; GFX6-NEXT: s_min_i32 s10, s7, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, s17 -; GFX6-NEXT: s_max_i32 s8, s9, s8 -; GFX6-NEXT: s_min_i32 s8, s8, s10 -; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, s8 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_and_b32 s1, s1, s8 +; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_max_i32 s8, s9, s8 +; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 ; GFX6-NEXT: s_ashr_i32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s0, s0, s8 +; GFX6-NEXT: s_min_i32 s8, s8, s10 +; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_ashr_i32 s5, s5, 16 +; GFX6-NEXT: s_sub_i32 s7, s7, s8 ; GFX6-NEXT: s_or_b32 s0, s0, s1 -; GFX6-NEXT: s_and_b32 s1, s2, s8 -; GFX6-NEXT: s_and_b32 s2, s3, 
s8 +; GFX6-NEXT: s_and_b32 s1, s2, 0xffff +; GFX6-NEXT: s_and_b32 s2, s3, 0xffff ; GFX6-NEXT: s_ashr_i32 s4, s4, 16 ; GFX6-NEXT: s_ashr_i32 s7, s7, 16 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_and_b32 s3, s5, s8 +; GFX6-NEXT: s_and_b32 s3, s5, 0xffff ; GFX6-NEXT: s_ashr_i32 s6, s6, 16 ; GFX6-NEXT: s_or_b32 s1, s1, s2 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: s_and_b32 s3, s6, s8 +; GFX6-NEXT: s_and_b32 s3, s6, 0xffff ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_or_b32 s3, s3, s4 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_ssubsat_v8i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_sext_i32_i16 s18, s0 -; GFX8-NEXT: s_sext_i32_i16 s19, -1 -; GFX8-NEXT: s_movk_i32 s16, 0x7fff -; GFX8-NEXT: s_max_i32 s20, s18, s19 -; GFX8-NEXT: s_sub_i32 s20, s20, s16 +; GFX8-NEXT: s_sext_i32_i16 s16, s0 +; GFX8-NEXT: s_sext_i32_i16 s17, -1 +; GFX8-NEXT: s_max_i32 s18, s16, s17 +; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 -; GFX8-NEXT: s_movk_i32 s17, 0x8000 -; GFX8-NEXT: s_min_i32 s18, s18, s19 -; GFX8-NEXT: s_sext_i32_i16 s20, s20 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s18, s18, s17 -; GFX8-NEXT: s_max_i32 s4, s20, s4 -; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sext_i32_i16 s18, s18 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000 +; GFX8-NEXT: s_max_i32 s4, s18, s4 +; GFX8-NEXT: s_sext_i32_i16 s4, s4 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_lshr_b32 s8, s0, 16 -; GFX8-NEXT: s_min_i32 s4, s4, s18 +; GFX8-NEXT: s_min_i32 s4, s4, s16 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 -; GFX8-NEXT: s_max_i32 s18, s4, s19 -; GFX8-NEXT: s_sub_i32 s18, s18, s16 -; GFX8-NEXT: s_min_i32 s4, s4, s19 -; GFX8-NEXT: s_sext_i32_i16 
s18, s18 +; GFX8-NEXT: s_max_i32 s16, s4, s17 +; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff +; GFX8-NEXT: s_min_i32 s4, s4, s17 +; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s4, s4, s17 -; GFX8-NEXT: s_max_i32 s12, s18, s12 +; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_max_i32 s12, s16, s12 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_min_i32 s4, s12, s4 ; GFX8-NEXT: s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s8, s1 -; GFX8-NEXT: s_max_i32 s12, s8, s19 -; GFX8-NEXT: s_sub_i32 s12, s12, s16 +; GFX8-NEXT: s_max_i32 s12, s8, s17 +; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3993,25 +3927,25 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s5, s5, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s9 -; GFX8-NEXT: s_max_i32 s8, s5, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s5, s5, s19 +; GFX8-NEXT: s_max_i32 s8, s5, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s5, s5, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s5, s17 +; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_min_i32 s5, s8, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s2 ; GFX8-NEXT: s_sub_i32 s5, s9, s5 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: 
s_lshr_b32 s14, s6, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4019,24 +3953,24 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s6, s6, s8 ; GFX8-NEXT: s_sub_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 -; GFX8-NEXT: s_max_i32 s8, s6, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 -; GFX8-NEXT: s_min_i32 s6, s6, s19 +; GFX8-NEXT: s_max_i32 s8, s6, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_min_i32 s6, s6, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 -; GFX8-NEXT: s_sub_i32 s6, s6, s17 +; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_min_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s3 -; GFX8-NEXT: s_max_i32 s9, s8, s19 -; GFX8-NEXT: s_sub_i32 s9, s9, s16 +; GFX8-NEXT: s_max_i32 s9, s8, s17 +; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 -; GFX8-NEXT: s_min_i32 s8, s8, s19 +; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 ; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4044,15 +3978,15 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_min_i32 s7, s7, s8 ; GFX8-NEXT: s_sub_i32 s3, s3, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_max_i32 s8, s7, s19 -; GFX8-NEXT: s_sub_i32 s8, s8, s16 +; GFX8-NEXT: s_max_i32 s8, s7, s17 +; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff ; GFX8-NEXT: s_bfe_u32 s4, s4, 
0x100000 -; GFX8-NEXT: s_min_i32 s7, s7, s19 +; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_sub_i32 s7, s7, s17 +; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_bfe_u32 s4, s5, 0x100000 @@ -4539,13 +4473,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX6-NEXT: s_ashr_i32 s4, s9, 31 -; GFX6-NEXT: s_mov_b32 s10, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX6-NEXT: s_brev_b32 s5, 1 -; GFX6-NEXT: s_cmp_lg_u32 s10, 0 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s8 -; GFX6-NEXT: s_addc_u32 s1, s4, s5 +; GFX6-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s2 @@ -4557,10 +4490,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX6-NEXT: s_ashr_i32 s4, s1, 31 -; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX6-NEXT: s_cmp_lg_u32 s6, 0 -; GFX6-NEXT: s_addc_u32 s3, s4, s5 +; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s3 @@ -4582,13 +4515,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX8-NEXT: s_ashr_i32 s4, s9, 31 -; GFX8-NEXT: s_mov_b32 s10, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; 
GFX8-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX8-NEXT: s_brev_b32 s5, 1 -; GFX8-NEXT: s_cmp_lg_u32 s10, 0 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_addc_u32 s1, s4, s5 +; GFX8-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 @@ -4600,10 +4532,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX8-NEXT: s_ashr_i32 s4, s1, 31 -; GFX8-NEXT: s_mov_b32 s6, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: s_xor_b64 vcc, s[2:3], vcc -; GFX8-NEXT: s_cmp_lg_u32 s6, 0 -; GFX8-NEXT: s_addc_u32 s3, s4, s5 +; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -4625,13 +4557,12 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 ; GFX9-NEXT: s_ashr_i32 s4, s9, 31 -; GFX9-NEXT: s_mov_b32 s10, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc -; GFX9-NEXT: s_brev_b32 s5, 1 -; GFX9-NEXT: s_cmp_lg_u32 s10, 0 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_addc_u32 s1, s4, s5 +; GFX9-NEXT: s_addc_u32 s1, s4, 0x80000000 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 @@ -4643,10 +4574,10 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 -; GFX9-NEXT: s_mov_b32 s6, 0 +; GFX9-NEXT: s_mov_b32 s5, 0 ; GFX9-NEXT: 
s_xor_b64 vcc, s[2:3], vcc -; GFX9-NEXT: s_cmp_lg_u32 s6, 0 -; GFX9-NEXT: s_addc_u32 s3, s4, s5 +; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: s_addc_u32 s3, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -4665,15 +4596,14 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: s_subb_u32 s9, s1, s5 ; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10-NEXT: s_mov_b32 s11, 0 +; GFX10-NEXT: s_mov_b32 s10, 0 ; GFX10-NEXT: s_ashr_i32 s0, s9, 31 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: s_brev_b32 s10, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: s_xor_b32 s8, s4, s1 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: s_sub_u32 s4, s2, s6 ; GFX10-NEXT: s_subb_u32 s5, s3, s7 ; GFX10-NEXT: v_mov_b32_e32 v2, s4 @@ -4683,9 +4613,9 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre ; GFX10-NEXT: v_mov_b32_e32 v3, s5 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 ; GFX10-NEXT: s_xor_b32 s2, s3, s2 -; GFX10-NEXT: s_cmp_lg_u32 s11, 0 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s0, s10 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 @@ -5481,10 +5411,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 ; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX6-NEXT: s_addc_u32 s1, s0, 0 -; GFX6-NEXT: s_brev_b32 s8, 1 ; GFX6-NEXT: s_addc_u32 s2, s0, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s3, s0, s8 +; GFX6-NEXT: s_addc_u32 s3, s0, 0x80000000 ; 
GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 @@ -5525,7 +5454,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX6-NEXT: s_addc_u32 s5, s4, 0 ; GFX6-NEXT: s_addc_u32 s6, s4, 0 ; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX6-NEXT: s_addc_u32 s7, s4, s8 +; GFX6-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, s0 @@ -5582,10 +5511,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 ; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_addc_u32 s1, s0, 0 -; GFX8-NEXT: s_brev_b32 s8, 1 ; GFX8-NEXT: s_addc_u32 s2, s0, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s3, s0, s8 +; GFX8-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 @@ -5632,7 +5560,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX8-NEXT: s_addc_u32 s5, s4, 0 ; GFX8-NEXT: s_addc_u32 s6, s4, 0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: s_addc_u32 s7, s4, s8 +; GFX8-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s5 ; GFX8-NEXT: v_mov_b32_e32 v3, s0 @@ -5689,10 +5617,9 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: s_addc_u32 s1, s0, 0 -; GFX9-NEXT: s_brev_b32 s8, 1 ; GFX9-NEXT: s_addc_u32 s2, s0, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s3, s0, s8 +; GFX9-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 @@ -5739,7 +5666,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX9-NEXT: s_addc_u32 s5, s4, 0 
; GFX9-NEXT: s_addc_u32 s6, s4, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_addc_u32 s7, s4, s8 +; GFX9-NEXT: s_addc_u32 s7, s4, 0x80000000 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: v_mov_b32_e32 v2, s5 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 @@ -5770,7 +5697,6 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_subb_u32 s18, s2, s10 ; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] ; GFX10-NEXT: s_subb_u32 s19, s3, s11 -; GFX10-NEXT: s_brev_b32 s21, 1 ; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX10-NEXT: s_cselect_b32 s20, 1, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 @@ -5792,7 +5718,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: s_addc_u32 s1, s0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: s_sub_u32 s8, s4, s12 ; GFX10-NEXT: s_subb_u32 s9, s5, s13 ; GFX10-NEXT: s_subb_u32 s10, s6, s14 @@ -5838,7 +5764,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, s10 ; GFX10-NEXT: s_addc_u32 s2, s0, 0 -; GFX10-NEXT: s_addc_u32 s3, s0, s21 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll index 913cb7306483..20cd35c1d469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.128.ll @@ -113,44 +113,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 ; GFX7-NEXT: s_load_dword s0, s[0:1], 
0x0 -; GFX7-NEXT: s_mov_b32 s1, 0x80008 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s4, s1 +; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 16 +; GFX7-NEXT: s_lshr_b32 s1, s4, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s2, s5, s1 +; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s2, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s2, s6, s1 +; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:9 -; GFX7-NEXT: s_lshr_b32 s2, s6, 24 +; GFX7-NEXT: s_lshr_b32 s1, s6, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:10 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:11 -; GFX7-NEXT: s_bfe_u32 s1, s7, s1 +; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: s_lshr_b32 s0, s7, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:12 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll index d19684bcbff9..7777b8403036 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-local.96.ll @@ -98,33 +98,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, ; GFX7: ; %bb.0: ; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX7-NEXT: s_mov_b32 s1, 0x80008 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s3, s4, s1 +; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_lshr_b32 s2, s4, 16 +; GFX7-NEXT: s_lshr_b32 s1, s4, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:1 ; GFX7-NEXT: s_lshr_b32 s0, s4, 24 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:3 -; GFX7-NEXT: s_bfe_u32 s2, s5, s1 +; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s5 ; GFX7-NEXT: s_lshr_b32 s0, s5, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:4 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:5 -; GFX7-NEXT: s_lshr_b32 s2, s5, 24 +; GFX7-NEXT: s_lshr_b32 s1, s5, 24 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:6 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:7 -; GFX7-NEXT: s_bfe_u32 s1, s6, s1 +; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008 ; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: s_lshr_b32 s0, s6, 16 ; GFX7-NEXT: ds_write_b8 v1, v0 offset:8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll 
index ec59cb495898..59d9ef9ec425 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/subo.ll @@ -967,7 +967,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX7-LABEL: usubo_i16_sv: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s1, 0xffff -; GFX7-NEXT: s_and_b32 s0, s0, s1 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_and_b32_e32 v1, s1, v0 @@ -980,7 +980,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) { ; GFX8-LABEL: usubo_i16_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_mov_b32 s1, 0xffff -; GFX8-NEXT: s_and_b32 s0, s0, s1 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_and_b32_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_and_b32_e32 v1, s1, v0 @@ -992,8 +992,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) { ; ; GFX9-LABEL: usubo_i16_sv: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_mov_b32 s1, 0xffff -; GFX9-NEXT: s_and_b32 s0, s0, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll index ae5c8f9a4ade..fc67a417c709 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/trunc.ll @@ -162,23 +162,21 @@ define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) { define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) { ; GFX7-LABEL: s_trunc_v4i32_to_v4i16: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: 
s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_trunc_v4i32_to_v4i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_mov_b32 s4, 0xffff ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog %trunc = trunc <4 x i32> %src to <4 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index 24284351fc91..5134da030ec2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -242,12 +242,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp @@ -305,17 +304,16 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, 
i16 inreg %rhs.arg) { ; GFX9-LABEL: s_uaddsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -332,15 +330,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -466,35 +463,33 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; 
GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_add_u16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; 
GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -590,27 +585,26 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: 
s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp @@ -637,39 +631,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, 
s5 ; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 12423fc70269..0f1b97e9d3ed 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -910,9 +910,8 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32 ; GFX10-LABEL: udivrem_v4i32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10 -; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13 @@ -926,9 +925,9 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32 ; GFX10-NEXT: s_sub_i32 s1, 0, s13 ; GFX10-NEXT: s_sub_i32 s2, 0, s14 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 -; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4 -; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4 -; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4 +; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 +; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -2269,24 +2268,24 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX8-LABEL: udivrem_v2i16: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s2, 0xffff ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX8-NEXT: s_mov_b32 s8, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s3, s1, s2 -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_lshr_b32 s8, s1, 16 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8 -; GFX8-NEXT: s_sub_i32 s1, 0, s3 +; GFX8-NEXT: s_and_b32 s2, s1, 0xffff +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX8-NEXT: s_lshr_b32 s3, s1, 16 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX8-NEXT: s_sub_i32 s1, 0, s2 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_lshr_b32 s9, s0, 16 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX8-NEXT: s_sub_i32 s1, 0, s8 +; GFX8-NEXT: s_sub_i32 s1, 0, s3 ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -2294,34 +2293,34 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: 
v_subrev_u32_e64 v3, s[0:1], s3, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3 -; GFX8-NEXT: v_and_b32_e32 v1, s2, v1 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 +; GFX8-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v0, s2, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s8, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v0, s4 @@ -2335,54 +2334,53 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-LABEL: udivrem_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, 
s1, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX9-NEXT: s_lshr_b32 s6, s1, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 +; GFX9-NEXT: s_and_b32 s3, s1, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX9-NEXT: s_sub_i32 s1, 0, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_sub_i32 s3, 0, s6 +; GFX9-NEXT: s_sub_i32 s6, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s9, s0, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_lshr_b32 s8, s0, 16 ; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_lshr_b32 s1, s0, 16 +; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, 
s8, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 @@ -2391,43 +2389,42 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: global_store_dword v2, v1, s[2:3] +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] +; GFX9-NEXT: global_store_dword v2, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s3, 0xffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshr_b32 s2, s1, 16 -; GFX10-NEXT: s_and_b32 s1, s1, s3 +; GFX10-NEXT: s_and_b32 s1, s1, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1 -; GFX10-NEXT: s_sub_i32 s6, 0, s2 +; GFX10-NEXT: s_sub_i32 s3, 0, s2 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0 -; GFX10-NEXT: s_sub_i32 s6, 0, s1 -; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX10-NEXT: s_lshr_b32 s6, s0, 16 -; GFX10-NEXT: s_and_b32 s0, s0, s3 +; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0 +; GFX10-NEXT: s_sub_i32 s3, 0, s1 +; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 +; GFX10-NEXT: s_and_b32 s0, s0, 0xffff ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0 +; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2 -; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3 ; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2 @@ -2445,14 +2442,13 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-NEXT: v_and_or_b32 v0, v1, v4, v0 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) 
; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: global_store_dword v1, v2, s[6:7] @@ -2586,9 +2582,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX8-LABEL: udivrem_i27: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX8-NEXT: s_mov_b32 s8, 0x7ffffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s7, s7, s8 +; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7 ; GFX8-NEXT: s_sub_i32 s0, 0, s7 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 @@ -2596,7 +2591,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX8-NEXT: s_and_b32 s4, s6, s8 +; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff +; GFX8-NEXT: s_mov_b32 s5, 0x7ffffff ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 @@ -2614,11 +2610,11 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v2 +; GFX8-NEXT: v_and_b32_e32 v2, s5, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_and_b32_e32 v2, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v2, s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2626,49 +2622,48 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX9-LABEL: udivrem_i27: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX9-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 
v0, s7 -; GFX9-NEXT: s_sub_i32 s1, 0, s7 -; GFX9-NEXT: s_and_b32 s8, s0, s6 +; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: s_sub_i32 s1, 0, s6 +; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] -; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: v_and_b32_e32 v0, s4, v1 ; GFX9-NEXT: global_store_dword v2, v0, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i27: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 -; GFX10-NEXT: s_mov_b32 s6, 0x7ffffff ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s7, s1, s6 -; GFX10-NEXT: s_and_b32 s0, s0, s6 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 -; GFX10-NEXT: 
s_sub_i32 s1, 0, s7 +; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff +; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX10-NEXT: s_sub_i32 s1, 0, s6 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2676,22 +2671,22 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1) ; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7 +; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6 ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_and_b32_e32 v0, s6, v0 -; GFX10-NEXT: v_and_b32_e32 v1, s6, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_store_dword v2, v0, s[0:1] ; GFX10-NEXT: global_store_dword v2, v1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 738cd237eb01..df31f96f177a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -272,13 +272,13 @@ define <2 x i32> 
@v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2 +; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 ; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 27de0ccd4b23..a08c9277393e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -1079,9 +1079,9 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc @@ -1099,67 +1099,67 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0 -; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8 -; GISEL-NEXT: s_sub_u32 s6, 0, s8 -; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6 +; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb +; 
GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 +; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb +; GISEL-NEXT: v_cvt_f32_u32_e32 v8, s8 ; GISEL-NEXT: s_subb_u32 s7, 0, 0 ; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000 -; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GISEL-NEXT: v_mov_b32_e32 v5, s4 -; GISEL-NEXT: v_mov_b32_e32 v4, s5 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GISEL-NEXT: s_sub_u32 s9, 0, s8 -; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 -; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 +; GISEL-NEXT: v_madmk_f32 v9, v7, 0x4f800000, v8 +; GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 +; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7 +; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb +; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8 ; GISEL-NEXT: s_subb_u32 s10, 0, 0 -; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000 ; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000 -; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7 -; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6 -; GISEL-NEXT: v_mov_b32_e32 v10, s4 -; GISEL-NEXT: v_trunc_f32_e32 v8, v8 +; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000 +; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7 +; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 -; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8 -; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9 +; GISEL-NEXT: v_trunc_f32_e32 v10, v10 +; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 +; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7 -; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6 -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9 +; GISEL-NEXT: 
v_cvt_u32_f32_e32 v8, v8 +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10 ; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7 ; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7 ; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7 -; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6 +; GISEL-NEXT: v_mul_lo_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v18, s9, v8 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16 -; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16 +; GISEL-NEXT: v_mul_hi_u32 v15, v8, v16 +; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12 +; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12 ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 ; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11 -; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11 +; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 ; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v12 ; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v13, 
s[4:5], v13, v14 @@ -1167,8 +1167,8 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v18, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18 @@ -1183,40 +1183,40 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7 ; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7 ; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6 -; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6 -; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, s10, v8 +; GISEL-NEXT: v_mul_hi_u32 v16, s9, v8 +; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9 +; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11 ; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9 +; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17 -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 +; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12 ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12 ; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 -; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15 +; GISEL-NEXT: v_mul_lo_u32 v16, v8, v15 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14 ; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13 -; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13 +; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13 ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19 ; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19 -; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15 +; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15 ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 @@ -1224,125 +1224,126 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15 +; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17 ; GISEL-NEXT: v_mov_b32_e32 v19, s11 -; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13 -; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 +; GISEL-NEXT: v_mov_b32_e32 v18, s12 +; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13 
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 ; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc ; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7 ; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6 -; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8 -; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8 -; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8 -; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8 -; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9 -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v14, v2, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8 +; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v13, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14 +; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v16, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v13, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16 +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7 ; GISEL-NEXT: 
v_mul_lo_u32 v15, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6 -; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12 -; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, s8, v8 +; GISEL-NEXT: v_mul_lo_u32 v16, 0, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc +; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 +; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v7, vcc ; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7 -; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14 -; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6 -; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12 +; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 
s[6:7], v2, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5] -; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0 +; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v2 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GISEL-NEXT: v_subrev_i32_e32 v10, vcc, s8, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v8, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc -; GISEL-NEXT: v_subrev_i32_e32 v14, vcc, s8, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, v14, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, 
v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] -; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] +; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5] ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_urem_v2i64_oddk_denom: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll index 3a1566b63e50..fe46feea4ebb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -236,12 +236,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4 +; GFX10-NEXT: 
v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp @@ -297,17 +296,16 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX9-LABEL: s_usubsat_v2i8: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_lshr_b32 s2, s0, 8 -; GFX9-NEXT: s_lshr_b32 s3, s1, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s3, s1, 8 +; GFX9-NEXT: s_lshr_b32 s2, s0, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX9-NEXT: s_mov_b32 s2, 0x80008 -; GFX9-NEXT: s_lshr_b32 s3, s0, 16 -; GFX9-NEXT: s_lshl_b32 s0, s0, s2 -; GFX9-NEXT: s_lshl_b32 s3, s3, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX9-NEXT: s_lshr_b32 s3, s1, 16 -; GFX9-NEXT: s_lshl_b32 s1, s1, s2 -; GFX9-NEXT: s_lshl_b32 s2, s3, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX9-NEXT: s_lshr_b32 s2, s1, 16 +; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s2, s2, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -324,15 +322,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s3, s1, 8 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 -; GFX10-NEXT: s_mov_b32 s2, 0x80008 -; GFX10-NEXT: s_lshr_b32 s3, s0, 16 -; GFX10-NEXT: s_lshr_b32 s4, s1, 16 -; GFX10-NEXT: s_lshl_b32 s0, s0, s2 +; GFX10-NEXT: s_lshr_b32 s2, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s1, 16 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s2, s2, 8 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s2 -; GFX10-NEXT: s_lshl_b32 s2, s4, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 -; GFX10-NEXT: 
s_pack_ll_b32_b16 s1, s1, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] @@ -454,35 +451,33 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1 ; GFX10-NEXT: s_mov_b32 s4, 8 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3 -; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4 -; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5 +; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2 +; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6 +; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3 +; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_mov_b32_e32 v4, 24 -; 
GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] +; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp -; GFX10-NEXT: v_pk_sub_u16 v1, v3, v2 clamp +; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp ; GFX10-NEXT: v_mov_b32_e32 v2, 8 ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -574,27 +569,26 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX9-NEXT: s_lshr_b32 s6, s0, 24 ; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6 -; GFX9-NEXT: s_mov_b32 s4, 0x80008 -; GFX9-NEXT: s_lshr_b32 s6, s0, 16 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 ; GFX9-NEXT: s_lshr_b32 s7, s1, 8 -; GFX9-NEXT: s_lshl_b32 s0, s0, s4 -; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 ; GFX9-NEXT: s_lshr_b32 s8, s1, 16 ; GFX9-NEXT: s_lshr_b32 s9, s1, 24 -; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6 -; GFX9-NEXT: s_lshr_b32 s6, s3, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 +; GFX9-NEXT: s_lshr_b32 s4, s3, 16 ; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshl_b32 s3, s3, s4 +; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 +; GFX9-NEXT: s_lshl_b32 s4, s4, 8 +; GFX9-NEXT: s_lshr_b32 s6, s1, 16 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9 +; GFX9-NEXT: s_lshl_b32 
s1, s1, 0x80008 +; GFX9-NEXT: s_lshl_b32 s6, s6, 8 +; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6 +; GFX9-NEXT: s_lshr_b32 s6, s4, 16 +; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008 ; GFX9-NEXT: s_lshl_b32 s6, s6, 8 -; GFX9-NEXT: s_lshr_b32 s7, s1, 16 -; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6 -; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9 -; GFX9-NEXT: s_lshl_b32 s1, s1, s4 -; GFX9-NEXT: s_lshl_b32 s7, s7, 8 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7 -; GFX9-NEXT: s_lshr_b32 s7, s6, 16 -; GFX9-NEXT: s_lshl_b32 s4, s6, s4 -; GFX9-NEXT: s_lshl_b32 s6, s7, 8 ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6 ; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp @@ -621,39 +615,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 ; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 -; GFX10-NEXT: s_mov_b32 s3, 0x80008 -; GFX10-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-NEXT: s_lshr_b32 s3, s0, 16 ; GFX10-NEXT: s_lshr_b32 s5, s1, 8 ; GFX10-NEXT: s_lshr_b32 s6, s1, 16 ; GFX10-NEXT: s_lshr_b32 s7, s1, 24 -; GFX10-NEXT: s_lshl_b32 s0, s0, s3 +; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 +; GFX10-NEXT: s_lshl_b32 s3, s3, 8 +; GFX10-NEXT: s_lshr_b32 s4, s2, 16 +; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 ; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3 ; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 -; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-NEXT: s_lshr_b32 s8, s2, 16 -; GFX10-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-NEXT: s_lshr_b32 s6, s4, 16 -; GFX10-NEXT: s_lshl_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s8, s8, 8 -; GFX10-NEXT: s_lshl_b32 s1, s1, s3 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 +; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4 +; GFX10-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-NEXT: s_lshr_b32 s5, s3, 16 +; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 +; GFX10-NEXT: s_lshl_b32 s4, s4, 8 +; GFX10-NEXT: s_lshl_b32 
s3, s3, 0x80008 ; GFX10-NEXT: s_lshl_b32 s5, s5, 8 -; GFX10-NEXT: s_lshl_b32 s3, s4, s3 -; GFX10-NEXT: s_lshl_b32 s4, s6, 8 -; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8 -; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 -; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 +; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 ; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp ; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp ; GFX10-NEXT: s_mov_b32 s0, 8 -; GFX10-NEXT: s_movk_i32 s1, 0xff ; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] ; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_b32_e32 v3, s1, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 ; GFX10-NEXT: s_mov_b32 s0, 24 ; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2 +; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll index 89b3900dc288..c1a44cef4aa2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/xnor.ll @@ -25,12 +25,11 @@ entry: define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v2i16_one_use: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_mov_b32 s4, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s4 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s4 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_xor_b32 s0, s0, s1 ; GFX7-NEXT: s_xor_b32 s0, s0, -1 @@ -42,10 +41,10 @@ define 
amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in ; GFX8-NEXT: s_xor_b32 s0, s0, s1 ; GFX8-NEXT: s_mov_b32 s3, s2 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s2 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) { ; GFX7-LABEL: scalar_xnor_v4i16_one_use: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_lshl_b32 s1, s1, 16 -; GFX7-NEXT: s_and_b32 s0, s0, s8 +; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_or_b32 s0, s1, s0 ; GFX7-NEXT: s_lshl_b32 s1, s3, 16 -; GFX7-NEXT: s_and_b32 s2, s2, s8 +; GFX7-NEXT: s_and_b32 s2, s2, 0xffff ; GFX7-NEXT: s_or_b32 s1, s1, s2 ; GFX7-NEXT: s_lshl_b32 s2, s5, 16 -; GFX7-NEXT: s_and_b32 s3, s4, s8 +; GFX7-NEXT: s_and_b32 s3, s4, 0xffff ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s3, s7, 16 -; GFX7-NEXT: s_and_b32 s4, s6, s8 +; GFX7-NEXT: s_and_b32 s4, s6, 0xffff ; GFX7-NEXT: s_or_b32 s3, s3, s4 ; GFX7-NEXT: s_mov_b32 s4, -1 ; GFX7-NEXT: s_mov_b32 s5, s4 @@ -142,16 +140,16 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in ; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX8-NEXT: s_mov_b32 s5, s4 ; GFX8-NEXT: s_lshr_b32 s3, s0, 16 -; GFX8-NEXT: s_and_b32 s2, s0, s4 +; GFX8-NEXT: s_and_b32 s2, s0, 0xffff ; GFX8-NEXT: s_lshr_b32 s7, s1, 16 -; GFX8-NEXT: s_and_b32 s6, s1, s4 +; GFX8-NEXT: s_and_b32 s6, s1, 0xffff ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] ; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5] ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 -; GFX8-NEXT: s_and_b32 s0, s0, s4 +; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; 
GFX8-NEXT: s_or_b32 s0, s1, s0 ; GFX8-NEXT: s_lshl_b32 s1, s3, 16 -; GFX8-NEXT: s_and_b32 s2, s2, s4 +; GFX8-NEXT: s_and_b32 s2, s2, 0xffff ; GFX8-NEXT: s_or_b32 s1, s1, s2 ; GFX8-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 5b35f592dd27..f88ce8b36825 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -205,7 +205,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1) ; GFX9PLUS: global_load_dword [[B:v[0-9]+]] ; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] -; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]] +; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] ; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9PLUS: buffer_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 1bd1a22bf5a9..a936bcf4ef31 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -150,7 +150,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 @@ -232,7 +232,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: v_mul_hi_u32 v2, v2, s4 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v0 ; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2 -; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3 +; GCN-NEXT: 
v_sub_u32_e32 v3, vcc, 0xf4240, v3 ; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0 ; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 9f1e3f9e3922..6b6734efae92 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2526,12 +2526,12 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX6-NEXT: s_and_b32 s9, s4, 0xffff ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -2546,11 +2546,11 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 @@ -2566,7 +2566,7 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 -; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 @@ -2576,8 +2576,8 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2586,60 +2586,59 @@ define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX9-LABEL: udiv_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 -; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: 
v_mul_f32_e32 v1, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s1, s7, s0 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v4, -v1, v3, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: s_and_b32 s0, s5, s0 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: s_and_b32 s2, s5, 0xffff ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: s_lshr_b32 s2, s7, 16 +; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_lshr_b32 s0, s7, 16 -; GFX9-NEXT: v_mad_f32 v4, -v1, v5, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: s_lshr_b32 s0, s5, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 +; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: s_lshr_b32 s2, s5, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; 
GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GFX9-NEXT: v_mad_f32 v4, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out @@ -2743,14 +2742,15 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_and_b32 s8, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s6 +; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s9, s4, 0xffff ; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 @@ -2772,10 +2772,10 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_mad_f32 v1, -v1, v5, v6 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 -; GFX6-NEXT: s_and_b32 s6, s7, s8 +; 
GFX6-NEXT: s_and_b32 s6, s7, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX6-NEXT: s_and_b32 s6, s5, s8 +; GFX6-NEXT: s_and_b32 s6, s5, 0xffff ; GFX6-NEXT: v_mul_lo_u32 v1, v1, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 @@ -2801,7 +2801,6 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 ; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 @@ -2815,68 +2814,67 @@ define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x ; GFX9-LABEL: urem_v4i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, 
v5, v6 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: s_and_b32 s8, s7, s0 -; GFX9-NEXT: v_mad_f32 v4, -v1, v3, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s8, s5, 0xffff +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s6, s7, 16 -; GFX9-NEXT: v_mad_f32 v4, -v3, v5, v6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 +; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 ; GFX9-NEXT: s_lshr_b32 s5, s5, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 -; GFX9-NEXT: v_sub_u32_e32 v0, s1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 -; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4 -; GFX9-NEXT: v_mad_f32 v4, -v4, v6, v7 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v6 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 -; GFX9-NEXT: 
v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v1, s0, v3 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 +; GFX9-NEXT: v_trunc_f32_e32 v3, v3 +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 +; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff +; GFX9-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v3, v0 +; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %r = urem <4 x i16> %x, %y store <4 x i16> %r, <4 x i16> addrspace(1)* %out @@ -3831,45 +3829,44 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s2, s6, s8 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX6-NEXT: s_and_b32 s9, s4, s8 +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 
v0, s9 ; GFX6-NEXT: s_lshr_b32 s6, s6, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 +; GFX6-NEXT: s_and_b32 s8, s4, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s4, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s4, s7, s8 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc -; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 +; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_and_b32_e32 
v0, s8, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3878,47 +3875,46 @@ define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX9-LABEL: udiv_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 -; GFX9-NEXT: s_lshr_b32 s1, s4, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: s_lshr_b32 s4, s4, 16 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: v_mad_f32 v2, -v4, v1, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 -; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 -; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: s_and_b32 s1, s7, s0 +; GFX9-NEXT: s_and_b32 s2, s7, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 -; GFX9-NEXT: v_mad_f32 v4, -v2, v3, v5 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc -; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX9-NEXT: v_trunc_f32_e32 v2, v5 +; GFX9-NEXT: s_and_b32 s2, s5, 0xffff +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc +; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_short v0, v3, s[2:3] offset:4 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v6, v0, s[0:1] ; GFX9-NEXT: s_endpgm %r = udiv <3 x i16> %x, %y store <3 x i16> %r, <3 x i16> addrspace(1)* %out @@ -4003,9 +3999,9 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: s_mov_b32 s8, 0xffff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s9, s6, s8 +; GFX6-NEXT: s_and_b32 s9, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 -; GFX6-NEXT: s_and_b32 s2, s4, s8 +; GFX6-NEXT: s_and_b32 s2, s4, 0xffff ; GFX6-NEXT: 
v_cvt_f32_u32_e32 v3, s2 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 @@ -4025,13 +4021,13 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 -; GFX6-NEXT: s_and_b32 s4, s7, s8 +; GFX6-NEXT: s_and_b32 s4, s7, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: s_and_b32 s4, s5, s8 +; GFX6-NEXT: s_and_b32 s4, s5, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 @@ -4057,51 +4053,51 @@ define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x ; GFX9-LABEL: urem_v3i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 -; GFX9-NEXT: s_and_b32 s3, s4, s2 +; GFX9-NEXT: s_and_b32 s3, s6, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_and_b32 s2, s4, 0xffff +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s6, s6, 16 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_lshr_b32 s4, s4, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 -; GFX9-NEXT: s_and_b32 s7, s7, s2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s7 -; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, v0 -; GFX9-NEXT: v_trunc_f32_e32 v2, v5 -; GFX9-NEXT: s_and_b32 s2, s5, s2 +; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_trunc_f32_e32 v5, v5 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: s_and_b32 s3, s7, 0xffff +; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 +; GFX9-NEXT: s_and_b32 s5, s5, 0xffff +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 +; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 +; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX9-NEXT: v_sub_u32_e32 v0, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 +; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 ; GFX9-NEXT: global_store_dword v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ 
-4611,18 +4607,18 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s9, s0, s3 +; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 +; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GFX6-NEXT: s_and_b32 s8, s2, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 @@ -4671,8 +4667,8 @@ define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, s6 -; GFX9-NEXT: s_and_b32 s7, s0, s6 +; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f @@ -4800,45 +4796,45 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 -; GFX6-NEXT: s_movk_i32 s3, 0x7fff -; GFX6-NEXT: s_and_b32 s10, s0, s3 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 -; GFX6-NEXT: s_and_b32 s9, s2, s3 +; GFX6-NEXT: s_and_b32 s9, s2, 0x7fff ; GFX6-NEXT: 
v_cvt_f32_u32_e32 v3, s9 +; GFX6-NEXT: s_and_b32 s10, s0, 0x7fff +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 ; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 +; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 ; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 -; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 +; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: s_movk_i32 s3, 0x7fff ; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 -; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 +; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 +; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 -; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 +; GFX6-NEXT: s_lshr_b32 s0, s0, 15 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc @@ -4868,8 +4864,8 @@ define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: 
v_alignbit_b32 v0, s3, v0, 30 -; GFX9-NEXT: s_and_b32 s3, s2, s6 -; GFX9-NEXT: s_and_b32 s8, s0, s6 +; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff +; GFX9-NEXT: s_and_b32 s8, s0, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f @@ -5675,24 +5671,23 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 ; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, s2, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_lshl_b32 s2, s2, s7 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 -; GFX6-NEXT: s_sub_i32 s0, 0, s3 +; GFX6-NEXT: s_sub_i32 s0, 0, s2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX6-NEXT: s_sub_i32 s0, 0, s2 +; GFX6-NEXT: s_sub_i32 s0, 0, s3 ; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 @@ -5700,25 +5695,25 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 
s4, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -5726,19 +5721,18 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s2, 0x1000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s6, s2, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_lshl_b32 s7, s2, s7 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 +; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 ; GFX9-NEXT: s_sub_i32 s3, 0, s7 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 @@ -5747,26 +5741,26 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 -; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s7 +; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 +; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_subrev_u32_e32 v5, s6, v3 -; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] +; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 @@ -5913,12 +5907,11 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s6, 0xfff ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX6-NEXT: s_and_b32 s4, s4, s6 -; GFX6-NEXT: s_and_b32 s5, s5, s6 +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s5, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -5928,13 +5921,12 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xfff +; GFX9-NEXT: s_and_b32 s1, s3, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, @@ -6011,27 +6003,26 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-LABEL: urem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s2, 0x1000 +; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s6, s2, s6 +; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, s2, s7 +; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 ; GFX6-NEXT: 
s_sub_i32 s2, 0, s7 ; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 @@ -6041,12 +6032,12 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -6059,22 +6050,21 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX9-LABEL: urem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_movk_i32 s2, 0x1000 -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, s2, s7 -; GFX9-NEXT: s_lshl_b32 s2, s2, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_mov_b32 s6, 0x4f7ffffe -; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: 
s_sub_i32 s6, 0, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s3 ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 @@ -6084,22 +6074,23 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y @@ -6478,140 +6469,138 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; ; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s2, 0x1000 -; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe -; GFX6-NEXT: s_mov_b32 s11, 0xf000 +; 
GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s3, s2, s6 -; GFX6-NEXT: s_ashr_i32 s6, s3, 31 -; GFX6-NEXT: s_add_i32 s3, s3, s6 -; GFX6-NEXT: s_xor_b32 s3, s3, s6 -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX6-NEXT: s_lshl_b32 s0, s2, s7 -; GFX6-NEXT: s_sub_i32 s7, 0, s3 -; GFX6-NEXT: s_ashr_i32 s2, s0, 31 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 +; GFX6-NEXT: s_ashr_i32 s1, s0, 31 +; GFX6-NEXT: s_add_i32 s0, s0, s1 +; GFX6-NEXT: s_xor_b32 s2, s0, s1 +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 +; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 +; GFX6-NEXT: s_sub_i32 s11, 0, s2 +; GFX6-NEXT: s_ashr_i32 s10, s0, 31 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_add_i32 s0, s0, s2 -; GFX6-NEXT: s_ashr_i32 s1, s4, 31 -; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 +; GFX6-NEXT: s_add_i32 s0, s0, s10 +; GFX6-NEXT: s_ashr_i32 s3, s8, 31 +; GFX6-NEXT: s_add_i32 s8, s8, s3 +; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 -; GFX6-NEXT: s_xor_b32 s7, s0, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 -; GFX6-NEXT: s_add_i32 s0, s4, s1 +; GFX6-NEXT: s_xor_b32 s12, s3, s1 +; GFX6-NEXT: v_mul_lo_u32 v1, s11, v0 +; GFX6-NEXT: s_xor_b32 s11, s0, s10 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s11 +; GFX6-NEXT: s_xor_b32 s0, s8, s3 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: s_xor_b32 s0, s0, s1 +; GFX6-NEXT: s_sub_i32 s3, 0, s11 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_xor_b32 s4, s1, s6 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 +; GFX6-NEXT: v_mul_f32_e32 v1, s13, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, 1, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] -; GFX6-NEXT: s_sub_i32 s0, 0, s7 -; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 -; GFX6-NEXT: s_ashr_i32 s0, s5, 31 -; GFX6-NEXT: s_add_i32 s1, s5, s0 +; GFX6-NEXT: v_mul_lo_u32 v3, s3, v1 +; GFX6-NEXT: s_ashr_i32 s0, s9, 31 +; GFX6-NEXT: s_add_i32 s1, s9, s0 ; GFX6-NEXT: s_xor_b32 s1, s1, s0 ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 -; GFX6-NEXT: s_xor_b32 s2, s0, s2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, s7 +; GFX6-NEXT: s_xor_b32 s2, s0, s10 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, s11 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v2 +; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 -; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, 
s[4:7], 0 ; GFX6-NEXT: s_endpgm ; ; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 -; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe +; GFX9-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s0, s6 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_ashr_i32 s8, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s8 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s9, s6, 31 +; GFX9-NEXT: s_add_i32 s6, s6, s9 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s0, s0, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s9, 0, s1 -; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 +; GFX9-NEXT: s_xor_b32 s6, s6, s9 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 +; GFX9-NEXT: s_sub_i32 s10, 0, s0 +; GFX9-NEXT: v_mul_f32_e32 v0, s11, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: s_ashr_i32 s7, s4, 31 ; GFX9-NEXT: s_add_i32 s4, s4, s7 -; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 -; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, s11, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: s_xor_b32 s4, s4, s7 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: s_sub_i32 s10, 0, s0 -; GFX9-NEXT: s_ashr_i32 s9, s5, 31 -; GFX9-NEXT: s_add_i32 s5, s5, s9 +; GFX9-NEXT: s_sub_i32 s10, 0, s6 +; GFX9-NEXT: s_ashr_i32 s8, s5, 31 +; GFX9-NEXT: s_add_i32 s5, s5, s8 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; 
GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: s_xor_b32 s6, s7, s6 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 +; GFX9-NEXT: s_xor_b32 s5, s5, s8 +; GFX9-NEXT: s_xor_b32 s1, s7, s1 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 ; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 -; GFX9-NEXT: s_xor_b32 s4, s5, s9 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 -; GFX9-NEXT: v_mul_hi_u32 v1, s4, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 +; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GFX9-NEXT: v_subrev_u32_e32 v5, s1, v4 +; GFX9-NEXT: v_subrev_u32_e32 v5, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v4 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s1, s9, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: s_xor_b32 s0, s8, s9 +; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; GFX9-NEXT: 
s_endpgm %shl.y = shl <2 x i32> , %y @@ -6806,19 +6795,18 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s6, 0xf000 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s7, s4, 31 -; GFX6-NEXT: s_lshr_b32 s7, s7, 20 -; GFX6-NEXT: s_add_i32 s7, s4, s7 -; GFX6-NEXT: s_and_b32 s7, s7, s6 -; GFX6-NEXT: s_sub_i32 s4, s4, s7 +; GFX6-NEXT: s_ashr_i32 s6, s4, 31 +; GFX6-NEXT: s_lshr_b32 s6, s6, 20 +; GFX6-NEXT: s_add_i32 s6, s4, s6 ; GFX6-NEXT: s_ashr_i32 s7, s5, 31 -; GFX6-NEXT: s_lshr_b32 s7, s7, 20 -; GFX6-NEXT: s_add_i32 s7, s5, s7 -; GFX6-NEXT: s_and_b32 s6, s7, s6 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 +; GFX6-NEXT: s_sub_i32 s4, s4, s6 +; GFX6-NEXT: s_lshr_b32 s6, s7, 20 +; GFX6-NEXT: s_add_i32 s6, s5, s6 +; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GFX6-NEXT: s_sub_i32 s5, s5, s6 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 @@ -6829,21 +6817,20 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s2, 31 +; GFX9-NEXT: s_ashr_i32 s0, s2, 31 +; GFX9-NEXT: s_ashr_i32 s1, s3, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_i32 s1, s2, s1 -; GFX9-NEXT: s_ashr_i32 s6, s3, 31 -; GFX9-NEXT: s_and_b32 s1, s1, s0 -; GFX9-NEXT: s_sub_i32 s1, s2, s1 -; GFX9-NEXT: s_lshr_b32 s2, s6, 20 -; GFX9-NEXT: s_add_i32 s2, s3, s2 -; GFX9-NEXT: s_and_b32 s0, s2, s0 -; GFX9-NEXT: s_sub_i32 s0, s3, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_add_i32 s0, s2, s0 +; 
GFX9-NEXT: s_add_i32 s1, s3, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s2, s0 +; GFX9-NEXT: s_sub_i32 s1, s3, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, @@ -6936,39 +6923,38 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX6-LABEL: srem_v2i32_pow2_shl_denom: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb -; GFX6-NEXT: s_movk_i32 s8, 0x1000 -; GFX6-NEXT: s_mov_b32 s11, 0x4f7ffffe ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s9, 0x4f7ffffe ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s2, s8, s6 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 ; GFX6-NEXT: s_ashr_i32 s3, s2, 31 ; GFX6-NEXT: s_add_i32 s2, s2, s3 ; GFX6-NEXT: s_xor_b32 s6, s2, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX6-NEXT: s_lshl_b32 s7, s8, s7 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: s_add_i32 s7, s7, s9 +; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX6-NEXT: s_ashr_i32 s7, s2, 31 +; GFX6-NEXT: s_add_i32 s2, s2, s7 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: s_sub_i32 s10, 0, s6 -; GFX6-NEXT: s_xor_b32 s7, s7, s9 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s7 -; GFX6-NEXT: v_mul_f32_e32 v0, s11, v0 +; GFX6-NEXT: s_xor_b32 s7, s2, s7 +; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 +; GFX6-NEXT: s_sub_i32 s2, 0, s6 +; GFX6-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX6-NEXT: s_ashr_i32 s8, s4, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: s_add_i32 s4, s4, s8 -; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 -; GFX6-NEXT: s_xor_b32 s4, s4, s8 -; GFX6-NEXT: s_sub_i32 s10, 0, s7 -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v1, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, s11, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 +; GFX6-NEXT: s_add_i32 s2, s4, s8 +; GFX6-NEXT: v_mul_f32_e32 v1, s9, v1 +; GFX6-NEXT: s_xor_b32 s4, s2, s8 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX6-NEXT: s_sub_i32 s2, 0, s7 +; GFX6-NEXT: s_ashr_i32 s9, s5, 31 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 @@ -7000,64 +6986,63 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %ou ; GFX9-LABEL: srem_v2i32_pow2_shl_denom: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0x1000 -; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s1, s0, s6 -; GFX9-NEXT: s_ashr_i32 s6, s1, 31 -; GFX9-NEXT: s_add_i32 s1, s1, s6 -; GFX9-NEXT: s_xor_b32 s1, s1, s6 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 -; GFX9-NEXT: s_lshl_b32 s0, s0, s7 -; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: s_add_i32 s0, s0, s6 -; GFX9-NEXT: s_xor_b32 s0, s0, s6 +; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 +; GFX9-NEXT: s_ashr_i32 s6, s3, 31 +; GFX9-NEXT: s_add_i32 s3, s3, s6 +; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 +; GFX9-NEXT: s_xor_b32 s3, s3, s6 +; GFX9-NEXT: s_ashr_i32 s7, s2, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 +; GFX9-NEXT: s_add_i32 s2, s2, s7 +; GFX9-NEXT: s_xor_b32 s2, s2, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 -; GFX9-NEXT: s_sub_i32 s7, 0, s1 +; GFX9-NEXT: s_sub_i32 s8, 0, 
s3 ; GFX9-NEXT: s_ashr_i32 s6, s4, 31 -; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_i32 s4, s4, s6 -; GFX9-NEXT: s_xor_b32 s4, s4, s6 -; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 -; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s7, 0, s0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 +; GFX9-NEXT: s_sub_i32 s8, 0, s2 +; GFX9-NEXT: s_xor_b32 s4, s4, s6 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: s_ashr_i32 s7, s5, 31 ; GFX9-NEXT: s_add_i32 s5, s5, s7 -; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 ; GFX9-NEXT: s_xor_b32 s5, s5, s7 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 +; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; 
GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 +; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 ; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 ; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm %shl.y = shl <2 x i32> , %y %r = srem <2 x i32> %x, %shl.y @@ -7079,36 +7064,35 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s4, 0xfee0 ; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX6-NEXT: s_movk_i32 s8, 0x11f +; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 -; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, 
v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7124,7 +7108,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7133,7 +7117,7 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7149,13 +7133,13 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, 
s8 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 -; GFX6-NEXT: v_mov_b32_e32 v5, s8 +; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -7201,14 +7185,13 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 @@ -7217,16 +7200,16 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7240,17 +7223,17 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7265,32 +7248,33 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x11f ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX9-NEXT: v_add_u32_e32 v2, 
v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 ; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 -; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s3, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 ; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] ; GFX9-NEXT: s_movk_i32 s3, 0x11e ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 ; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], 2, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] @@ -7305,10 +7289,10 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i64 %x, 1235195949943 store i64 %r, i64 addrspace(1)* %out @@ -7454,9 +7438,9 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s6, 0xf001 -; GFX6-NEXT: 
v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -7467,23 +7451,22 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 ; GFX6-NEXT: s_movk_i32 s0, 0xfff ; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7497,7 +7480,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, 
v0, v4 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7505,7 +7488,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7521,7 +7504,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -7563,7 +7546,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -7575,75 +7557,76 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; 
GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 ; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 
0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 ; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s0 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 ; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_movk_i32 s0, 0xffe ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 ; GFX9-NEXT: 
v_cndmask_b32_e64 v6, 0, -1, vcc @@ -7652,16 +7635,16 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -7736,38 +7719,37 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfee0 ; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 -; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 -; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_mov_b32 s8, s4 ; GFX6-NEXT: s_movk_i32 s4, 0x11f +; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 +; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX6-NEXT: 
v_mul_lo_u32 v5, v0, s3 +; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s9, s5 ; GFX6-NEXT: s_movk_i32 s5, 0x11e -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7783,7 +7765,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -7791,7 +7773,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, 
vcc, v8, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -7807,7 +7789,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 @@ -7817,7 +7799,7 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 ; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 @@ -7857,35 +7839,34 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s2, 0xfee0 ; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_movk_i32 s8, 0x11f +; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 ; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 -; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 +; GFX9-NEXT: s_mov_b32 s10, 
0x9761f7c8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7898,16 +7879,16 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -7924,46 +7905,47 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x11f ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: s_movk_i32 s6, 0x11e -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v5 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 ; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] ; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: 
v_cndmask_b32_e64 v2, v5, v2, s[0:1] -; GFX9-NEXT: v_mov_b32_e32 v5, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i64 %x, 1235195393993 store i64 %r, i64 addrspace(1)* %out @@ -8066,13 +8048,12 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_movk_i32 s5, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v1, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_and_b32 s4, s4, s5 -; GFX6-NEXT: s_and_b32 s5, s6, s5 ; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s4, s4, 0xfff +; GFX6-NEXT: s_and_b32 s5, s6, 0xfff ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: v_mov_b32_e32 v3, v1 @@ -8083,14 +8064,13 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; 
GFX9-NEXT: s_movk_i32 s0, 0xfff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s4, s0 -; GFX9-NEXT: s_and_b32 s0, s6, s0 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_and_b32 s0, s4, 0xfff +; GFX9-NEXT: s_and_b32 s1, s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = urem <2 x i64> %x, @@ -8174,8 +8154,8 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v7, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -8192,12 +8172,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_addc_u32 s3, s3, s8 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 @@ -8205,7 +8185,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, 
vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -8213,23 +8193,22 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -8246,7 +8225,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 @@ -8291,7 +8270,6 @@ define amdgpu_kernel void 
@sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -8306,16 +8284,16 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -8328,18 +8306,18 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: 
v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -8349,32 +8327,33 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 -; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 ; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 
0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v6, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 -; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc @@ -8383,9 +8362,9 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] @@ -8395,7 +8374,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -8501,48 +8480,47 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; 
GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 -; GFX6-NEXT: 
v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 @@ -8594,7 +8572,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s2, s5, 31 @@ -8614,114 +8591,115 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 
v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: 
v_mul_lo_u32 v3, v1, v3 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s10, s7, 31 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_add_u32 s0, s6, s10 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; 
GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, s9 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 -; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 -; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 -; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 +; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v3 +; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 
v6, s[0:1], 2, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = sdiv i64 %x, %shl.y @@ -8842,10 +8820,9 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x 
i64> addrspace(1)* ; GFX6-NEXT: s_add_u32 s0, s2, s10 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 @@ -8856,36 +8833,36 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX6-NEXT: 
v_mul_hi_u32 v3, s0, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 ; GFX6-NEXT: s_movk_i32 s2, 0xfff ; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 @@ -8933,95 +8910,95 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_movk_i32 s8, 0xf001 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, 
v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 -; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 -; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s2, s5, 31 +; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 +; GFX9-NEXT: s_ashr_i32 s8, s7, 31 +; GFX9-NEXT: s_add_u32 s0, s6, s8 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: 
v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: s_lshr_b32 s2, s2, 20 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 -; GFX9-NEXT: s_add_u32 s2, s4, s2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc -; GFX9-NEXT: s_addc_u32 s3, s5, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc -; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 -; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: s_add_u32 s6, s6, s4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: s_mov_b32 s5, s4 -; GFX9-NEXT: s_addc_u32 s7, s7, s4 +; GFX9-NEXT: s_mov_b32 s9, s8 +; GFX9-NEXT: s_addc_u32 s1, s7, s8 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 +; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 -; GFX9-NEXT: s_movk_i32 s0, 0xfff +; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 +; GFX9-NEXT: s_movk_i32 s6, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; 
GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 -; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 -; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, s6 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, s6 +; GFX9-NEXT: v_mul_lo_u32 v9, v0, s6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc ; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 -; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 +; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v9 ; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc ; GFX9-NEXT: s_movk_i32 s0, 0xffe ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 @@ -9038,14 +9015,14 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* ; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 +; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: 
global_store_dwordx4 v4, v[0:3], s[2:3] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i64> %x, store <2 x i64> %r, <2 x i64> addrspace(1)* %out @@ -9096,201 +9073,200 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s17, s16 ; GFX6-NEXT: s_addc_u32 s1, s5, s16 -; GFX6-NEXT: v_mul_lo_u32 v0, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s11, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s10, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s10, v0 ; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; 
GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s10, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s10, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s11, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s10, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s5, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: 
v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s12, v2 -; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 -; GFX6-NEXT: v_mul_lo_u32 v5, s13, v1 -; GFX6-NEXT: v_mov_b32_e32 v6, s13 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s5, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 -; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v6, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s12, v4 -; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v6 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 +; GFX6-NEXT: v_mov_b32_e32 v5, s13 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 +; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc +; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 +; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v6, s[0:1], 2, v1 -; GFX6-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v8, s[0:1], 1, v1 -; GFX6-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 +; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] ; GFX6-NEXT: s_ashr_i32 s4, s3, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 ; GFX6-NEXT: s_add_u32 s2, s2, s4 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v7, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v6, s5 ; GFX6-NEXT: s_mov_b32 s5, s4 ; GFX6-NEXT: s_addc_u32 s3, s3, s4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 
-; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 -; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 -; GFX6-NEXT: v_mac_f32_e32 v9, s18, v10 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc -; GFX6-NEXT: v_rcp_f32_e32 v4, v9 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v4 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s3 +; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 +; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc +; GFX6-NEXT: v_rcp_f32_e32 v3, v8 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: s_sub_u32 s0, 0, s2 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 -; GFX6-NEXT: s_subb_u32 s1, 0, s3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 -; GFX6-NEXT: s_ashr_i32 s12, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, 
v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: s_mov_b32 s13, s12 -; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s15, v2 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: s_add_u32 s0, s6, s12 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: s_addc_u32 s1, s7, s12 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s7, v4 -; 
GFX6-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: v_mov_b32_e32 v7, s15 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s3, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mov_b32_e32 v7, s3 +; GFX6-NEXT: s_subb_u32 s1, 0, s3 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 +; GFX6-NEXT: s_ashr_i32 s12, s7, 31 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 +; GFX6-NEXT: s_mov_b32 s13, s12 +; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s15, 
v1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: s_add_u32 s0, s6, s12 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: s_addc_u32 s1, s7, s12 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s15 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 +; GFX6-NEXT: v_mov_b32_e32 v7, s3 ; 
GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 ; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc ; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 @@ -9301,30 +9277,30 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] ; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v3 -; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] -; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v3 -; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 +; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 +; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v8, s7 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 +; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 ; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] -; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, 
v3, v6, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 +; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 +; GFX6-NEXT: v_mov_b32_e32 v4, s1 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -9358,75 +9334,75 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 -; GFX9-NEXT: s_add_u32 s2, s4, s14 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; 
GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: s_addc_u32 s3, s5, s14 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] -; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s5, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, 
v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: s_add_u32 s2, s4, s14 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: s_addc_u32 s3, s5, s14 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] +; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: 
v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 ; GFX9-NEXT: v_mul_lo_u32 v4, s8, v1 ; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 @@ -9495,7 +9471,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: s_mov_b32 s9, s8 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 @@ -9518,7 +9494,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -9538,7 +9514,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s10, v4 @@ -9607,8 +9583,8 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: 
v_madak_f32 v0, 0, v0, 0x4996c7d8 ; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 -; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -9626,42 +9602,41 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GFX6-NEXT: s_mov_b32 s5, s1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 ; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; 
GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 ; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -9669,7 +9644,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc @@ -9679,7 +9654,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 @@ -9722,7 +9697,6 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_madak_f32 v0, 0, v0, 
0x4996c7d8 ; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -9737,16 +9711,16 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 ; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9759,18 +9733,18 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 ; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: 
v_addc_co_u32_e32 v8, vcc, 0, v9, vcc +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 @@ -9781,7 +9755,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 ; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 ; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s1, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc @@ -9790,41 +9764,42 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { ; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 -; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s3, v2 -; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 
0, v3, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s3, v2 +; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc ; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc -; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] +; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i64 %x, 1235195 store i64 %r, i64 addrspace(1)* %out @@ -9934,48 +9909,47 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GFX6-NEXT: v_mov_b32_e32 v4, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 ; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s13, v1 +; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 ; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 +; GFX6-NEXT: v_addc_u32_e32 
v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 ; GFX6-NEXT: s_mov_b32 s4, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 @@ -10025,7 +9999,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 ; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 ; GFX9-NEXT: s_ashr_i32 s4, s3, 31 @@ -10050,89 +10023,90 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 % ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: s_add_u32 s0, s6, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 -; GFX9-NEXT: s_addc_u32 s1, s7, s10 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 -; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 -; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v0 -; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 -; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: s_addc_u32 s1, s7, s10 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, 
v2, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 
v0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 -; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 +; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 -; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 -; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc ; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 -; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 +; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v6, s7 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 @@ -10142,15 +10116,15 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 
addrspace(1)* %out, i64 % ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i64 4096, %y %r = srem i64 %x, %shl.y @@ -10173,24 +10147,23 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GFX6: ; %bb.0: ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX6-NEXT: s_movk_i32 s8, 0xf000 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_ashr_i32 s9, s5, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s4, s9 -; GFX6-NEXT: s_addc_u32 s10, s5, 0 -; GFX6-NEXT: s_and_b32 s9, s9, s8 -; GFX6-NEXT: s_sub_u32 s4, s4, s9 -; GFX6-NEXT: s_subb_u32 s5, s5, s10 -; GFX6-NEXT: s_ashr_i32 s9, s7, 31 -; GFX6-NEXT: s_lshr_b32 s9, s9, 20 -; GFX6-NEXT: s_add_u32 s9, s6, s9 -; GFX6-NEXT: s_addc_u32 s10, s7, 0 -; GFX6-NEXT: s_and_b32 s8, s9, s8 +; GFX6-NEXT: s_ashr_i32 s8, s5, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s4, s8 +; GFX6-NEXT: s_addc_u32 s9, s5, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 +; GFX6-NEXT: s_sub_u32 s4, s4, s8 +; GFX6-NEXT: s_subb_u32 s5, s5, s9 +; GFX6-NEXT: s_ashr_i32 
s8, s7, 31 +; GFX6-NEXT: s_lshr_b32 s8, s8, 20 +; GFX6-NEXT: s_add_u32 s8, s6, s8 +; GFX6-NEXT: s_addc_u32 s9, s7, 0 +; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 ; GFX6-NEXT: s_sub_u32 s6, s6, s8 -; GFX6-NEXT: s_subb_u32 s7, s7, s10 +; GFX6-NEXT: s_subb_u32 s7, s7, s9 ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s6 @@ -10202,26 +10175,25 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s0, 0xf000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s1, s5, 31 -; GFX9-NEXT: s_lshr_b32 s1, s1, 20 -; GFX9-NEXT: s_add_u32 s1, s4, s1 -; GFX9-NEXT: s_addc_u32 s8, s5, 0 -; GFX9-NEXT: s_and_b32 s1, s1, s0 -; GFX9-NEXT: s_sub_u32 s1, s4, s1 -; GFX9-NEXT: s_subb_u32 s4, s5, s8 -; GFX9-NEXT: s_ashr_i32 s5, s7, 31 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_u32 s5, s6, s5 -; GFX9-NEXT: s_addc_u32 s8, s7, 0 -; GFX9-NEXT: s_and_b32 s0, s5, s0 -; GFX9-NEXT: s_sub_u32 s0, s6, s0 -; GFX9-NEXT: s_subb_u32 s5, s7, s8 -; GFX9-NEXT: v_mov_b32_e32 v0, s1 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_ashr_i32 s0, s5, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, 0 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s0, s4, s0 +; GFX9-NEXT: s_subb_u32 s1, s5, s1 +; GFX9-NEXT: s_ashr_i32 s4, s7, 31 +; GFX9-NEXT: s_lshr_b32 s4, s4, 20 +; GFX9-NEXT: s_add_u32 s4, s6, s4 +; GFX9-NEXT: s_addc_u32 s5, s7, 0 +; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 +; GFX9-NEXT: s_sub_u32 s4, s6, s4 +; GFX9-NEXT: s_subb_u32 s5, s7, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] ; 
GFX9-NEXT: s_endpgm @@ -10274,202 +10246,201 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 +; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s13, s12 ; GFX6-NEXT: s_addc_u32 s1, s5, s12 -; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_mul_lo_u32 v5, s2, v0 ; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 -; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 ; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 ; GFX6-NEXT: s_mov_b32 s11, 0xf000 ; GFX6-NEXT: s_mov_b32 s10, -1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc -; GFX6-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX6-NEXT: 
v_mul_hi_u32 v4, s2, v2 -; GFX6-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 -; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v1 -; GFX6-NEXT: v_mul_hi_u32 v6, s5, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 -; GFX6-NEXT: v_mul_hi_u32 v3, s16, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, s17, v1 -; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 +; 
GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 +; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 +; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 +; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 +; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v2 -; GFX6-NEXT: v_mov_b32_e32 v4, s17 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc -; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v1 -; GFX6-NEXT: v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v6 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 +; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 +; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 +; GFX6-NEXT: v_add_i32_e32 
v1, vcc, v2, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s17 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc +; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 +; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] +; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 +; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 -; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, s[0:1] -; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v6 -; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v5 -; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] -; GFX6-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] +; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 +; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] +; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GFX6-NEXT: s_ashr_i32 s2, s15, 31 -; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 +; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GFX6-NEXT: s_add_u32 s4, s14, s2 -; GFX6-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX6-NEXT: v_mov_b32_e32 v6, s5 +; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v5, s5 ; GFX6-NEXT: s_mov_b32 s3, s2 ; GFX6-NEXT: s_addc_u32 s5, s15, s2 ; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] -; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s5 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_mac_f32_e32 v7, s18, v8 -; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v1 -; GFX6-NEXT: v_rcp_f32_e32 v7, v7 -; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc -; 
GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] -; GFX6-NEXT: v_mul_f32_e32 v4, s19, v7 -; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 -; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_mac_f32_e32 v6, s18, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] +; GFX6-NEXT: v_mul_f32_e32 v3, s19, v6 +; GFX6-NEXT: v_mul_f32_e32 v4, s20, v3 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mac_f32_e32 v3, s21, v4 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX6-NEXT: s_sub_u32 s0, 0, s4 -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 -; GFX6-NEXT: s_subb_u32 s1, 0, s5 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 -; GFX6-NEXT: s_ashr_i32 s14, s7, 31 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 -; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 -; 
GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: s_mov_b32 s15, s14 -; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 -; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 ; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 -; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 -; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 -; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 -; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc -; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc -; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GFX6-NEXT: s_add_u32 s0, s6, s14 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GFX6-NEXT: s_addc_u32 s1, s7, s14 -; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] -; GFX6-NEXT: v_mul_lo_u32 v5, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v6, s6, v3 -; GFX6-NEXT: v_mul_hi_u32 v8, s6, v4 -; GFX6-NEXT: v_mul_hi_u32 v9, s7, v4 -; GFX6-NEXT: v_mul_lo_u32 v4, s7, v4 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 -; GFX6-NEXT: 
v_mov_b32_e32 v7, s12 -; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc -; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 -; GFX6-NEXT: v_mul_hi_u32 v5, s4, v3 -; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v1 -; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc -; GFX6-NEXT: v_mul_lo_u32 v2, s5, v3 -; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX6-NEXT: s_subb_u32 s1, 0, s5 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 +; GFX6-NEXT: s_ashr_i32 s14, s7, 31 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 +; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 +; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 +; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 +; GFX6-NEXT: s_mov_b32 s15, s14 +; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 +; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 +; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 +; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 +; 
GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 +; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 +; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc +; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GFX6-NEXT: s_add_u32 s0, s6, s14 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: s_addc_u32 s1, s7, s14 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc +; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] +; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 +; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 +; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 +; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 +; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, s12 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 +; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 +; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 ; GFX6-NEXT: v_mov_b32_e32 v5, s5 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 ; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc -; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v3 +; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v2 ; GFX6-NEXT: 
v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] ; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v7 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] @@ -10483,22 +10454,22 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] ; GFX6-NEXT: v_mov_b32_e32 v7, s7 -; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 ; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 ; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 -; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 -; GFX6-NEXT: v_mov_b32_e32 v5, s14 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 -; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc +; GFX6-NEXT: v_mov_b32_e32 v4, s14 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 +; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX6-NEXT: s_endpgm @@ -10532,74 +10503,74 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 ; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 -; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 -; 
GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 -; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 -; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 -; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 -; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc -; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 -; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 -; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 -; GFX9-NEXT: s_add_u32 s2, s4, s8 -; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 -; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 -; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 -; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 -; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: s_addc_u32 s3, s5, s8 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] -; GFX9-NEXT: v_mul_lo_u32 v3, 
s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v4, s14, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, s14, v1 -; GFX9-NEXT: v_mul_hi_u32 v6, s15, v1 -; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, s15, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, s15, v2 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 +; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 +; GFX9-NEXT: s_add_u32 s2, s4, s8 +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 +; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 +; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 
+; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 +; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 +; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: s_addc_u32 s3, s5, s8 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] +; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s15, v1 +; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc +; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 ; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 ; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 ; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 ; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 @@ -10667,7 +10638,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: s_mov_b32 s13, s12 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, 
vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 @@ -10690,7 +10661,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 @@ -10710,7 +10681,7 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou ; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 8cad7d033ac8..e82f2c6ea917 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -63,9 +63,8 @@ define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { ; Second use is a VGPR use of the constant. 
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] -; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 +; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687 ; SI: buffer_store_dword [[VK]] define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 @@ -79,10 +78,9 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out ; Second use is another SGPR use of the constant. ; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1: -; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687 -; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: s_add_i32 -; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] +; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687 ; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]] ; SI: buffer_store_dword [[VADD]] define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index c855060e12e2..e478e2c0b62b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -3067,7 +3067,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3118,7 +3118,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: 
v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3160,41 +3160,39 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 
v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3214,7 +3212,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3223,31 +3221,29 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; ; 
GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3266,7 +3262,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -3484,7 +3480,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_mov_b64 exec, s[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, v1 +; GFX8-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3535,7 +3531,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_mov_b64 exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_bfrev_b32_e32 v2, -2 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 ; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf @@ -3577,41 +3573,39 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v2, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 
v2, v1 ; GFX1064-NEXT: s_not_b64 exec, exec ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: v_mov_b32_e32 v3, v2 -; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1064-NEXT: v_mov_b32_e32 v3, s4 -; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v2, 15 -; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v2, 31 -; GFX1064-NEXT: v_writelane_b32 v1, s4, 16 +; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s4, 
16 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v2, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v2, 47 -; GFX1064-NEXT: v_writelane_b32 v1, s5, 32 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 +; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 +; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 ; GFX1064-NEXT: s_mov_b64 exec, s[2:3] ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v1, s6, 48 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 ; GFX1064-NEXT: s_mov_b64 exec, s[4:5] ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 @@ -3631,7 +3625,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -3640,31 +3634,29 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mov_b32_e32 v2, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 ; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 
row_shr:8 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, v2 -; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1 +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_readlane_b32 s3, v2, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v2, 31 -; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2 +; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v1, s3, 16 +; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 ; GFX1032-NEXT: s_mov_b32 exec_lo, s2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 @@ -3683,7 +3675,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) diff --git 
a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 907ba8dd3086..9077a2857137 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -27,7 +27,6 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -42,11 +41,11 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2 @@ -57,13 +56,13 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: 
v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -81,7 +80,7 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v7, v10, v2 @@ -175,7 +174,6 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -190,32 +188,32 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: 
v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -224,7 +222,7 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 @@ -318,7 +316,6 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -333,11 +330,11 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; 
GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v6, v2 @@ -348,13 +345,13 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v14, v12, v2 +; GFX9-NEXT: v_mul_hi_u32 v13, v12, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2 @@ -372,7 +369,7 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, v9, v0 @@ -462,7 +459,6 @@ define i64 @urem64(i64 %a, i64 
%b) { ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -477,32 +473,32 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 
v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -511,7 +507,7 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4 @@ -728,7 +724,6 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc ; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_rcp_f32_e32 v2, v2 -; GFX9-NEXT: v_mov_b32_e32 v14, 0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -743,11 +738,11 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, 
v6, v2 @@ -758,13 +753,13 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5 ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0 ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0 -; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2 +; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2 @@ -782,7 +777,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2 @@ -896,7 +891,6 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc ; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GFX9-NEXT: v_rcp_f32_e32 v4, v4 -; GFX9-NEXT: v_mov_b32_e32 v13, 0 ; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -911,32 +905,32 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc ; 
GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14 +; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13 ; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0 ; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7 -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0 ; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4 -; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0 +; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0 ; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4 @@ -945,7 +939,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 
-; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6 diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll index 65ffba83df95..c9d23d76fdfd 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: _Z11test_kernelPii: ; CHECK: s_mul_i32 ; CHECK: s_sub_i32 -; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}} +; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, 0xffff ; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]] ; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0 diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index 22ac0fa31743..cf55eb8d2f12 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -226,7 +226,6 @@ define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 { ; GCN-LABEL: {{^}}load_sampler ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 @@ -260,7 +259,6 @@ main_body: ; GCN-LABEL: {{^}}load_sampler_nouniform ; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 ; SI: s_nop ; GCN: s_load_dwordx8 ; GCN-NEXT: s_load_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index f757ad818ac0..05e3acd2591d 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -1500,15 +1500,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; 
GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1604,19 +1602,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index 3000858c7786..8603cd75e356 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -1492,14 +1492,12 @@ define amdgpu_kernel void 
@v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias ; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 -; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -1595,18 +1593,17 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 -; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 8de42ad09100..05b13a985935 100644 --- 
a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1205,7 +1205,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: v_mov_b32_e32 v5, 9 -; VI-NEXT: s_movk_i32 s8, 0x900 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 @@ -1232,8 +1231,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6 ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v2, s8 -; VI-NEXT: v_add_u16_e32 v0, s8, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x900 +; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1250,7 +1249,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX10-NEXT: s_movk_i32 s0, 0x900 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -1261,8 +1259,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 -; GFX10-NEXT: v_add_nc_u16 v1, v1, s0 -; GFX10-NEXT: v_add_nc_u16 v5, v2, s0 +; GFX10-NEXT: v_add_nc_u16 v1, v1, 
0x900 +; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll index c686598225a2..c3262b4671ed 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll @@ -71,21 +71,20 @@ define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspac ; SI-NEXT: v_bfe_i32 v0, v3, 0, 16 ; SI-NEXT: v_bfe_i32 v1, v4, 0, 16 ; SI-NEXT: v_bfe_i32 v2, v2, 0, 16 -; SI-NEXT: s_mov_b32 s4, 0xffff -; SI-NEXT: v_mov_b32_e32 v3, 0x8000 -; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000 -; SI-NEXT: v_bfrev_b32_e32 v5, 1 -; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000 -; SI-NEXT: v_mov_b32_e32 v7, s4 +; SI-NEXT: v_mov_b32_e32 v3, 0xffff +; SI-NEXT: v_mov_b32_e32 v4, 0x8000 +; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000 +; SI-NEXT: v_bfrev_b32_e32 v6, 1 +; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000 ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0 -; SI-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 -; SI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc ; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2 -; SI-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc ; SI-NEXT: v_or_b32_e32 v0, v0, v1 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-NEXT: v_or_b32_e32 v2, v2, v3 ; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16 ; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index d9ae5421516e..38243711c0e7 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -42,9 +42,8 @@ define amdgpu_kernel void 
@s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; CI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2 ; GFX89: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x8 -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]] -; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]] +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7fff7fff +; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], 0x7fff7fff ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index 809e7de63114..1629d55dcda9 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -30,8 +30,8 @@ define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) { } ; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) @@ -40,10 +40,10 @@ define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x doub } ; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 -; SI: s_and_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 +; SI: s_bitset0_b32 ; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index b6ad0c49dc26..d98a77a52903 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -45,8 +45,8 @@ define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) { 
; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out @@ -59,10 +59,10 @@ define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; R600: |{{(PV|T[0-9])\.[XYZW]}}| -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 -; GCN: s_and_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 +; GCN: s_bitset0_b32 define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fexp.ll b/llvm/test/CodeGen/AMDGPU/fexp.ll index 7eaac9421f56..50e9533890b7 100644 --- a/llvm/test/CodeGen/AMDGPU/fexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fexp.ll @@ -107,10 +107,9 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) { ; VI-LABEL: v_exp_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5 ; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_mul_f16_e32 v0, s4, v0 +; VI-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0 ; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_exp_f16_e32 v0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 @@ -161,7 +160,7 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) { ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_movk_i32 s4, 0x3dc5 -; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5 ; VI-NEXT: v_mul_f16_e32 v2, s4, v1 ; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_mul_f16_e32 v4, s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index cba84f5dfe4a..81ff336ef321 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -532,12 +532,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -585,12 +584,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0 +; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -639,12 +637,11 @@ define void 
@store_load_vindex_foo(i32 %idx) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -681,12 +678,11 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32 +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32 +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1387,14 +1383,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: 
v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1447,15 +1442,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1472,14 +1466,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 ; GFX1030-PAL-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -1524,15 +1517,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -1574,15 +1567,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; 
GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100 +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2277,14 +2270,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2338,15 +2330,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 
0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc +; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2363,14 +2354,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off +; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-PAL-NEXT: 
scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2415,15 +2405,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10: ; %bb.0: ; %bb ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: scratch_store_dword v0, v3, off +; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-NEXT: scratch_store_dword v0, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -2467,15 +2457,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX10-PAL: ; %bb.0: ; %bb ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0 ; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 -; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 -; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 -; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc +; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo +; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 +; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: 
scratch_store_dword v0, v3, off +; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo +; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 8b88167d0ae1..c4d72c832c8e 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -970,7 +970,7 @@ define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, flo ; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants. ; GCN-LABEL: {{^}}one_non_inline_constant: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 +; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]] define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { @@ -990,9 +990,8 @@ define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, flo } ; GCN-LABEL: {{^}}two_non_inline_constant_multi_use: -; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000 ; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000 -; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]] +; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000 ; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5, ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]] define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 76099a7d2c4d..1cbb2c3ce1b9 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -97,9 +97,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; GCN: s_mov_b32 
[[MASK:s[0-9]+]], 0x80008000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]] +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 +; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000 ; GCN: {{flat|global}}_store_dwordx2 define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll index 6c6fcfa23ccb..14525dba1b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll @@ -69,10 +69,9 @@ define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], d } ; GCN-LABEL: {{^}}fneg_fabs_v2f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) %fsub = fsub <2 x double> , %fabs @@ -81,12 +80,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}fneg_fabs_v4f64: -; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}} ; GCN-NOT: 0x80000000 -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] -; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]] +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 +; GCN: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) %fsub = fsub <4 x double> 
, %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index ef6201bac47e..1d7b0450bd99 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -84,9 +84,8 @@ define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrs ; R600: -PV ; FIXME: In this case two uses of the constant should be folded -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) %fsub = fsub <2 x float> , %fabs @@ -95,11 +94,10 @@ define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}fneg_fabs_v4f32: -; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}} -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]] +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 +; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) %fsub = fsub <4 x float> , %fabs diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index 9a8a8c08db88..291ca5f8fe57 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -18,9 +18,8 @@ define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) { ; R600: -PV ; R600: -PV -; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 
0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) { %fneg = fsub <2 x float> , %in store <2 x float> %fneg, <2 x float> addrspace(1)* %out @@ -33,10 +32,10 @@ define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out ; R600: -PV ; R600: -PV -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 -; GCN: s_xor_b32 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 +; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000 define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) { %fneg = fsub <4 x float> , %in store <4 x float> %fneg, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir index 079147083863..4e63330211a7 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-immediate-operand-shrink-with-carry.mir @@ -36,9 +36,10 @@ body: | ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345 ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF]], 0, implicit $exec - ; GCN: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF1]], 0, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_1]], implicit [[V_ADD_CO_U32_e64_2]] + ; GCN: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec + ; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc + ; GCN: 
[[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]] %0:sreg_32_xm0 = S_MOV_B32 12345 %1:vgpr_32 = IMPLICIT_DEF %2:vgpr_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 239750ed8c5b..7b17b54becf3 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1344,16 +1344,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1364,14 +1362,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v2, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 
2), 0 ; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6 ; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1398,8 +1396,6 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1411,14 +1407,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1429,14 +1425,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v2, v5 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v2 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v2, -v4, v6, v2 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: 
v_div_fmas_f32 v2, v2, v5, v6 ; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1595,16 +1591,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1615,14 +1609,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v5, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1632,14 +1626,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3 ; SI-NEXT: 
v_rcp_f32_e32 v7, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v4, v7 ; SI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1649,14 +1643,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v7, v5, v5 ; SI-NEXT: v_mul_f32_e32 v7, v3, v5 ; SI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; SI-NEXT: v_fma_f32 v7, v8, v5, v7 ; SI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1682,8 +1676,6 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: s_mov_b32 s7, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: s_mov_b32 s11, s3 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v2, v0 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 @@ -1702,14 +1694,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5 ; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5 ; CI-NEXT: v_rcp_f32_e32 v10, v9 -; 
CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -1720,14 +1712,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 ; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v5, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v5 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v5, -v8, v10, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10 ; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -1737,14 +1729,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_or_b32_e32 v1, v4, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3 ; CI-NEXT: v_rcp_f32_e32 v7, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v4, v7 ; CI-NEXT: v_fma_f32 v9, -v5, v8, v4 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v4, -v5, v8, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 
hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8 ; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1754,14 +1746,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half> ; CI-NEXT: v_cvt_f16_f32_e32 v0, v0 ; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0 ; CI-NEXT: v_fma_f32 v5, v7, v5, v5 ; CI-NEXT: v_mul_f32_e32 v7, v3, v5 ; CI-NEXT: v_fma_f32 v8, -v4, v7, v3 ; CI-NEXT: v_fma_f32 v7, v8, v5, v7 ; CI-NEXT: v_fma_f32 v3, -v4, v7, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7 ; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -1965,16 +1957,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; SI-NEXT: v_rcp_f32_e32 v6, v5 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; SI-NEXT: v_fma_f32 v6, v7, v6, v6 ; SI-NEXT: v_mul_f32_e32 v7, v4, v6 ; SI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; SI-NEXT: v_fma_f32 v7, v8, v6, v7 ; SI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; SI-NEXT: v_trunc_f32_e32 v4, v4 @@ -1982,14 +1972,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; SI-NEXT: v_rcp_f32_e32 
v5, v4 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; SI-NEXT: v_fma_f32 v5, v6, v5, v5 ; SI-NEXT: v_mul_f32_e32 v6, v3, v5 ; SI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; SI-NEXT: v_fma_f32 v6, v7, v5, v6 ; SI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2014,20 +2004,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1 ; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1 ; CI-NEXT: v_rcp_f32_e32 v6, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; CI-NEXT: v_fma_f32 v6, v7, v6, v6 ; CI-NEXT: v_mul_f32_e32 v7, v4, v6 ; CI-NEXT: v_fma_f32 v8, -v5, v7, v4 ; CI-NEXT: v_fma_f32 v7, v8, v6, v7 ; CI-NEXT: v_fma_f32 v4, -v5, v7, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 ; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1 ; CI-NEXT: v_trunc_f32_e32 v4, v4 @@ -2035,14 +2023,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0 ; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; CI-NEXT: v_rcp_f32_e32 v5, v4 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0 ; CI-NEXT: 
v_fma_f32 v5, v6, v5, v5 ; CI-NEXT: v_mul_f32_e32 v6, v3, v5 ; CI-NEXT: v_fma_f32 v7, -v4, v6, v3 ; CI-NEXT: v_fma_f32 v6, v7, v5, v6 ; CI-NEXT: v_fma_f32 v3, -v4, v6, v3 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6 ; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; CI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2054,8 +2042,6 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 @@ -2071,14 +2057,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3 ; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3 ; VI-NEXT: v_rcp_f32_e32 v8, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; VI-NEXT: v_fma_f32 v8, v9, v8, v8 ; VI-NEXT: v_mul_f32_e32 v9, v6, v8 ; VI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; VI-NEXT: v_fma_f32 v9, v10, v8, v9 ; VI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2086,14 +2072,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2 ; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v8, v7, v7 ; VI-NEXT: 
v_mul_f32_e32 v8, v5, v7 ; VI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; VI-NEXT: v_fma_f32 v8, v9, v7, v8 ; VI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2109,20 +2095,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7 ; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5 ; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8 ; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2130,14 +2114,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float ; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0 ; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0 ; GFX9-NEXT: v_rcp_f32_e32 v6, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0 ; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6 ; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6 ; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3 ; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7 ; GFX9-NEXT: 
v_fma_f32 v3, -v5, v7, v3 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7 ; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 @@ -2219,16 +2203,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; SI-NEXT: v_rcp_f32_e32 v10, v9 -; SI-NEXT: s_mov_b32 s6, 3 -; SI-NEXT: s_mov_b32 s7, 0 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; SI-NEXT: v_fma_f32 v10, v11, v10, v10 ; SI-NEXT: v_mul_f32_e32 v11, v8, v10 ; SI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; SI-NEXT: v_fma_f32 v11, v12, v10, v11 ; SI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; SI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2236,14 +2218,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; SI-NEXT: v_rcp_f32_e32 v9, v8 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; SI-NEXT: v_fma_f32 v9, v10, v9, v9 ; SI-NEXT: v_mul_f32_e32 v10, v7, v9 ; SI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; SI-NEXT: v_fma_f32 v10, v11, v9, v10 ; SI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; SI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2251,14 +2233,14 @@ define amdgpu_kernel void 
@frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; SI-NEXT: v_rcp_f32_e32 v8, v7 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; SI-NEXT: v_fma_f32 v8, v9, v8, v8 ; SI-NEXT: v_mul_f32_e32 v9, v6, v8 ; SI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; SI-NEXT: v_fma_f32 v9, v10, v8, v9 ; SI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; SI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2266,14 +2248,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; SI-NEXT: v_rcp_f32_e32 v7, v6 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; SI-NEXT: v_fma_f32 v7, v8, v7, v7 ; SI-NEXT: v_mul_f32_e32 v8, v5, v7 ; SI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; SI-NEXT: v_fma_f32 v8, v9, v7, v8 ; SI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; SI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2298,20 +2280,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: s_mov_b32 s11, s3 ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 -; CI-NEXT: s_mov_b32 s6, 3 -; CI-NEXT: s_mov_b32 s7, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3 ; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3 ; CI-NEXT: 
v_rcp_f32_e32 v10, v9 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; CI-NEXT: v_fma_f32 v10, v11, v10, v10 ; CI-NEXT: v_mul_f32_e32 v11, v8, v10 ; CI-NEXT: v_fma_f32 v12, -v9, v11, v8 ; CI-NEXT: v_fma_f32 v11, v12, v10, v11 ; CI-NEXT: v_fma_f32 v8, -v9, v11, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11 ; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3 ; CI-NEXT: v_trunc_f32_e32 v8, v8 @@ -2319,14 +2299,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2 ; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; CI-NEXT: v_rcp_f32_e32 v9, v8 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0 ; CI-NEXT: v_fma_f32 v9, v10, v9, v9 ; CI-NEXT: v_mul_f32_e32 v10, v7, v9 ; CI-NEXT: v_fma_f32 v11, -v8, v10, v7 ; CI-NEXT: v_fma_f32 v10, v11, v9, v10 ; CI-NEXT: v_fma_f32 v7, -v8, v10, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10 ; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; CI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2334,14 +2314,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1 ; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; CI-NEXT: v_rcp_f32_e32 v8, v7 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0 ; CI-NEXT: v_fma_f32 v8, v9, v8, v8 ; CI-NEXT: v_mul_f32_e32 v9, v6, v8 ; CI-NEXT: v_fma_f32 v10, -v7, v9, v6 ; CI-NEXT: v_fma_f32 v9, v10, v8, v9 ; CI-NEXT: v_fma_f32 v6, -v7, v9, v6 -; CI-NEXT: 
s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9 ; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; CI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2349,14 +2329,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0 ; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; CI-NEXT: v_rcp_f32_e32 v7, v6 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0 ; CI-NEXT: v_fma_f32 v7, v8, v7, v7 ; CI-NEXT: v_mul_f32_e32 v8, v5, v7 ; CI-NEXT: v_fma_f32 v9, -v6, v8, v5 ; CI-NEXT: v_fma_f32 v8, v9, v7, v8 ; CI-NEXT: v_fma_f32 v5, -v6, v8, v5 -; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7 +; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8 ; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; CI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2368,8 +2348,6 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; VI-NEXT: s_mov_b32 s2, 3 -; VI-NEXT: s_mov_b32 s3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 @@ -2385,14 +2363,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3 ; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3 ; VI-NEXT: v_rcp_f32_e32 v12, v11 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0 ; VI-NEXT: v_fma_f32 v12, v13, v12, v12 ; VI-NEXT: v_mul_f32_e32 v13, v10, v12 ; VI-NEXT: v_fma_f32 v14, -v11, v13, v10 ; VI-NEXT: v_fma_f32 v13, v14, v12, v13 ; VI-NEXT: v_fma_f32 v10, -v11, v13, v10 -; VI-NEXT: s_setreg_b32 
hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13 ; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3 ; VI-NEXT: v_trunc_f32_e32 v10, v10 @@ -2400,14 +2378,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2 ; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; VI-NEXT: v_rcp_f32_e32 v11, v10 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; VI-NEXT: v_fma_f32 v11, v12, v11, v11 ; VI-NEXT: v_mul_f32_e32 v12, v7, v11 ; VI-NEXT: v_fma_f32 v13, -v10, v12, v7 ; VI-NEXT: v_fma_f32 v12, v13, v11, v12 ; VI-NEXT: v_fma_f32 v7, -v10, v12, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12 ; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; VI-NEXT: v_trunc_f32_e32 v7, v7 @@ -2415,14 +2393,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; VI-NEXT: v_rcp_f32_e32 v10, v7 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0 ; VI-NEXT: v_fma_f32 v10, v11, v10, v10 ; VI-NEXT: v_mul_f32_e32 v11, v6, v10 ; VI-NEXT: v_fma_f32 v12, -v7, v11, v6 ; VI-NEXT: v_fma_f32 v11, v12, v10, v11 ; VI-NEXT: v_fma_f32 v6, -v7, v11, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11 ; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; VI-NEXT: v_trunc_f32_e32 v6, v6 @@ -2430,14 +2408,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, 
v0 ; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; VI-NEXT: v_rcp_f32_e32 v7, v6 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0 ; VI-NEXT: v_fma_f32 v7, v10, v7, v7 ; VI-NEXT: v_mul_f32_e32 v10, v5, v7 ; VI-NEXT: v_fma_f32 v11, -v6, v10, v5 ; VI-NEXT: v_fma_f32 v10, v11, v7, v10 ; VI-NEXT: v_fma_f32 v5, -v6, v10, v5 -; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10 ; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; VI-NEXT: v_trunc_f32_e32 v5, v5 @@ -2453,20 +2431,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] ; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64 -; GFX9-NEXT: s_mov_b32 s2, 3 -; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3 ; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3 ; GFX9-NEXT: v_rcp_f32_e32 v11, v10 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0 ; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11 ; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11 ; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9 ; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12 ; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12 ; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9 @@ -2474,14 +2450,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2 ; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2 ; GFX9-NEXT: v_rcp_f32_e32 v10, v9 -; 
GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0 ; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10 ; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7 ; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11 ; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11 ; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2 ; GFX9-NEXT: v_trunc_f32_e32 v7, v7 @@ -2489,14 +2465,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1 ; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1 ; GFX9-NEXT: v_rcp_f32_e32 v9, v7 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0 ; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9 ; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9 ; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6 ; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10 ; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10 ; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v6, v6 @@ -2504,14 +2480,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float ; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0 ; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0 ; GFX9-NEXT: v_rcp_f32_e32 v7, v6 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0 ; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7 ; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7 ; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5 ; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9 ; 
GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5 -; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3 +; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9 ; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0 ; GFX9-NEXT: v_trunc_f32_e32 v5, v5 @@ -2636,16 +2612,15 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_bfe_u32 v10, v9, 20, 11 -; SI-NEXT: s_movk_i32 s8, 0xfc01 -; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10 +; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10 ; SI-NEXT: s_mov_b32 s3, 0xfffff ; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12 ; SI-NEXT: v_not_b32_e32 v10, v10 ; SI-NEXT: v_and_b32_e32 v10, v8, v10 ; SI-NEXT: v_not_b32_e32 v11, v11 ; SI-NEXT: v_and_b32_e32 v11, v9, v11 -; SI-NEXT: s_brev_b32 s9, 1 -; SI-NEXT: v_and_b32_e32 v13, s9, v9 +; SI-NEXT: s_brev_b32 s8, 1 +; SI-NEXT: v_and_b32_e32 v13, s8, v9 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12 ; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12 @@ -2669,13 +2644,13 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub ; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13] ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] ; SI-NEXT: v_bfe_u32 v8, v7, 20, 11 -; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8 +; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8 ; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10 ; SI-NEXT: v_not_b32_e32 v8, v8 ; SI-NEXT: v_and_b32_e32 v8, v6, v8 ; SI-NEXT: v_not_b32_e32 v9, v9 ; SI-NEXT: v_and_b32_e32 v9, v7, v9 -; SI-NEXT: v_and_b32_e32 v11, s9, v7 +; SI-NEXT: v_and_b32_e32 v11, s8, v7 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10 diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index 
994cc78ec791..a3058fc1f444 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -716,10 +716,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2 -; GFX10-NEXT: s_mov_b32 s4, 0xf000f ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1] -; GFX10-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX10-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3 ; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 @@ -929,37 +928,36 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7 -; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8 -; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 -; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0 ; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1 -; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6 -; GFX10-NEXT: v_xor_b32_e32 v9, -1, v5 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX10-NEXT: v_lshlrev_b16 v12, 1, v12 -; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11 -; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0 +; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5 +; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4 +; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6 +; 
GFX10-NEXT: v_lshlrev_b16 v8, 1, v8 +; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2 +; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10 +; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9 +; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1 +; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0 ; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2 -; GFX10-NEXT: v_lshlrev_b16 v1, v9, v1 ; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3 -; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10 -; GFX10-NEXT: v_lshlrev_b16 v5, v13, v12 +; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8 +; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13 +; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10 ; GFX10-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-NEXT: v_or_b32_e32 v3, v6, v7 -; GFX10-NEXT: v_or_b32_e32 v4, v5, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_and_b32_e32 v1, v2, v1 -; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX10-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX10-NEXT: v_or_b32_e32 v3, v7, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) ret <4 x i16> %ret @@ -1241,14 +1239,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_mov_b32 s4, 0xffffff +; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX10-NEXT: v_and_b32_e32 v6, s4, v4 -; GFX10-NEXT: v_and_b32_e32 v7, s4, v5 -; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, v6, s4 -; GFX10-NEXT: v_mul_hi_u32 v7, v7, s4 
+; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7 ; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 3d88dfc7ddd6..56707bc7c6e4 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -406,18 +406,17 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9-LABEL: udiv16_invariant_denom: ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c -; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s5, 0x400 +; GFX9-NEXT: s_movk_i32 s4, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s4, s2 +; GFX9-NEXT: s_and_b32 s2, 0xffff, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s4, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -429,7 +428,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 ; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2 ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: global_store_short v[5:6], v0, off @@ -442,16 +441,15 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s0, s1, s4 +; GFX10-NEXT: s_and_b32 s0, 0xffff, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -491,17 +489,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9: ; %bb.0: ; %bb ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s6, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_movk_i32 s8, 0x400 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s7, s6, s2 -; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7 +; GFX9-NEXT: s_movk_i32 s7, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s6, 0xffff, s2 +; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_and_b32_e32 v0, s6, v4 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4 @@ -511,9 +508,9 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9 ; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2 -; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4 ; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7 +; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6 ; GFX9-NEXT: 
v_add_co_u32_e64 v5, s[0:1], s4, v5 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1] ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8 @@ -527,16 +524,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s1, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b32 s4, s1, s4 -; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4 +; GFX10-NEXT: s_and_b32 s1, 0xffff, s4 +; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX10-NEXT: v_and_b32_e32 v0, s1, v4 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX10-NEXT: v_add_nc_u16 v4, v4, 1 ; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1] @@ -549,7 +545,7 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo ; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4 -; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4 +; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7 ; GFX10-NEXT: global_store_short v[5:6], v0, off ; GFX10-NEXT: s_cbranch_vccz .LBB5_1 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index 597ed577c16e..1b4a181f7e12 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2104,16 +2104,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1 ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: 
s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 4aa58fbc42f5..228c17bb5e80 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -213,6 +213,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -226,7 +227,6 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_and_b32_e32 v6, s0, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 @@ -315,7 +315,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -323,17 +323,17 @@ 
define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 -; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 @@ -1208,16 +1208,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_movk_i32 s3, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; 
GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) @@ -1511,6 +1510,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1524,7 +1524,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_and_b32_e32 v8, s0, v8 @@ -1613,7 +1612,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: s_movk_i32 s0, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -1625,16 +1624,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 ; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5 +; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: 
v_mad_u16 v3, v4, v5, v3 -; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3 ; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3 ; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3] @@ -1802,18 +1801,18 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, 
v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 @@ -1901,6 +1900,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 @@ -1914,7 +1914,6 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] ; GFX8-NEXT: s_movk_i32 s0, 0xff -; GFX8-NEXT: v_mov_b32_e32 v5, s0 ; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 ; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3 @@ -2015,7 +2014,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-DL-NEXT: s_movk_i32 s2, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] @@ -2023,27 +2022,27 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2 -; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; 
GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10 -; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; 
GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 8563d321c83a..eacbd6f0e2e6 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -2519,7 +2519,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1 @@ -2533,79 +2532,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1 ; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-XNACK-NEXT: 
v_lshrrev_b32_e32 v14, 8, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v15 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v2 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; 
GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v17 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15 ; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, v4, v14 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10 -; 
GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v18 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2 -; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v6, 16, v2 -; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 +; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 +; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -2622,7 
+2621,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1 @@ -2635,79 +2633,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1] ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1 ; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 ; 
GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v15 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; 
GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v17 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5 -; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15 ; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, v4, v14 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v7, 
v6 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v18, 12, v18 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1 ; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11 -; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v18 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0 -; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5 -; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v6, 16, v0 -; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9 +; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10 +; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4 +; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0 +; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1 ; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NOXNACK-NEXT: 
v_pk_mul_lo_u16 v0, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 0b99cb5cdc81..7994f83fa431 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -2355,7 +2355,6 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -2369,49 +2368,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: 
v_bfe_u32 v4, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; 
GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 -; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 @@ -3112,7 +3111,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s10, -1 @@ -3126,49 +3124,49 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v13, 
v4, v13 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 +; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4 -; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6 -; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 +; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-DL-NEXT: 
v_pk_mul_lo_u16 v7, v7, v9 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 +; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4 +; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5 +; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9 -; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10 -; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8 -; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7 +; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 ; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index 90c248db9877..2cc6ea123fe6 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -341,12 +341,11 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08] ; GFX9: buffer_store_dword [[REG]] -; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding ; VI-DAG: buffer_load_dword ; VI-NOT: and -; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0x6400, v{{[0-9]+}} ; gfx8 does not support sreg or imm in sdwa - this will be move then -; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] +; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x6400 ; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, 
v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: buffer_store_dword diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 624a6caf9487..a0b20f4788ab 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -481,9 +481,8 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s6, s6, 3 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6 -; GCN-NEXT: s_mov_b32 s6, 0x1010101 -; GCN-NEXT: s_and_b32 s7, s5, s6 -; GCN-NEXT: s_and_b32 s6, s4, s6 +; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 +; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] ; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] ; GCN-NEXT: v_mov_b32_e32 v2, s2 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 4de859f473df..1b13cc34f3ef 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1072,9 +1072,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* % ; SI-NEXT: s_lshl_b32 s8, s6, 4 ; SI-NEXT: s_mov_b64 s[6:7], 0xffff ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 -; SI-NEXT: s_mov_b32 s8, 0x50005 -; SI-NEXT: s_and_b32 s9, s7, s8 -; SI-NEXT: s_and_b32 s8, s6, s8 +; SI-NEXT: s_and_b32 s9, s7, 0x50005 +; SI-NEXT: s_and_b32 s8, s6, 0x50005 ; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] ; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s5 @@ -1248,9 +1247,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* % ; SI-NEXT: s_lshl_b32 s8, s8, 3 ; SI-NEXT: s_mov_b64 s[2:3], 0xffff ; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; SI-NEXT: s_mov_b32 s8, 0x5050505 -; SI-NEXT: s_and_b32 s9, s3, s8 -; 
SI-NEXT: s_and_b32 s8, s2, s8 +; SI-NEXT: s_and_b32 s9, s3, 0x5050505 +; SI-NEXT: s_and_b32 s8, s2, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] @@ -1272,9 +1270,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* % ; VI-NEXT: s_lshl_b32 s8, s8, 3 ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 -; VI-NEXT: s_mov_b32 s8, 0x5050505 -; VI-NEXT: s_and_b32 s9, s3, s8 -; VI-NEXT: s_and_b32 s8, s2, s8 +; VI-NEXT: s_and_b32 s9, s3, 0x5050505 +; VI-NEXT: s_and_b32 s8, s2, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index a80604fde3ce..ee401344580d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1606,7 +1606,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; VI-NEXT: s_mov_b64 s[2:3], 0xffff ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s4, 16 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_or_b32 s0, s4, s1 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: s_lshl_b32 s1, s4, 16 -; CI-NEXT: s_and_b32 s4, s4, s2 +; CI-NEXT: s_and_b32 s4, s4, 0xffff ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 ; CI-NEXT: s_or_b32 s0, s4, s1 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc @@ -1691,7 +1691,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_lshl_b32 s1, s5, 4 ; VI-NEXT: s_lshl_b32 s5, 
s4, 16 -; VI-NEXT: s_and_b32 s4, s4, s2 +; VI-NEXT: s_and_b32 s4, s4, 0xffff ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 ; VI-NEXT: s_or_b32 s2, s4, s5 @@ -1716,7 +1716,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; CI-NEXT: s_mov_b64 s[2:3], 0xffff ; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: s_and_b32 s6, s4, s2 +; CI-NEXT: s_and_b32 s6, s4, 0xffff ; CI-NEXT: s_lshl_b32 s1, s5, 4 ; CI-NEXT: s_lshl_b32 s4, s4, 16 ; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll index 0ae12149de21..a5129279ddef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.d16.ll @@ -37,11 +37,10 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index b3c66ef9284c..7367f9c31566 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -566,10 +566,9 @@ 
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -602,12 +601,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v12, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, v5 -; GFX10-NEXT: v_and_b32_e32 v5, v2, v6 -; GFX10-NEXT: v_and_b32_e32 v3, v2, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v5 +; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 @@ -653,10 +651,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -705,10 +702,9 @@ define amdgpu_ps <4 x 
float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -759,10 +755,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10-LABEL: sample_c_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -807,10 +802,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v6, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 @@ -857,10 +851,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v7, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v7, 
v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -909,10 +902,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 @@ -963,10 +955,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10-LABEL: sample_c_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff -; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 @@ -1160,15 +1151,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 
v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -1197,15 +1187,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v13, v8 -; GFX10-NEXT: v_mov_b32_e32 v8, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v9, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v6 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v8, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1 +; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll index d23caa43bbb6..e3f2269bbe0b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.a16.dim.ll @@ -62,11 +62,10 @@ define amdgpu_ps <4 x float> 
@sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; ; GFX10GISEL-LABEL: sample_d_3d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -152,11 +151,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; ; GFX10GISEL-LABEL: sample_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -203,11 +201,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10GISEL-LABEL: sample_c_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt 
vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -333,11 +330,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; ; GFX10GISEL-LABEL: sample_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -384,11 +380,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10GISEL-LABEL: sample_c_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -415,11 +410,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; 
GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -446,11 +440,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7 -; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7 +; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12 ; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -490,10 +483,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_1d(<8 x i32> inreg %rsrc, <4 x ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -505,9 +497,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, 
v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -516,11 +507,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -534,10 +524,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -547,14 +536,13 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x ; GFX10GISEL: ; %bb.0: ; %main_body ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3 -; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; 
GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v9, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, s12 ; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -572,10 +560,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_1d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -587,9 +574,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ 
-598,11 +584,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -620,10 +605,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_1d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -635,9 +619,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: 
image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -646,11 +629,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -668,10 +650,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_1d(<8 x i32> inreg %rsrc, ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -685,11 +666,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 
+; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -699,11 +679,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc, ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -721,10 +700,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_1d(<8 x i32> inreg %rsrc, <4 x ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -736,9 +714,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_cd_2d: ; GFX10: ; %bb.0: ; 
%main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -747,11 +724,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -769,10 +745,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_1d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -784,9 +759,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, 
float %s, float %t) { ; GFX10-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -795,11 +769,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4 ; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -817,10 +790,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_1d(<8 x i32> inreg %rsrc, < ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -832,9 +804,8 @@ main_body: define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> 
inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -843,11 +814,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, < ; ; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3 +; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3 ; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -865,10 +835,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_1d(<8 x i32> inreg %rsrc, ; ; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d: ; GFX10GISEL: ; %bb.0: ; %main_body -; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff ; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16 -; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12 -; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12 +; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part 
epilog @@ -882,11 +851,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -896,11 +864,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc, ; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4 -; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4 +; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0 ; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -913,14 +880,13 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; 
GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -932,11 +898,10 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog @@ -949,14 +914,13 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg ; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; 
GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -968,11 +932,10 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg ; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4 ; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1 -; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9 -; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1 -; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9 +; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5 +; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0 +; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1 ; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10GISEL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index 75990337c8a3..ffab03057c2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -15,9 +15,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; 
GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] @@ -60,9 +58,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; 
GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -87,9 +84,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -116,11 +112,10 @@ define amdgpu_ps <4 
x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: 
[0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00] @@ -170,9 +164,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36] +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06] @@ -197,9 +190,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: 
[0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06] @@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ define 
amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00] ; 
GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e] ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e] -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36] -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36] -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04] -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04] +; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e] +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00] +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04] +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04] ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index e0a18eb9ae66..ec12dffc8998 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -15,9 +15,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 
16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff -; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 -; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -60,9 +58,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_d_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -87,9 +84,8 @@ main_body: define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, 
v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -116,11 +112,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -143,9 +138,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -170,9 +164,8 @@ main_body: define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) { ; GFX10-LABEL: sample_c_cd_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 -; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 +; GFX10-NEXT: v_and_b32_e32 
v3, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -197,9 +190,8 @@ main_body: define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) { ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D @@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -243,14 +234,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-LABEL: sample_c_d_o_2darray_V1: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: 
v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10-LABEL: sample_c_d_o_2darray_V2: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, v2 -; GFX10-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 +; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll index 08551a22e6aa..9540524b9d2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 
s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -56,11 +55,10 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll index fec7954a0225..bd30de55ae0c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], 
s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -60,11 +59,10 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll index 30361a2b36ed..f5b36f9a25c3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.format.d16.ll @@ -31,10 +31,9 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 
[[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -56,11 +55,10 @@ main_body: ; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll index 5d05c955ddd0..8a1819d9a292 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.store.d16.ll @@ -35,10 +35,9 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]] @@ -59,11 +58,10 @@ main_body: ; GCN-LABEL: 
{{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll index 9da9ca2cc99e..bcd7c4a71b29 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.d16.ll @@ -31,10 +31,9 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_d16_xyz: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] -; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} +; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] @@ -53,11 +52,10 @@ main_body: ; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw: ; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10 -; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16 -; UNPACKED-DAG: 
s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}} ; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16 -; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]] +; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}} ; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]] ; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll index 0b47c934ee79..e3572de13fae 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.f16.ll @@ -33,8 +33,8 @@ entry: ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218 -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c -; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -49,7 +49,8 @@ entry: ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll index 2851b270d93b..d5cb21cfd0a7 
100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.f16.ll @@ -33,8 +33,8 @@ entry: ; VI: flat_load_dword v[[A_F16_0:[0-9]+]] ; GFX9: global_load_dword v[[A_F16_0:[0-9]+]] ; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a -; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 -; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]] +; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1 +; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]] @@ -48,8 +48,9 @@ entry: ; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]] ; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]] ; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]] -; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] +; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]] ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]] ; SI-NOT: v_and_b32_e32 ; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index f73ed62f20f8..995379cdf5b1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -147,21 +147,19 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_movk_i32 s7, 0xfc01 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; 
SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 -; SI-NEXT: s_add_i32 s14, s0, s7 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14 -; SI-NEXT: s_brev_b32 s15, 1 +; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 ; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s15 -; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: v_mov_b32_e32 v0, s13 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s11 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -172,23 +170,23 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s7, s0, s7 -; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01 +; SI-NEXT: s_brev_b32 s7, -2 ; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7 -; SI-NEXT: v_bfi_b32 v4, s10, v6, v4 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10 +; SI-NEXT: v_bfi_b32 v4, s7, v6, v4 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s15 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s3 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -200,7 +198,7 @@ define 
amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou ; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v7, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 -; SI-NEXT: v_bfi_b32 v6, s10, v6, v7 +; SI-NEXT: v_bfi_b32 v6, s7, v6, v7 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5] @@ -245,22 +243,20 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: s_movk_i32 s18, 0xfc01 ; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xfffff +; SI-NEXT: s_mov_b32 s2, s14 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014 -; SI-NEXT: s_add_i32 s19, s0, s18 -; SI-NEXT: s_mov_b32 s2, s14 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19 -; SI-NEXT: s_brev_b32 s20, 1 +; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18 ; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1] -; SI-NEXT: s_and_b32 s0, s7, s20 -; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_and_b32 s0, s7, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s18, 0 ; SI-NEXT: v_mov_b32_e32 v0, s17 ; SI-NEXT: v_mov_b32_e32 v1, s0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cmp_gt_i32 s18, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 @@ -271,7 +267,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014 -; SI-NEXT: s_add_i32 s17, s0, s18 +; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01 ; SI-NEXT: s_brev_b32 s16, -2 ; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 @@ -279,7 +275,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, 
<4 x dou ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17 ; SI-NEXT: v_bfi_b32 v4, s16, v12, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1] -; SI-NEXT: s_and_b32 s0, s5, s20 +; SI-NEXT: s_and_b32 s0, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s17, 0 @@ -298,12 +294,12 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] ; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] -; SI-NEXT: s_add_i32 s6, s0, s18 +; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1] -; SI-NEXT: s_and_b32 s0, s11, s20 +; SI-NEXT: s_and_b32 s0, s11, 0x80000000 ; SI-NEXT: v_bfi_b32 v6, s16, v12, v6 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc @@ -321,13 +317,13 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014 -; SI-NEXT: s_add_i32 s4, s0, s18 +; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v10, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4 ; SI-NEXT: v_bfi_b32 v10, s16, v12, v10 ; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1] -; SI-NEXT: s_and_b32 s0, s9, s20 +; SI-NEXT: s_and_b32 s0, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s4, 0 @@ -412,21 +408,20 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19 ; SI-NEXT: s_mov_b32 s22, -1 -; SI-NEXT: s_movk_i32 s28, 0xfc01 ; SI-NEXT: s_mov_b32 s21, 0xfffff ; SI-NEXT: s_mov_b32 s20, s22 +; SI-NEXT: 
v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014 -; SI-NEXT: s_add_i32 s23, s2, s28 -; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s23 -; SI-NEXT: s_brev_b32 s29, 1 +; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26 +; SI-NEXT: s_and_b32 s23, s7, 0x80000000 ; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3] -; SI-NEXT: s_and_b32 s2, s7, s29 -; SI-NEXT: s_cmp_lt_i32 s23, 0 +; SI-NEXT: s_cmp_lt_i32 s26, 0 ; SI-NEXT: v_mov_b32_e32 v0, s25 -; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: v_mov_b32_e32 v1, s23 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: s_cmp_gt_i32 s23, 51 +; SI-NEXT: s_cmp_gt_i32 s26, 51 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 @@ -437,15 +432,14 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3] ; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 -; SI-NEXT: s_add_i32 s24, s2, s28 +; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01 ; SI-NEXT: s_brev_b32 s23, -2 -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: v_mov_b32_e32 v4, s7 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 ; SI-NEXT: v_bfi_b32 v4, s23, v8, v4 ; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3] -; SI-NEXT: s_and_b32 s2, s5, s29 +; SI-NEXT: s_and_b32 s2, s5, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc ; SI-NEXT: v_mov_b32_e32 v2, 0 ; SI-NEXT: s_cmp_lt_i32 s24, 0 @@ -464,13 +458,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] ; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1] ; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 
; SI-NEXT: v_bfi_b32 v6, s23, v8, v6 ; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3] -; SI-NEXT: s_and_b32 s2, s11, s29 +; SI-NEXT: s_and_b32 s2, s11, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc ; SI-NEXT: v_mov_b32_e32 v4, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -489,13 +483,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3] ; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5] ; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 -; SI-NEXT: s_add_i32 s6, s2, s28 +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 ; SI-NEXT: v_mov_b32_e32 v9, s11 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 ; SI-NEXT: v_bfi_b32 v9, s23, v8, v9 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3] -; SI-NEXT: s_and_b32 s2, s9, s29 +; SI-NEXT: s_and_b32 s2, s9, 0x80000000 ; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc ; SI-NEXT: v_mov_b32_e32 v6, 0 ; SI-NEXT: s_cmp_lt_i32 s6, 0 @@ -514,12 +508,12 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3] ; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 ; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5] -; SI-NEXT: s_add_i32 s4, s2, s28 +; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 ; SI-NEXT: v_mov_b32_e32 v11, s9 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5 ; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3] -; SI-NEXT: s_and_b32 s2, s15, s29 +; SI-NEXT: s_and_b32 s2, s15, 0x80000000 ; SI-NEXT: v_bfi_b32 v11, s23, v8, v11 ; SI-NEXT: s_cmp_lt_i32 s4, 0 ; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc @@ -530,10 +524,10 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: v_mov_b32_e32 v10, s2 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s6, s4, s28 +; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6 ; SI-NEXT: s_andn2_b64 
s[26:27], s[12:13], s[4:5] -; SI-NEXT: s_and_b32 s4, s13, s29 +; SI-NEXT: s_and_b32 s4, s13, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s25 ; SI-NEXT: s_cmp_lt_i32 s6, 0 ; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc @@ -542,30 +536,30 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou ; SI-NEXT: s_cmp_gt_i32 s6, 51 ; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 -; SI-NEXT: s_add_i32 s25, s8, s28 -; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25 -; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9] -; SI-NEXT: s_and_b32 s8, s19, s29 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 +; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], s[8:9] +; SI-NEXT: s_and_b32 s8, s19, 0x80000000 ; SI-NEXT: v_mov_b32_e32 v9, s27 -; SI-NEXT: s_cmp_lt_i32 s25, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5] -; SI-NEXT: v_mov_b32_e32 v9, s11 +; SI-NEXT: v_mov_b32_e32 v9, s29 ; SI-NEXT: v_mov_b32_e32 v10, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: s_cmp_gt_i32 s25, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9] ; SI-NEXT: v_mov_b32_e32 v10, s19 -; SI-NEXT: v_mov_b32_e32 v11, s10 ; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11] -; SI-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[8:9] +; SI-NEXT: v_mov_b32_e32 v9, s28 +; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9] ; SI-NEXT: v_mov_b32_e32 v11, s18 ; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014 ; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11] -; SI-NEXT: s_add_i32 s10, s8, s28 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 ; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10 ; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9] -; SI-NEXT: s_and_b32 s8, s17, s29 +; SI-NEXT: s_and_b32 s8, s17, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v11, s21 ; SI-NEXT: v_mov_b32_e32 v12, s8 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll 
b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index 23150da4e53b..f0fece24c703 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1027,31 +1027,29 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace( ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s6 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32: ; GCN-HSA: ; %bb.0: ; %entry ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s3, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: s_and_b32 s1, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1 @@ -1061,21 +1059,18 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace( ; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32: 
; GCN-NOHSA-VI: ; %bb.0: ; %entry ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v3i16_to_v3i32: @@ -1202,12 +1197,11 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace( ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s2 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s2 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s7 @@ -1219,7 +1213,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace( ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -1227,8 +1220,8 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace( ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s0, s3, 16 ; GCN-HSA-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s3, s3, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s4 +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 @@ -1239,23 +1232,20 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace( ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, s8 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 -; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v4i16_to_v4i32: @@ -1396,26 +1386,25 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace( ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s8 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1424,22 +1413,21 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace( ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s6, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s6, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s6, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1455,33 +1443,30 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace( ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NOHSA-VI-NEXT: 
s_mov_b32 s6, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s9, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s2 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s10, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; ; EG-LABEL: constant_zextload_v8i16_to_v8i32: @@ -1662,46 +1647,45 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa ; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s12 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s12 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 
s8, s8, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -1710,7 +1694,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s4, 16 @@ -1718,24 +1701,24 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> 
addrspa ; GCN-HSA-NEXT: s_lshr_b32 s15, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s2 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s10, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_lshr_b32 s2, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s3, s10, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1766,46 +1749,43 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspa ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt 
lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s20 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s18 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s18 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s14 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v3, s16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 @@ -2097,282 +2077,277 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspa ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 
s36, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s15, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s14, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, 
s14, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; 
GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: 
s_mov_b32 s18, 0xffff +; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s19, s1, s18 -; GCN-HSA-NEXT: s_and_b32 s20, s0, s18 -; GCN-HSA-NEXT: s_and_b32 s21, s3, s18 -; GCN-HSA-NEXT: s_and_b32 s22, s2, s18 -; GCN-HSA-NEXT: s_and_b32 s23, s5, s18 -; GCN-HSA-NEXT: s_and_b32 s24, s4, s18 -; GCN-HSA-NEXT: s_and_b32 s25, s7, s18 -; GCN-HSA-NEXT: s_and_b32 s26, s6, s18 -; GCN-HSA-NEXT: s_and_b32 s27, s9, s18 -; GCN-HSA-NEXT: s_and_b32 s28, s8, s18 -; GCN-HSA-NEXT: s_and_b32 s29, s11, s18 -; GCN-HSA-NEXT: s_and_b32 s30, s10, s18 -; GCN-HSA-NEXT: s_and_b32 s31, s13, s18 -; GCN-HSA-NEXT: s_and_b32 s33, s12, s18 -; GCN-HSA-NEXT: s_and_b32 s34, s15, s18 -; GCN-HSA-NEXT: s_and_b32 s18, s14, s18 -; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s4, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s8, 16 +; 
GCN-HSA-NEXT: s_lshr_b32 s26, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s35, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s36, s18, 16 +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s12 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v6, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; 
GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i32: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 -; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s15, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s37, s14, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s13, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s12, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, s22 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s36 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s11, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s10, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, s22 
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s35 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s9, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s9, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s34 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s29, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s7, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s7, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s31 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s30 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s27, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s5, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v3, s29 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s28 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s25, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s3, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s27 -; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, s22 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s26 +; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s23, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s1, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25 -; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s24 +; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s0, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-VI-NEXT: 
v_mov_b32_e32 v3, s22 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0 @@ -2884,101 +2859,100 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s20, 0xffff ; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s1, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s0, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s15, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s14, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s1, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s0, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s3, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s55, s2, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s20 -; 
GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s56, s37, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s57, s36, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s58, s39, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s59, s38, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s60, s41, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s61, s40, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s62, s43, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s63, s42, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s64, s45, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s65, s44, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s66, s47, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s67, s46, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s68, s49, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s69, s48, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s70, s51, s20 -; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s50, s20 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s46, s46, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s50, s50, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s43, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s2, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s4, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 
s30, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s14, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s1, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s52, s0, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s53, s3, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s54, s2, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s55, s37, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s36, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s57, s39, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s38, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s59, s41, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s40, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s61, s43, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s62, s42, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s63, s45, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s64, s44, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s65, s47, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s66, s46, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s67, s49, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s68, s48, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s69, s51, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s70, s50, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s42, s42, 0xffff 
+; GCN-NOHSA-SI-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s47, s47, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s50, s50, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s41, s41, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s50 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s51 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s69 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s48 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s68 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s49 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s67 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s46 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s66 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s47 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s65 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s44 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s64 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s45 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s63 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s42 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s62 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s61 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s43 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s40 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s60 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s51 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s68 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s49 +; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v7, s67 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s46 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s66 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s47 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s65 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s44 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s64 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s45 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s63 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s42 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s62 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s43 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s40 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s61 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, s60 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s41 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s59 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208 @@ -2986,63 +2960,63 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s38 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s58 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s58 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s39 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s57 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s57 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s56 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 
s56 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s37 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s34 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s33 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s31 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s30 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s29 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s28 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s27 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s27 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s25 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s24 ; 
GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s23 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s22 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s54 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s53 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm @@ -3050,7 +3024,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s53, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 ; GCN-HSA-NEXT: s_load_dwordx16 s[36:51], s[18:19], 0x10 @@ -3071,54 +3044,54 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: s_lshr_b32 s34, s12, 16 ; GCN-HSA-NEXT: s_lshr_b32 s35, s15, 16 ; GCN-HSA-NEXT: s_lshr_b32 s52, s14, 16 -; GCN-HSA-NEXT: s_and_b32 s1, s1, s53 -; 
GCN-HSA-NEXT: s_and_b32 s0, s0, s53 -; GCN-HSA-NEXT: s_and_b32 s3, s3, s53 -; GCN-HSA-NEXT: s_and_b32 s2, s2, s53 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s53 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s53 -; GCN-HSA-NEXT: s_and_b32 s54, s7, s53 -; GCN-HSA-NEXT: s_and_b32 s55, s6, s53 -; GCN-HSA-NEXT: s_and_b32 s9, s9, s53 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s53 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s53 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s53 -; GCN-HSA-NEXT: s_and_b32 s13, s13, s53 -; GCN-HSA-NEXT: s_and_b32 s12, s12, s53 -; GCN-HSA-NEXT: s_and_b32 s15, s15, s53 -; GCN-HSA-NEXT: s_and_b32 s14, s14, s53 -; GCN-HSA-NEXT: s_and_b32 s18, s37, s53 -; GCN-HSA-NEXT: s_and_b32 s19, s36, s53 -; GCN-HSA-NEXT: s_and_b32 s56, s39, s53 -; GCN-HSA-NEXT: s_and_b32 s57, s38, s53 -; GCN-HSA-NEXT: s_and_b32 s58, s41, s53 -; GCN-HSA-NEXT: s_and_b32 s59, s40, s53 -; GCN-HSA-NEXT: s_and_b32 s60, s43, s53 -; GCN-HSA-NEXT: s_and_b32 s61, s42, s53 -; GCN-HSA-NEXT: s_and_b32 s62, s45, s53 -; GCN-HSA-NEXT: s_and_b32 s63, s44, s53 -; GCN-HSA-NEXT: s_and_b32 s64, s47, s53 -; GCN-HSA-NEXT: s_and_b32 s65, s46, s53 -; GCN-HSA-NEXT: s_and_b32 s66, s49, s53 -; GCN-HSA-NEXT: s_and_b32 s67, s48, s53 -; GCN-HSA-NEXT: s_and_b32 s68, s51, s53 -; GCN-HSA-NEXT: s_and_b32 s53, s50, s53 -; GCN-HSA-NEXT: s_lshr_b32 s37, s37, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s36, 16 -; GCN-HSA-NEXT: s_lshr_b32 s39, s39, 16 -; GCN-HSA-NEXT: s_lshr_b32 s38, s38, 16 -; GCN-HSA-NEXT: s_lshr_b32 s41, s41, 16 -; GCN-HSA-NEXT: s_lshr_b32 s40, s40, 16 -; GCN-HSA-NEXT: s_lshr_b32 s43, s43, 16 -; GCN-HSA-NEXT: s_lshr_b32 s42, s42, 16 -; GCN-HSA-NEXT: s_lshr_b32 s45, s45, 16 -; GCN-HSA-NEXT: s_lshr_b32 s44, s44, 16 -; GCN-HSA-NEXT: s_lshr_b32 s47, s47, 16 -; GCN-HSA-NEXT: s_lshr_b32 s46, s46, 16 -; GCN-HSA-NEXT: s_lshr_b32 s49, s49, 16 -; GCN-HSA-NEXT: s_lshr_b32 s48, s48, 16 -; GCN-HSA-NEXT: s_lshr_b32 s51, s51, 16 -; GCN-HSA-NEXT: s_lshr_b32 s50, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff +; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff +; 
GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s53, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s54, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_lshr_b32 s18, s37, 16 +; GCN-HSA-NEXT: s_lshr_b32 s19, s36, 16 +; GCN-HSA-NEXT: s_lshr_b32 s55, s39, 16 +; GCN-HSA-NEXT: s_lshr_b32 s56, s38, 16 +; GCN-HSA-NEXT: s_lshr_b32 s57, s41, 16 +; GCN-HSA-NEXT: s_lshr_b32 s58, s40, 16 +; GCN-HSA-NEXT: s_lshr_b32 s59, s43, 16 +; GCN-HSA-NEXT: s_lshr_b32 s60, s42, 16 +; GCN-HSA-NEXT: s_lshr_b32 s61, s45, 16 +; GCN-HSA-NEXT: s_lshr_b32 s62, s44, 16 +; GCN-HSA-NEXT: s_lshr_b32 s63, s47, 16 +; GCN-HSA-NEXT: s_lshr_b32 s64, s46, 16 +; GCN-HSA-NEXT: s_lshr_b32 s65, s49, 16 +; GCN-HSA-NEXT: s_lshr_b32 s66, s48, 16 +; GCN-HSA-NEXT: s_lshr_b32 s67, s51, 16 +; GCN-HSA-NEXT: s_lshr_b32 s68, s50, 16 +; GCN-HSA-NEXT: s_and_b32 s37, s37, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s36, 0xffff +; GCN-HSA-NEXT: s_and_b32 s39, s39, 0xffff +; GCN-HSA-NEXT: s_and_b32 s38, s38, 0xffff +; GCN-HSA-NEXT: s_and_b32 s41, s41, 0xffff +; GCN-HSA-NEXT: s_and_b32 s40, s40, 0xffff +; GCN-HSA-NEXT: s_and_b32 s43, s43, 0xffff +; GCN-HSA-NEXT: s_and_b32 s42, s42, 0xffff +; GCN-HSA-NEXT: s_and_b32 s45, s45, 0xffff +; GCN-HSA-NEXT: s_and_b32 s44, s44, 0xffff +; GCN-HSA-NEXT: s_and_b32 s47, s47, 0xffff +; GCN-HSA-NEXT: s_and_b32 s46, s46, 0xffff +; GCN-HSA-NEXT: s_and_b32 s49, s49, 0xffff +; GCN-HSA-NEXT: s_and_b32 s48, s48, 0xffff +; GCN-HSA-NEXT: s_and_b32 s51, s51, 0xffff +; GCN-HSA-NEXT: s_and_b32 s50, s50, 0xffff ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 
; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 @@ -3144,10 +3117,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s48 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s48 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 @@ -3157,47 +3130,47 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s42 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s53 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s68 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s51 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v11, s47 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s63 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s62 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s64 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s47 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s63 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s44 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s62 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s45 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s61 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40 ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s57 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s58 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s58 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s38 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s41 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s56 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s15 @@ -3230,9 +3203,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -3268,160 +3241,147 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspa ; GCN-NOHSA-VI-LABEL: constant_zextload_v64i16_to_v64i32: ; GCN-NOHSA-VI: ; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[36:39], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, 0xffff ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[16:31], s[38:39], 0x0 ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[38:39], 0x40 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s20, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s23, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s22, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s25, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s24, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s27, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s26, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 
s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s29, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s28, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s31, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s30, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s53, s1, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s55, s0, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s57, s3, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s59, s2, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s5, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s4, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s7, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s6, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s64, s9, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s65, s8, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s66, s11, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s67, s10, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s68, s13, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s69, s12, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s70, s15, s40 -; GCN-NOHSA-VI-NEXT: s_and_b32 s40, s14, s40 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s1, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s0, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s3, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s2, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s49, s31, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s69, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s70, s14, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s51, s1, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s52, s1, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 
s53, s0, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s54, s0, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s55, s3, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s56, s3, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s57, s2, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s58, s2, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s67, s13, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s12, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s36 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s37 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s14 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s70 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s65, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s10, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s69 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s12 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s63, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s68 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s8, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 
offset:224 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s67 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s66 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s61, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s66 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s65 +; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s65 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s64 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s59, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s64 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63 +; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s4, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s31, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s63 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s62 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s62 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s61 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s30, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s61 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s60 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s60 
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s59 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s58 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s57 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s56 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s58 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s56 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s55 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s54 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s53 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s52 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s47, s29, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s54 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s51 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s29, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s28, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s45, s27, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s50 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s50 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s27, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s26, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s43, s25, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 +; 
GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s48 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s48 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s47 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s25, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s24, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s41, s23, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s46 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s46 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s45 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s23, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s22, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s39, s21, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s44 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s44 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s21, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s20, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s35, s19, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s42 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s23 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s42 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s41 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s18, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff ; 
GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s33, s17, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s40 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s39 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s34, s16, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 @@ -4847,52 +4807,48 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace( ; GCN-NOHSA-SI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-SI: ; %bb.0: ; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s0, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s3, s3, s8 -; GCN-NOHSA-SI-NEXT: s_and_b32 s2, s2, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_lshr_b32 s5, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16 -; GCN-HSA-NEXT: s_and_b32 s7, s2, s4 -; GCN-HSA-NEXT: s_and_b32 s2, s3, s4 +; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16 +; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16 +; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xffff +; GCN-HSA-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4900,19 +4856,18 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace( ; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64: ; GCN-NOHSA-VI: 
; %bb.0: ; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0 ; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s2, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, s8 +; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s3, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 @@ -5090,39 +5045,36 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace( define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, 
s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s8, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s6 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s9, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5133,19 +5085,18 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace( ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 
0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s9, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s10, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s7, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s7, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5173,39 +5124,35 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace( ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s8, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s8, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, s6 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s9, 16 -; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, s6 -; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s11, s6 -; 
GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s7, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s6, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s5, 0xffff +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-VI-NEXT: s_nop 0 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-VI-NEXT: s_endpgm ; @@ -5464,39 +5411,36 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace( define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x9 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-SI-NEXT: 
s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 +; GCN-NOHSA-SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s13 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 16 ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s9, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s4, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s14 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s14 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: 
v_mov_b32_e32 v2, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 @@ -5508,19 +5452,19 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; @@ -5531,27 +5475,26 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0 -; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_lshr_b32 s12, 
s5, 16 ; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16 ; GCN-HSA-NEXT: s_lshr_b32 s14, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s9, 16 ; GCN-HSA-NEXT: s_lshr_b32 s15, s8, 16 ; GCN-HSA-NEXT: s_lshr_b32 s16, s10, 16 ; GCN-HSA-NEXT: s_lshr_b32 s17, s6, 16 ; GCN-HSA-NEXT: s_lshr_b32 s18, s4, 16 -; GCN-HSA-NEXT: s_and_b32 s4, s4, s2 -; GCN-HSA-NEXT: s_and_b32 s6, s6, s2 -; GCN-HSA-NEXT: s_and_b32 s10, s10, s2 -; GCN-HSA-NEXT: s_and_b32 s8, s8, s2 -; GCN-HSA-NEXT: s_and_b32 s5, s5, s2 -; GCN-HSA-NEXT: s_and_b32 s7, s7, s2 -; GCN-HSA-NEXT: s_and_b32 s11, s11, s2 -; GCN-HSA-NEXT: s_and_b32 s2, s9, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s3, s9, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -5607,50 +5550,47 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspa ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[14:15], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s14, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s12 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s13 +; GCN-NOHSA-VI-NEXT: 
s_load_dwordx8 s[4:11], s[2:3], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s6, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s7, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s9, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, s14 -; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s11, s14 +; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s11, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s10, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s9, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s7, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: 
s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s5, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s4, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s13 @@ -6109,353 +6049,346 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspa define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 { ; GCN-NOHSA-SI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-SI: ; %bb.0: -; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x9 +; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s1, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s3, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s5, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s7, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s9, 
16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s11, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s13, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s15, 16 -; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s0, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s2, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s4, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s6, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s8, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s12, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s14, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s1, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s37, s3, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, s18 -; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, s18 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s2, 16 -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s0, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s5, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s21, s7, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s9, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s23, s11, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s13, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s25, s15, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s17, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s27, s19, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s18, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s29, s16, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s14, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s31, s12, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s33, s10, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s8, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s35, s6, 16 +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s4, 16 +; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 
0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1 ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 -; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s27 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s26 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s25 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s13 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s23 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s22 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s37 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s21 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s19 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s5 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s29 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s10 +; 
GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s30 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s12 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s31 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s33 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s35 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32 ; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0) -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s38 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36 ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GCN-NOHSA-SI-NEXT: s_endpgm ; ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-HSA: ; %bb.0: -; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x0 +; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 -; GCN-HSA-NEXT: 
s_mov_b32 s18, 0xffff +; GCN-HSA-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) -; GCN-HSA-NEXT: s_and_b32 s19, s0, s18 -; GCN-HSA-NEXT: s_and_b32 s20, s2, s18 -; GCN-HSA-NEXT: s_and_b32 s21, s4, s18 -; GCN-HSA-NEXT: s_and_b32 s22, s6, s18 -; GCN-HSA-NEXT: s_and_b32 s23, s8, s18 -; GCN-HSA-NEXT: s_and_b32 s24, s10, s18 -; GCN-HSA-NEXT: s_and_b32 s25, s12, s18 -; GCN-HSA-NEXT: s_and_b32 s26, s14, s18 -; GCN-HSA-NEXT: s_and_b32 s27, s1, s18 -; GCN-HSA-NEXT: s_and_b32 s28, s3, s18 -; GCN-HSA-NEXT: s_and_b32 s29, s5, s18 -; GCN-HSA-NEXT: s_and_b32 s30, s7, s18 -; GCN-HSA-NEXT: s_and_b32 s31, s9, s18 -; GCN-HSA-NEXT: s_and_b32 s33, s11, s18 -; GCN-HSA-NEXT: s_and_b32 s34, s13, s18 -; GCN-HSA-NEXT: s_and_b32 s18, s15, s18 -; GCN-HSA-NEXT: s_lshr_b32 s35, s1, 16 -; GCN-HSA-NEXT: s_lshr_b32 s3, s3, 16 -; GCN-HSA-NEXT: s_lshr_b32 s5, s5, 16 -; GCN-HSA-NEXT: s_lshr_b32 s7, s7, 16 -; GCN-HSA-NEXT: s_lshr_b32 s9, s9, 16 -; GCN-HSA-NEXT: s_lshr_b32 s11, s11, 16 -; GCN-HSA-NEXT: s_lshr_b32 s13, s13, 16 -; GCN-HSA-NEXT: s_lshr_b32 s15, s15, 16 -; GCN-HSA-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-HSA-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-HSA-NEXT: s_lshr_b32 s10, s10, 16 -; GCN-HSA-NEXT: s_lshr_b32 s8, s8, 16 -; GCN-HSA-NEXT: s_lshr_b32 s6, s6, 16 -; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 -; GCN-HSA-NEXT: s_lshr_b32 s2, s2, 16 -; GCN-HSA-NEXT: s_lshr_b32 s36, s0, 16 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; 
GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 +; GCN-HSA-NEXT: s_lshr_b32 s3, s5, 16 +; GCN-HSA-NEXT: s_lshr_b32 s20, s7, 16 +; GCN-HSA-NEXT: s_lshr_b32 s21, s9, 16 +; GCN-HSA-NEXT: s_lshr_b32 s22, s11, 16 +; GCN-HSA-NEXT: s_lshr_b32 s23, s13, 16 +; GCN-HSA-NEXT: s_lshr_b32 s24, s15, 16 +; GCN-HSA-NEXT: s_lshr_b32 s25, s17, 16 +; GCN-HSA-NEXT: s_lshr_b32 s26, s19, 16 +; GCN-HSA-NEXT: s_lshr_b32 s27, s18, 16 +; GCN-HSA-NEXT: s_lshr_b32 s28, s16, 16 +; GCN-HSA-NEXT: s_lshr_b32 s29, s14, 16 +; GCN-HSA-NEXT: s_lshr_b32 s30, s12, 16 +; GCN-HSA-NEXT: s_lshr_b32 s31, s10, 16 +; GCN-HSA-NEXT: s_lshr_b32 s33, s8, 16 +; GCN-HSA-NEXT: s_lshr_b32 s34, s6, 16 +; GCN-HSA-NEXT: s_lshr_b32 s2, s4, 16 +; GCN-HSA-NEXT: s_and_b32 s35, s4, 0xffff +; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff +; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff +; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff +; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff +; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff +; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff +; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff +; GCN-HSA-NEXT: s_and_b32 s36, s5, 0xffff +; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff +; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff +; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff +; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff +; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff +; GCN-HSA-NEXT: s_and_b32 s17, s17, 0xffff +; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90 
+; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 
v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xc0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xa0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x80 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 +; GCN-HSA-NEXT: 
v_mov_b32_e32 v2, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s35 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, 
s8 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s21 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: constant_zextload_v32i16_to_v32i64: ; GCN-NOHSA-VI: ; %bb.0: -; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[20:23], s[0:1], 0x24 +; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[16:19], s[0:1], 0x24 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[22:23], 0x0 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s22, 0xffff -; GCN-NOHSA-VI-NEXT: s_mov_b32 s16, s20 -; GCN-NOHSA-VI-NEXT: s_mov_b32 s17, s21 +; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s19, 0xf000 +; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, -1 ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s21, 
s1, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s2, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s3, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s4, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s5, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s6, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s7, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s8, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s9, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s10, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s11, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s13, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s14, s22 -; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s15, s22 +; GCN-NOHSA-VI-NEXT: s_and_b32 s36, s15, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s15, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s14, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s14, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s13, 0xffff ; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s13, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:240 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: s_and_b32 s33, s12, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s35 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s12, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:224 +; GCN-NOHSA-VI-NEXT: s_and_b32 s31, s11, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NOHSA-VI-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s11, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:208 +; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s10, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s33 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s10, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:192 +; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s9, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s31 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s9, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:176 +; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s8, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s8, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:160 +; GCN-NOHSA-VI-NEXT: s_and_b32 s27, s7, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s29 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s7, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:144 +; GCN-NOHSA-VI-NEXT: 
s_and_b32 s26, s6, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:128 +; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s5, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s27 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s5, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:112 +; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s4, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:96 +; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s3, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s25 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s3, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:80 +; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s2, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 -; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 
s2, s2, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:64 +; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s1, 0xffff ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s1, 16 +; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:48 +; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s0, 0xffff +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:32 ; GCN-NOHSA-VI-NEXT: s_nop 0 ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 29c8d6a8b21d..6c4071354cc8 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -3632,24 +3632,21 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s12, 0x50 -; GCN-HSA-NEXT: s_movk_i32 s13, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s14, 0x70 -; GCN-HSA-NEXT: s_mov_b32 s15, 0xffff +; GCN-HSA-NEXT: s_mov_b32 s12, 0xffff ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s12 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s13 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s14 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 
0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 @@ -3657,12 +3654,12 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 -; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_add_u32 s8, s2, 48 +; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: s_addc_u32 s9, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 -; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] @@ -3679,8 +3676,8 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v1 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v1 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7 ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[0:1] @@ -3698,16 +3695,16 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s15, v3 -; GCN-HSA-NEXT: v_and_b32_e32 v24, s15, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s12, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v24, s12, v2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27] ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5 ; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v1, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v5 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 @@ -3717,62 +3714,62 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v7 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v7 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: s_waitcnt vmcnt(9) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v8 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v9 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v8 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v11 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v10 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v11 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v14 ; GCN-HSA-NEXT: 
v_lshrrev_b32_e32 v3, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v13 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v13 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v12 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v14 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v17 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s13 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v18 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v19 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v18 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v19 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v18 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s14 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; 
GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 @@ -3780,47 +3777,47 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s12 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v32 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s15, v33 -; GCN-HSA-NEXT: v_and_b32_e32 v12, s15, v32 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s12, v33 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s12, v32 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v21 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v20 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v21 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v20 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v23 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v22 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v23 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v22 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v34 -; GCN-HSA-NEXT: v_and_b32_e32 v10, s15, v35 -; GCN-HSA-NEXT: v_and_b32_e32 v8, s15, v34 +; GCN-HSA-NEXT: v_and_b32_e32 v10, s12, v35 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s12, v34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 
v5, 16, v28 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s15, v29 -; GCN-HSA-NEXT: v_and_b32_e32 v4, s15, v28 +; GCN-HSA-NEXT: v_and_b32_e32 v6, s12, v29 +; GCN-HSA-NEXT: v_and_b32_e32 v4, s12, v28 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 -; GCN-HSA-NEXT: v_and_b32_e32 v2, s15, v31 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s15, v30 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s12, v31 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s12, v30 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -4415,44 +4412,41 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32: ; GCN-HSA: ; %bb.0: ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-HSA-NEXT: s_movk_i32 s7, 0x70 -; GCN-HSA-NEXT: s_movk_i32 s8, 0x60 -; GCN-HSA-NEXT: s_movk_i32 s6, 0x50 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1] -; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x60 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: s_add_u32 s4, s2, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 64 +; GCN-HSA-NEXT: 
flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: s_add_u32 s10, s2, 32 -; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s2, 32 +; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v37, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v36, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(5) @@ -4505,37 +4499,37 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_bfe_i32 v10, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[7:10] ; GCN-HSA-NEXT: s_waitcnt vmcnt(10) ; 
GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[8:11] +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 @@ -4545,7 +4539,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 @@ -4554,7 +4548,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s7 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 @@ -4567,7 +4561,7 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace ; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16 ; 
GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 -; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_waitcnt vmcnt(11) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v33 @@ -6590,8 +6584,8 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -6604,8 +6598,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 @@ -6613,50 +6606,53 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; 
GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v4 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 -; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v4 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v4 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s6, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v3 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[7:10] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s6, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 -; GCN-HSA-NEXT: 
v_mov_b32_e32 v9, v14 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9] -; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[12:15] +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64: @@ -7523,70 +7519,74 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v15 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[5:8] +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v13 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v19 -; 
GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v17 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v9 -; GCN-HSA-NEXT: v_and_b32_e32 v5, s18, v9 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[5:8] -; GCN-HSA-NEXT: v_and_b32_e32 v11, s18, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[5:8] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v8, s18, v0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s18, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v15 -; GCN-HSA-NEXT: v_and_b32_e32 v7, s18, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; 
GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v17, s18, v9 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v20, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GCN-HSA-NEXT: v_and_b32_e32 v12, s18, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14] +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; GCN-HSA-NEXT: s_endpgm ; ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64: diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 40264342c295..6af32a90d852 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -41,7 +41,6 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add ; GCN-LABEL: {{^}}madak_2_use_f32: ; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 -; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}} ; GFX6-DAG: 
buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 @@ -54,7 +53,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add ; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] -; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] +; GFX10-FMA-DAG:v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VC]], 0x41200000 +; GFX940-FMA-DAG:v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] ; GCN: s_endpgm define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index deded84145b9..70c028759049 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -228,7 +228,9 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou ; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16: ; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 ; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c -; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]] +; SI-DAG: s_and_b32 [[A16:s[0-9]+]], [[A]], 0xffff +; SI-DAG: s_and_b32 [[B16:s[0-9]+]], [[B]], 0xffff +; SI: s_max_u32 [[MAX:s[0-9]+]], [[A16]], [[B16]] ; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]] ; SI: buffer_store_dword [[VMAX]] diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 0927be6a6b7a..def945fc3b4a 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -73,7 +73,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 ad ; FUNC-LABEL: {{^}}mul64_sext_c: ; EG-DAG: MULLO_INT ; EG-DAG: 
MULHI_INT -; SI-DAG: s_mul_i32 +; SI-DAG: s_mulk_i32 ; SI-DAG: v_mul_hi_i32 ; VI: v_mad_i64_i32 define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 9e3ef2434ee5..f625977b6854 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -12,11 +12,10 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 % ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 +; SI-NEXT: s_and_b32 s2, s4, 0xffffff +; SI-NEXT: s_and_b32 s4, s5, 0xffffff +; SI-NEXT: s_mul_i32 s4, s2, s4 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -26,12 +25,11 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 % ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffffff +; VI-NEXT: s_and_b32 s5, s5, 0xffffff ; VI-NEXT: s_mul_i32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -41,13 +39,12 @@ define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 % ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -405,13 +402,12 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm @@ -466,12 +462,11 @@ define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -494,12 +489,11 @@ define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 % ; SI-NEXT: s_load_dword s7, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s8, 0xffffff ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_and_b32 s4, s6, s8 +; SI-NEXT: 
s_and_b32 s4, s6, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s5, s7, s8 +; SI-NEXT: s_and_b32 s5, s7, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s7 ; SI-NEXT: s_mul_i32 s4, s4, s5 ; SI-NEXT: v_mul_hi_u32_u24_e32 v1, s6, v0 @@ -531,14 +525,13 @@ define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 % ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_mov_b32 s0, s4 -; GFX9-NEXT: s_mov_b32 s4, 0xffffff ; GFX9-NEXT: s_mov_b32 s1, s5 -; GFX9-NEXT: s_and_b32 s5, s6, s4 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s4, s7, s4 -; GFX9-NEXT: s_mul_hi_u32 s6, s5, s4 -; GFX9-NEXT: s_mul_i32 s5, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s6, s4, s5 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -638,12 +631,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, 0xffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s4, s4, s2 -; SI-NEXT: s_and_b32 s2, s5, s2 -; SI-NEXT: s_mul_i32 s4, s4, s2 -; SI-NEXT: s_lshr_b32 s4, s4, 16 +; SI-NEXT: s_and_b32 s2, s4, 0xffff +; SI-NEXT: s_and_b32 s4, s5, 0xffff +; SI-NEXT: s_mul_i32 s2, s2, s4 +; SI-NEXT: s_lshr_b32 s4, s2, 16 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -653,12 +645,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 
s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, s6 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_mul_i32 s4, s4, s5 ; VI-NEXT: s_lshr_b32 s4, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 @@ -669,12 +660,11 @@ define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm @@ -696,15 +686,14 @@ define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 % ; SI-NEXT: s_load_dword s0, s[0:1], 0xd ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_mov_b32 s1, 0xffffff ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s3, s2, s1 -; SI-NEXT: s_and_b32 s1, s0, s1 +; SI-NEXT: s_and_b32 s1, s2, 0xffffff +; SI-NEXT: s_and_b32 s3, s0, 0xffffff ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; SI-NEXT: s_mul_i32 s3, s3, s1 +; SI-NEXT: s_mul_i32 s1, s1, s3 ; SI-NEXT: v_and_b32_e32 v1, 1, v0 -; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -725,17 +714,16 @@ define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 % ; ; GFX9-LABEL: test_umul24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 
0xffffff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_i32 s2, s1, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_i32 s2, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s0 @@ -783,16 +771,15 @@ define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 ; ; GFX9-LABEL: test_umulhi24_i33: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x34 -; GFX9-NEXT: s_mov_b32 s0, 0xffffff +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s1, s2, s0 -; GFX9-NEXT: s_and_b32 s0, s3, s0 -; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff +; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 ; GFX9-NEXT: s_and_b32 s0, s0, 1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 118cfe515ec1..c2e615e97132 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -80,8 +80,8 @@ define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i3 ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, 
s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 10e8d96ed822..b66b9cc35516 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -565,8 +565,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) { } ; GCN-LABEL: {{^}}fneg_v2f32_scalar: -; GCN: s_brev_b32 [[SIGN:s[0-9]+]], 1 -; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGN]] +; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) { %fneg = fsub <2 x float> , %x store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index ca0e32fac5be..d958f5d9d97d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -1715,22 +1715,28 @@ define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, ; GFX8-NEXT: v_mov_b32_e32 v3, s39 ; GFX8-NEXT: v_add_u32_e32 v12, vcc, s38, v2 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x1000, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x1000 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x1800, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x1800 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x2000, v0 +; GFX8-NEXT: s_movk_i32 s0, 0x2000 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s0, v0 ; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 
0x2800, v12 +; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, s0, v12 ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[4:5] ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] ; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x3000, v12 +; GFX8-NEXT: s_movk_i32 s0, 0x3000 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, s0, v12 ; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v13, vcc +; GFX8-NEXT: s_movk_i32 s0, 0x3800 ; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[8:9] ; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0x3800, v12 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v12 ; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v13, vcc ; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] ; GFX8-NEXT: s_waitcnt vmcnt(4) @@ -1780,16 +1786,14 @@ define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1, ; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x1000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x2000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v0 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x2000, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc ; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off ; GFX9-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 ; GFX9-NEXT: global_load_dwordx2 v[10:11], v[4:5], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v12 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0x2000, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc -; GFX9-NEXT: s_movk_i32 s0, 0x3000 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v12 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v13, vcc ; GFX9-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:2048 ; GFX9-NEXT: global_load_dwordx2 v[14:15], v[4:5], off diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index 
20943ffa05f1..821576a550c1 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -16,11 +16,9 @@ define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ret void } -; FIXME: This should be folded with any number of uses. ; SI-LABEL: {{^}}s_addk_i32_k0_x2: -; SI: s_movk_i32 [[K:s[0-9]+]], 0x41 -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] -; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI: s_endpgm define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index 2c1aa053b39c..c5f8abc08f00 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -483,10 +483,9 @@ bb4: } ; GCN-LABEL: {{^}}phi_imm_in_sgprs -; GCN: s_movk_i32 [[A:s[0-9]+]], 0x400 ; GCN: s_movk_i32 [[B:s[0-9]+]], 0x400 ; GCN: [[LOOP_LABEL:.L[0-9a-zA-Z_]+]]: -; GCN: s_xor_b32 [[B]], [[B]], [[A]] +; GCN: s_xor_b32 [[B]], [[B]], 0x400 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll index d1903f457b30..4a44a89a016f 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -48,9 +48,8 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} -; This constant isn't folded, because it has multiple uses. 
; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 -; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] +; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index daa60acbd60f..6fec9f648034 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -21,79 +21,78 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s12, s3, 31 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: s_add_u32 s2, s2, s12 ; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: s_addc_u32 s3, s3, s12 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: 
v_mul_hi_u32 v8, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 +; GCN-NEXT: s_addc_u32 s3, s3, s12 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 ; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: 
v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 @@ -186,12 +185,11 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s12, s8, s18 -; GCN-IR-NEXT: s_mov_b32 s19, s15 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s15 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -201,8 +199,8 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 @@ -211,9 +209,9 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], 
s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 @@ -257,7 +255,6 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v8, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; GCN-NEXT: v_rcp_f32_e32 v5, v5 -; GCN-NEXT: v_mov_b32_e32 v14, 0 ; GCN-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GCN-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; GCN-NEXT: v_trunc_f32_e32 v6, v6 @@ -273,7 +270,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v11, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v5, v10 ; GCN-NEXT: v_mul_hi_u32 v13, v5, v9 -; GCN-NEXT: v_mul_hi_u32 v15, v6, v9 +; GCN-NEXT: v_mul_hi_u32 v14, v6, v9 ; GCN-NEXT: v_mul_lo_u32 v9, v6, v9 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 ; GCN-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc @@ -281,7 +278,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v11, vcc, v15, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 @@ -303,7 +300,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v8, v6, v8 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v10, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 @@ -324,7 +321,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: 
v_addc_u32_e32 v5, vcc, v9, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v14, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 @@ -398,12 +395,10 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 ; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v17 ; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 @@ -423,10 +418,10 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v0 ; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v17 +; GCN-IR-NEXT: v_not_b32_e32 v9, 0 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v16, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 @@ -1034,12 +1029,11 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[16:17], s[12:13], s16 -; GCN-IR-NEXT: s_add_u32 s20, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s21, s7, -1 +; GCN-IR-NEXT: s_add_u32 s19, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s20, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[14:15] ; GCN-IR-NEXT: s_add_u32 s12, s8, s18 -; GCN-IR-NEXT: s_mov_b32 s19, s15 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s15 +; GCN-IR-NEXT: s_addc_u32 
s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1049,8 +1043,8 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 ; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[14:15], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s8, s20, s16 -; GCN-IR-NEXT: s_subb_u32 s8, s21, s17 +; GCN-IR-NEXT: s_sub_u32 s8, s19, s16 +; GCN-IR-NEXT: s_subb_u32 s8, s20, s17 ; GCN-IR-NEXT: s_ashr_i32 s14, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 @@ -1059,9 +1053,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s15 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[12:13], 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[22:23] ; GCN-IR-NEXT: s_cbranch_vccz .LBB9_3 ; GCN-IR-NEXT: .LBB9_4: ; %Flow3 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[10:11], 1 @@ -1111,58 +1105,57 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; 
GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, 
v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1310,7 +1303,6 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1326,7 +1318,7 @@ define i64 
@v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1334,7 +1326,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1356,7 +1348,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1417,25 +1409,24 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[5:6] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 24, 
0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], 24, v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1445,10 +1436,10 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1481,12 +1472,12 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: 
v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1509,7 +1500,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 -; GCN-NEXT: v_mov_b32_e32 v12, 0 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1525,7 +1515,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v9, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v3, v7 -; GCN-NEXT: v_mul_hi_u32 v13, v4, v7 +; GCN-NEXT: v_mul_hi_u32 v12, v4, v7 ; GCN-NEXT: v_mul_lo_u32 v7, v4, v7 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc @@ -1533,7 +1523,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 @@ -1555,7 +1545,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v6, v4, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v12, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 @@ -1612,27 +1602,26 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: 
v_add_i32_e32 v4, vcc, s6, v8 -; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, s6, v8 +; GCN-IR-NEXT: v_addc_u32_e64 v6, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v6, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v7, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v3, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v5, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_add_i32_e32 v9, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], s[8:9], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1643,10 +1632,10 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v9 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 -; 
GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -1679,12 +1668,12 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 -; GCN-IR-NEXT: v_or_b32_e32 v7, v7, v1 -; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v0 +; GCN-IR-NEXT: v_or_b32_e32 v4, v7, v1 +; GCN-IR-NEXT: v_or_b32_e32 v7, v6, v0 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v6, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v7, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v0, v7, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index aa97bb0b76bd..ada9fe811185 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -516,13 +516,15 @@ store_label: ; NOSDWA-NOT: v_or_b32_sdwa ; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; ; VI-DAG: v_and_b32_sdwa 
v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9_10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX89-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; ; GFX10-DAG: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll index 02535610e027..73d72f44a1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -156,12 +156,12 @@ define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, ; FUNC-LABEL: {{^}}cmp_zext_k_i8max: ; SI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb ; VI: s_load_dword [[VALUE:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K255:s[0-9]+]], 0xff -; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], [[K255]] -; SI: s_cmp_lg_u32 [[B]], [[K255]] +; SI-DAG: s_and_b32 [[B:s[0-9]+]], [[VALUE]], 0xff +; SI: s_cmpk_lg_i32 [[B]], 0xff ; SI: s_cselect_b64 [[CC:[^,]+]], -1, 0 -; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], [[K255]] +; VI: v_mov_b32_e32 [[VK255:v[0-9]+]], 0xff +; VI: s_movk_i32 [[K255:s[0-9]+]], 0xff ; VI: v_and_b32_e32 [[B:v[0-9]+]], [[VALUE]], [[VK255]] ; VI: v_cmp_ne_u16_e32 vcc, [[K255]], [[B]] @@ -208,9 +208,8 @@ define void @v_cmp_sext_k_neg1_i8_sext_arg(i8 signext %b) nounwind { ; FUNC-LABEL: {{^}}cmp_sext_k_neg1_i8_arg: ; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb ; VI: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c -; GCN: s_movk_i32 [[K:s[0-9]+]], 0xff -; 
GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], [[K]] -; GCN: s_cmp_lg_u32 [[B]], [[K]]{{$}} +; GCN-DAG: s_and_b32 [[B:s[0-9]+]], [[VAL]], 0xff +; GCN: s_cmpk_lg_i32 [[B]], 0xff{{$}} ; GCN: s_cselect_b64 [[CC:[^,]+]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]] ; GCN: buffer_store_byte [[RESULT]] diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 78ad2625a673..f914c9645df9 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -149,9 +149,9 @@ define i128 @v_lshr_i128_kv(i128 %rhs) { ; GCN-NEXT: s_mov_b64 s[4:5], 0x41 ; GCN-NEXT: v_lshr_b64 v[1:2], s[4:5], v0 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc, 64, v0 -; GCN-NEXT: v_mov_b32_e32 v3, s4 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v3, 0x41 ; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v3, v1, s[4:5] ; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index 08b045e4a623..fb15854e1fb9 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -21,17 +21,16 @@ define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s6, 0xffff ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s7, s4, s6 +; VI-NEXT: s_and_b32 s6, s4, 0xffff ; VI-NEXT: s_lshr_b32 s4, s4, 16 -; VI-NEXT: s_lshr_b32 s8, s5, 16 -; VI-NEXT: s_lshl_b32 s4, s4, s8 -; VI-NEXT: s_lshl_b32 s5, s7, s5 +; VI-NEXT: s_lshr_b32 s7, s5, 16 +; VI-NEXT: s_lshl_b32 s4, s4, s7 +; VI-NEXT: s_lshl_b32 s5, s6, s5 ; VI-NEXT: s_lshl_b32 s4, s4, 16 -; VI-NEXT: s_and_b32 s5, s5, s6 +; VI-NEXT: s_and_b32 s5, s5, 0xffff ; VI-NEXT: s_or_b32 s4, s5, 
s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 5326836b7912..edbc1fcd5842 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1822,18 +1822,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc400, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1895,18 +1894,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4400 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4400 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4400, v3 ; 
VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1968,18 +1966,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)* ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0x4000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0x4000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0x4000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2041,18 +2038,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1 ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: v_mov_b32_e32 v4, 0xffffc000 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: s_movk_i32 s2, 0xc000 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u16_e32 v2, s2, v3 +; VI-NEXT: v_add_u16_e32 v2, 0xc000, v3 ; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-NEXT: flat_store_dword v[0:1], v2 diff 
--git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 29c20c69c94f..30f021d87e53 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -31,14 +31,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr10 ; CHECK-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr8 ; CHECK-NEXT: undef %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %56, 232, 0 :: (load (s64) from %ir.40, addrspace 4) - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 ; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY2]], 4, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_1:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_1]], 31, implicit-def dead $scc - ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc ; CHECK-NEXT: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc + ; CHECK-NEXT: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0 :: (load (s128) from %ir.84, addrspace 4) @@ -195,7 +194,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM 
%335, 0, 0 :: (load (s128) from %ir.230, addrspace 4) ; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %343, 0, 0 :: (load (s128) from %ir.236, addrspace 4) - ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY14]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]].sub0 ; CHECK-NEXT: [[COPY14]].sub1:sgpr_128 = COPY [[S_AND_B32_]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY14]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -211,7 +210,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM %468, 0, 0 :: (load (s64) from %ir.320, addrspace 4) ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY %71 - ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY15]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0 ; CHECK-NEXT: [[COPY15]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32)) @@ -231,7 +230,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from custom "BufferResource", align 1, addrspace 4) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] - ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY %71 ; CHECK-NEXT: [[COPY16]].sub1:sgpr_128 = COPY [[S_AND_B32_2]] ; CHECK-NEXT: [[COPY16]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM2]] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index b227e29b2e38..1b486681037c 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -17,72 +17,71 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: 
v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 
v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: 
v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -161,9 +160,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s6, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -234,7 +232,6 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -250,7 +247,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -258,7 +255,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; 
GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -280,7 +277,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -301,7 +298,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v5, v1, v5 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v7, vcc, v10, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -354,101 +351,99 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v2, v2, v6 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v3, v6, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[5:6] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 +; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v2 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 +; GCN-IR-NEXT: v_min_u32_e32 v10, v6, v7 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v0 +; GCN-IR-NEXT: v_add_i32_e64 v6, s[6:7], 32, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v1 +; GCN-IR-NEXT: v_min_u32_e32 v11, v6, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v7, s[6:7], v10, v11 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v5 -; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 32, v3 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v6 -; GCN-IR-NEXT: v_min_u32_e32 v3, v3, 
v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v0 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v1 -; GCN-IR-NEXT: v_min_u32_e32 v12, v7, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v3, v12 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v2, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v11 -; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[6:7] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v0, 0, s[6:7] +; GCN-IR-NEXT: v_subb_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[7:8] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[7:8] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v1, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[0:1], v7 +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v7 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v8, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 63, v7 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[0:1], v6 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 
s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v6, vcc -; GCN-IR-NEXT: v_not_b32_e32 v3, v3 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, v11 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_add_i32_e32 v16, vcc, -1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_not_b32_e32 v9, v10 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 +; GCN-IR-NEXT: v_not_b32_e32 v8, 0 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, v9, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v15, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v3, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v3, v14, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v3 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v11 -; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v6 -; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[11:12] -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 -; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; 
GCN-IR-NEXT: v_lshrrev_b32_e32 v8, 31, v7 +; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v8 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v16, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v8, vcc, v17, v13, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v14, v6 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v14, 31, v8 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v10 +; GCN-IR-NEXT: v_or_b32_e32 v7, v15, v7 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v14 +; GCN-IR-NEXT: v_and_b32_e32 v15, v14, v3 +; GCN-IR-NEXT: v_and_b32_e32 v14, v14, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v11, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v14 +; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v15, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v15, v9 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v14, v8 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v8 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v7 +; GCN-IR-NEXT: v_or_b32_e32 v6, v8, v6 ; GCN-IR-NEXT: .LBB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v5, v10 -; GCN-IR-NEXT: v_mul_hi_u32 v7, v5, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v6, v6, v9 -; GCN-IR-NEXT: v_mul_lo_u32 v5, v5, v9 +; GCN-IR-NEXT: v_mul_lo_u32 v7, v2, v9 +; GCN-IR-NEXT: v_mul_hi_u32 v8, v2, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v6 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v2, v6 +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; 
GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; GCN-IR-NEXT: v_xor_b32_e32 v0, v0, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v5 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %x, %y ret i64 %result @@ -862,29 +857,28 @@ define amdgpu_kernel void @s_test_srem32_64(i64 addrspace(1)* %out, i64 %x, i64 define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 %y) { ; GCN-LABEL: s_test_srem33_64: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GCN-NEXT: v_mov_b32_e32 v6, 0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[2:3], s[10:11], 31 -; GCN-NEXT: s_ashr_i64 s[4:5], s[0:1], 31 +; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 +; GCN-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 ; GCN-NEXT: s_ashr_i32 s0, s1, 31 -; GCN-NEXT: s_add_u32 s4, s4, s0 +; GCN-NEXT: s_add_u32 s8, s8, s0 ; GCN-NEXT: s_mov_b32 s1, s0 -; GCN-NEXT: s_addc_u32 s5, s5, s0 -; GCN-NEXT: s_xor_b64 s[12:13], s[4:5], s[0:1] +; GCN-NEXT: s_addc_u32 s9, s9, s0 +; GCN-NEXT: s_xor_b64 s[12:13], s[8:9], s[0:1] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GCN-NEXT: s_sub_u32 s0, 0, s12 ; GCN-NEXT: s_subb_u32 s1, 0, s13 -; GCN-NEXT: s_ashr_i32 s10, s11, 31 +; GCN-NEXT: s_ashr_i32 s6, s7, 31 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s11, s10 -; GCN-NEXT: s_mov_b32 s4, s8 -; GCN-NEXT: s_mov_b32 s5, s9 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_mov_b32 s8, s4 +; GCN-NEXT: s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 
0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -899,16 +893,16 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -919,25 +913,25 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc +; GCN-NEXT: v_add_i32_e32 
v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s0, s2, s10 +; GCN-NEXT: s_add_u32 s0, s2, s6 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s1, s3, s10 +; GCN-NEXT: s_addc_u32 s1, s3, s6 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[10:11] +; GCN-NEXT: s_xor_b64 s[14:15], s[0:1], s[6:7] ; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 ; GCN-NEXT: v_mul_hi_u32 v4, s14, v1 @@ -949,7 +943,7 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_mul_hi_u32 v0, s15, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 @@ -987,12 +981,12 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s6, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s6 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s6, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm ; ; GCN-IR-LABEL: s_test_srem33_64: @@ -1047,9 +1041,8 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64 ; GCN-IR-NEXT: s_add_u32 
s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 ; GCN-IR-NEXT: s_add_u32 s12, s6, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s7, s13 +; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while @@ -1205,9 +1198,8 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_add_u32 s18, s6, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 ; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] -; GCN-IR-NEXT: s_mov_b32 s17, s13 ; GCN-IR-NEXT: s_add_u32 s12, s8, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s9, s13 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB9_3: ; %udiv-do-while @@ -1288,58 +1280,57 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s3, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: 
v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; 
GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1483,7 +1474,6 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1499,7 +1489,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1507,7 +1497,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: 
v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1529,7 +1519,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1585,24 +1575,23 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[4:5] ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 24, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 24, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v3 -; GCN-IR-NEXT: v_addc_u32_e32 v9, 
vcc, 0, v4, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], 24, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1612,10 +1601,10 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1647,17 +1636,17 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB11_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v6 ; GCN-IR-NEXT: .LBB11_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v3, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v2 -; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v4, v0, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v3 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v3 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-IR-NEXT: 
v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] @@ -1680,7 +1669,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1696,7 +1684,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc @@ -1704,7 +1692,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1726,7 +1714,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1778,26 +1766,25 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 ; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v6 -; 
GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, s6, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1808,10 +1795,10 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, 
s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1843,15 +1830,15 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB12_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB12_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll index bece8aab1e3d..940071149e13 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -186,13 +186,12 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_add_f16_e32 v5, v0, v2 -; GFX10-NEXT: v_add_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_add_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_add_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_add_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: 
v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 4326ecd53462..3831190a538d 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -132,14 +132,13 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v0 ; GFX10-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: v_fmac_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_fmac_f16_e32 v6, v8, v7 ; GFX10-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 -; GFX10-NEXT: v_and_b32_e32 v2, v0, v5 -; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v5 +; GFX10-NEXT: v_lshl_or_b32 v0, v9, 16, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v6, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll index 00aa96fea128..3dcdc69704f3 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -186,13 +186,12 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_mul_f16_e32 
v5, v0, v2 -; GFX10-NEXT: v_mul_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_mul_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_mul_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 802c08fe64a2..3d4a879d9ee3 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -206,13 +206,12 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff -; GFX10-NEXT: v_sub_f16_e32 v5, v0, v2 -; GFX10-NEXT: v_sub_f16_e32 v6, v1, v3 +; GFX10-NEXT: v_sub_f16_e32 v4, v0, v2 +; GFX10-NEXT: v_sub_f16_e32 v5, v1, v3 ; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX10-NEXT: v_and_b32_e32 v2, v4, v5 -; GFX10-NEXT: v_and_b32_e32 v3, v4, v6 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v4 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v5 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 4c5e61ca626c..3c74feb7ea44 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -626,7 +626,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1) ; GFX10-NEXT: s_mov_b32 s6, -1 ; GFX10-NEXT: v_pk_sub_i16 v2, v1, v2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_and_b32_e32 v0, v3, v2 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX10-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll index 54dfc8a7f353..b4ee382704a7 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll @@ -33,11 +33,10 @@ define i8 @v_uaddsat_i8(i8 %lhs, i8 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-NEXT: v_min_u16 v0, v0, s4 +; GFX10-NEXT: v_min_u16 v0, 0xff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.uadd.sat.i8(i8 %lhs, i8 %rhs) ret i8 %result diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index 2a3100979883..79c7c8b3b6d8 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -500,7 +500,6 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ad ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7] ; GFX1030-NEXT: s_waitcnt vmcnt(0) @@ -510,8 +509,8 @@ define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x 
i32> ad ; GFX1030-NEXT: v_sub_nc_u32_e32 v8, 0, v3 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GFX1030-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX1030-NEXT: v_mul_f32_e32 v5, s0, v5 -; GFX1030-NEXT: v_mul_f32_e32 v6, s0, v6 +; GFX1030-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 +; GFX1030-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v5, v5 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX1030-NEXT: v_mul_lo_u32 v7, v7, v5 @@ -893,7 +892,6 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad ; GFX1030: ; %bb.0: ; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 ; GFX1030-NEXT: v_mov_b32_e32 v8, 0 -; GFX1030-NEXT: s_mov_b32 s0, 0x4f7ffffe ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_clause 0x1 ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16 @@ -911,10 +909,10 @@ define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad ; GFX1030-NEXT: v_sub_nc_u32_e32 v14, 0, v1 ; GFX1030-NEXT: v_sub_nc_u32_e32 v15, 0, v2 ; GFX1030-NEXT: v_sub_nc_u32_e32 v16, 0, v3 -; GFX1030-NEXT: v_mul_f32_e32 v9, s0, v9 -; GFX1030-NEXT: v_mul_f32_e32 v10, s0, v10 -; GFX1030-NEXT: v_mul_f32_e32 v11, s0, v11 -; GFX1030-NEXT: v_mul_f32_e32 v12, s0, v12 +; GFX1030-NEXT: v_mul_f32_e32 v9, 0x4f7ffffe, v9 +; GFX1030-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 +; GFX1030-NEXT: v_mul_f32_e32 v11, 0x4f7ffffe, v11 +; GFX1030-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v9, v9 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v11, v11 @@ -2154,16 +2152,15 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocaptu ; GFX1030-NEXT: v_mov_b32_e32 v4, 0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: global_load_dwordx4 v[0:3], v4, s[0:1] -; GFX1030-NEXT: s_mov_b32 s0, 0x1389c755 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 2, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 2, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 
v2, 2, v2 ; GFX1030-NEXT: v_lshrrev_b32_e32 v3, 2, v3 -; GFX1030-NEXT: v_mul_hi_u32 v0, v0, s0 -; GFX1030-NEXT: v_mul_hi_u32 v1, v1, s0 -; GFX1030-NEXT: v_mul_hi_u32 v2, v2, s0 -; GFX1030-NEXT: v_mul_hi_u32 v3, v3, s0 +; GFX1030-NEXT: v_mul_hi_u32 v0, 0x1389c755, v0 +; GFX1030-NEXT: v_mul_hi_u32 v1, 0x1389c755, v1 +; GFX1030-NEXT: v_mul_hi_u32 v2, 0x1389c755, v2 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0x1389c755, v3 ; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 10, v0 ; GFX1030-NEXT: v_lshrrev_b32_e32 v1, 10, v1 ; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 10, v2 @@ -2499,7 +2496,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; SI-NEXT: v_rcp_f32_e32 v2, v2 ; SI-NEXT: s_mov_b32 s4, 0xfffe7960 -; SI-NEXT: v_mov_b32_e32 v8, 0 ; SI-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; SI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; SI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2513,16 +2509,16 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; SI-NEXT: v_mul_hi_u32 v7, v2, v5 ; SI-NEXT: v_mul_lo_u32 v6, v2, v4 -; SI-NEXT: v_mul_hi_u32 v9, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v8, v2, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 ; SI-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; SI-NEXT: v_mul_lo_u32 v9, v3, v5 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; SI-NEXT: v_mul_lo_u32 v8, v3, v5 ; SI-NEXT: v_mul_hi_u32 v5, v3, v5 -; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2535,16 +2531,16 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; SI-NEXT: v_mul_lo_u32 v5, v2, v4 ; 
SI-NEXT: v_mul_hi_u32 v7, v2, v6 -; SI-NEXT: v_mul_hi_u32 v9, v2, v4 -; SI-NEXT: v_mul_hi_u32 v10, v3, v4 +; SI-NEXT: v_mul_hi_u32 v8, v2, v4 +; SI-NEXT: v_mul_hi_u32 v9, v3, v4 ; SI-NEXT: v_mul_lo_u32 v4, v3, v4 ; SI-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; SI-NEXT: v_mul_lo_u32 v9, v3, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; SI-NEXT: v_mul_lo_u32 v8, v3, v6 ; SI-NEXT: v_mul_hi_u32 v6, v3, v6 -; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; SI-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2560,18 +2556,18 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; SI-NEXT: v_mul_hi_u32 v2, v1, v2 ; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; SI-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; SI-NEXT: v_addc_u32_e32 v4, vcc, v7, v8, vcc +; SI-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; SI-NEXT: v_mul_lo_u32 v4, v3, s4 ; SI-NEXT: v_mul_hi_u32 v5, v2, s4 ; SI-NEXT: v_mul_lo_u32 v6, v2, s4 +; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; SI-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc -; SI-NEXT: v_subrev_i32_e32 v4, vcc, s4, v0 +; SI-NEXT: v_subrev_i32_e32 v4, vcc, 0x186a0, v0 ; SI-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc -; SI-NEXT: s_mov_b32 s4, 0x1869f ; SI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 ; SI-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2599,7 +2595,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: s_mov_b32 s6, 0xfffe7960 -; VI-NEXT: v_mov_b32_e32 v9, 0 ; VI-NEXT: v_mul_f32_e32 
v2, 0x5f7ffffc, v2 ; VI-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; VI-NEXT: v_trunc_f32_e32 v3, v3 @@ -2612,31 +2607,30 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; VI-NEXT: v_mul_hi_u32 v5, v6, v2 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 -; VI-NEXT: v_add_u32_e32 v10, vcc, v5, v3 +; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 -; VI-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v11, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; VI-NEXT: v_mul_lo_u32 v4, v7, s6 -; VI-NEXT: s_mov_b32 s6, 0x186a0 ; VI-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; VI-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; VI-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; VI-NEXT: v_mul_hi_u32 v8, v6, v2 ; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; VI-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2649,16 +2643,17 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; 
VI-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; VI-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; VI-NEXT: v_mul_lo_u32 v6, v5, s6 -; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; VI-NEXT: s_mov_b32 s4, 0x186a0 +; VI-NEXT: v_mul_lo_u32 v6, v5, s4 +; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; VI-NEXT: s_mov_b32 s4, 0x1869f ; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; VI-NEXT: v_subrev_u32_e32 v2, vcc, 0x186a0, v0 ; VI-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; VI-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; VI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2687,7 +2682,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x47c35000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_mov_b32 s6, 0xfffe7960 -; GCN-NEXT: v_mov_b32_e32 v9, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -2700,31 +2694,30 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_add_u32_e32 v8, vcc, v4, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v6, v2 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v8, 0 -; GCN-NEXT: v_add_u32_e32 v10, vcc, v5, v3 +; GCN-NEXT: v_add_u32_e32 v9, vcc, v5, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v8, 0 -; GCN-NEXT: v_add_u32_e32 v2, vcc, v10, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v11, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_add_u32_e32 v2, vcc, v9, v2 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc +; 
GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v6, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, s6, 0 ; GCN-NEXT: v_mul_lo_u32 v4, v7, s6 -; GCN-NEXT: s_mov_b32 s6, 0x186a0 ; GCN-NEXT: v_subrev_u32_e32 v3, vcc, v6, v3 ; GCN-NEXT: v_add_u32_e32 v5, vcc, v3, v4 ; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0 ; GCN-NEXT: v_mul_hi_u32 v8, v6, v2 ; GCN-NEXT: v_add_u32_e32 v8, vcc, v8, v3 ; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v2, 0 -; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v4, vcc ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v10, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v6, v2 @@ -2737,16 +2730,17 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GCN-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v5, 0 ; GCN-NEXT: v_add_u32_e32 v2, vcc, v6, v2 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v9, vcc +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_add_u32_e32 v4, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v5, s6 -; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s6, 0 +; GCN-NEXT: s_mov_b32 s4, 0x186a0 +; GCN-NEXT: v_mul_lo_u32 v6, v5, s4 +; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, s4, 0 ; GCN-NEXT: s_mov_b32 s4, 0x1869f ; GCN-NEXT: v_add_u32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_subrev_u32_e32 v2, vcc, s6, v0 +; GCN-NEXT: v_subrev_u32_e32 v2, 
vcc, 0x186a0, v0 ; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc ; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 ; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc @@ -2773,15 +2767,14 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1030-NEXT: s_mov_b32 s4, 0x346d900 -; GFX1030-NEXT: s_mov_b32 s5, 0xfffe7960 ; GFX1030-NEXT: s_add_u32 s4, 0x4237, s4 -; GFX1030-NEXT: s_addc_u32 s6, 0, 0 +; GFX1030-NEXT: s_addc_u32 s5, 0, 0 ; GFX1030-NEXT: v_add_co_u32 v2, s4, 0xa9000000, s4 ; GFX1030-NEXT: s_cmpk_lg_u32 s4, 0x0 -; GFX1030-NEXT: s_addc_u32 s4, s6, 0xa7c5 -; GFX1030-NEXT: v_mul_hi_u32 v3, v2, s5 -; GFX1030-NEXT: v_mul_lo_u32 v4, v2, s5 -; GFX1030-NEXT: s_mul_i32 s5, s4, s5 +; GFX1030-NEXT: s_addc_u32 s4, s5, 0xa7c5 +; GFX1030-NEXT: v_mul_hi_u32 v3, 0xfffe7960, v2 +; GFX1030-NEXT: v_mul_lo_u32 v4, 0xfffe7960, v2 +; GFX1030-NEXT: s_mul_i32 s5, s4, 0xfffe7960 ; GFX1030-NEXT: v_sub_nc_u32_e32 v3, v3, v2 ; GFX1030-NEXT: v_mul_hi_u32 v5, v2, v4 ; GFX1030-NEXT: v_mul_hi_u32 v8, s4, v4 @@ -2804,7 +2797,6 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GFX1030-NEXT: v_mad_u64_u32 v[4:5], s4, v1, v5, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, v0, v6, 0 ; GFX1030-NEXT: v_mad_u64_u32 v[6:7], s4, v1, v6, 0 -; GFX1030-NEXT: s_mov_b32 s4, 0x186a0 ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v8, v2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -2812,20 +2804,19 @@ define i64 @v_test_udiv64_mulhi_fold(i64 %arg) { ; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v4, vcc_lo, v2, v6 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo -; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s5, v4, s4, 0 -; GFX1030-NEXT: v_mul_lo_u32 v6, v5, s4 +; GFX1030-NEXT: v_mad_u64_u32 v[2:3], s4, 0x186a0, v4, 0 +; GFX1030-NEXT: v_mul_lo_u32 v6, 0x186a0, v5 ; GFX1030-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 
; GFX1030-NEXT: v_add_nc_u32_e32 v3, v3, v6 ; GFX1030-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo -; GFX1030-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s4 -; GFX1030-NEXT: s_mov_b32 s4, 0x1869f +; GFX1030-NEXT: v_subrev_co_u32 v2, vcc_lo, 0x186a0, v0 ; GFX1030-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v2 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v2 +; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 ; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v6, vcc_lo, v4, 2 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo -; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, s4, v0 -; GFX1030-NEXT: v_cmp_eq_u32_e64 s4, 0, v1 +; GFX1030-NEXT: v_cmp_lt_u32_e32 vcc_lo, 0x1869f, v0 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo ; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1030-NEXT: v_cndmask_b32_e64 v0, -1, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 20fa20a3d073..3ed36b82ced7 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -16,73 +16,72 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: s_subb_u32 s5, 0, s9 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: 
v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s3, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; 
GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, 
v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 ; GCN-NEXT: v_mul_lo_u32 v4, s9, v0 @@ -159,12 +158,11 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 -; GCN-IR-NEXT: s_add_u32 s16, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_add_u32 s15, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s2, s2, s14 -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_addc_u32 s3, s3, s11 +; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -174,8 +172,8 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 +; GCN-IR-NEXT: s_sub_u32 s6, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s16, s13 ; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_and_b32 s6, s10, 1 @@ -184,9 +182,9 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-IR-NEXT: 
s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 @@ -219,7 +217,6 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -235,7 +232,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -243,7 +240,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -265,7 +262,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; 
GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -281,7 +278,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v2, v5 @@ -323,35 +320,33 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 v6, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[6:7] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 
v4, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v6 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v6 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v6 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -359,25 +354,25 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v2 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v0, v8 -; GCN-IR-NEXT: v_not_b32_e32 v1, v9 -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GCN-IR-NEXT: v_not_b32_e32 v1, 0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 
v14, v10 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v12, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v13, v11, vcc ; GCN-IR-NEXT: v_or_b32_e32 v4, v8, v4 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v6 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 1, v0 @@ -387,8 +382,8 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v9, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5] ; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 @@ -679,52 +674,50 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s2, s[0:1], 0xe ; GCN-NEXT: s_load_dword s4, s[0:1], 0xd -; GCN-NEXT: s_mov_b32 s6, 0xffff -; GCN-NEXT: s_mov_b32 s7, 0xff000000 -; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s6 +; GCN-NEXT: s_load_dword s6, s[0:1], 0xc +; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, 0xffff +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s2, s6 -; GCN-NEXT: s_and_b32 s2, s4, s7 +; GCN-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-NEXT: s_and_b32 s2, s4, 0xff000000 ; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 -; GCN-NEXT: s_load_dword s8, s[0:1], 0xb -; GCN-NEXT: s_load_dword s9, s[0:1], 0xc ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 +; GCN-NEXT: s_load_dword s0, s[0:1], 0xb +; GCN-NEXT: s_and_b32 s8, s6, 0xffff +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2 ; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s6, s9, s6 -; 
GCN-NEXT: s_and_b32 s8, s8, s7 +; GCN-NEXT: s_and_b32 s9, s0, 0xff000000 +; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 24 ; GCN-NEXT: s_sub_u32 s0, 0, s0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_subb_u32 s1, 0, s1 -; GCN-NEXT: v_mov_b32_e32 v8, 0 -; GCN-NEXT: v_mul_hi_u32 v3, s0, v1 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 +; GCN-NEXT: v_mul_lo_u32 v6, s0, v1 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v6 ; GCN-NEXT: v_mul_hi_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v3 ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v8, vcc +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 +; GCN-NEXT: v_mul_hi_u32 v6, v2, v6 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -736,32 +729,32 @@ define amdgpu_kernel 
void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GCN-NEXT: v_mul_lo_u32 v7, v1, v3 -; GCN-NEXT: v_mul_hi_u32 v9, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc +; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 -; GCN-NEXT: v_mov_b32_e32 v3, s8 +; GCN-NEXT: v_mov_b32_e32 v3, s9 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_alignbit_b32 v3, s6, v3, 24 +; GCN-NEXT: v_alignbit_b32 v3, s8, v3, 24 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v2 ; GCN-NEXT: v_mul_hi_u32 v1, v3, v1 ; GCN-NEXT: v_mul_hi_u32 v2, v3, v2 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 @@ -797,26 +790,24 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-LABEL: s_test_udiv24_i48: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dword s2, 
s[0:1], 0xc -; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb +; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe ; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd -; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xe -; GCN-IR-NEXT: s_mov_b32 s8, 0xffff +; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_and_b32 s1, s2, s8 -; GCN-IR-NEXT: s_mov_b32 s2, 0xff000000 -; GCN-IR-NEXT: s_and_b32 s0, s3, s2 -; GCN-IR-NEXT: s_and_b32 s3, s7, s8 -; GCN-IR-NEXT: s_and_b32 s2, s6, s2 -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[0:1], 24 -; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 +; GCN-IR-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-IR-NEXT: s_and_b32 s2, s4, 0xff000000 +; GCN-IR-NEXT: s_and_b32 s5, s5, 0xffff +; GCN-IR-NEXT: s_and_b32 s4, s6, 0xff000000 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], 24 +; GCN-IR-NEXT: s_lshr_b64 s[4:5], s[4:5], 24 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[4:5], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[0:1], 0 +; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 ; GCN-IR-NEXT: s_or_b64 s[12:13], s[6:7], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b32 s6, s2 +; GCN-IR-NEXT: s_flbit_i32_b32 s6, s4 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s7, s3 +; GCN-IR-NEXT: s_flbit_i32_b32 s7, s5 ; GCN-IR-NEXT: s_min_u32 s10, s6, s7 ; GCN-IR-NEXT: s_flbit_i32_b32 s6, s8 ; GCN-IR-NEXT: s_add_i32 s6, s6, 32 @@ -842,40 +833,39 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[8:9], s12 -; GCN-IR-NEXT: s_add_u32 s16, s2, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s3, -1 -; GCN-IR-NEXT: s_not_b64 s[0:1], s[10:11] -; GCN-IR-NEXT: s_add_u32 s8, s0, s14 -; GCN-IR-NEXT: s_mov_b32 s15, s11 -; GCN-IR-NEXT: s_addc_u32 s9, s1, s11 
+; GCN-IR-NEXT: s_add_u32 s15, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s16, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] +; GCN-IR-NEXT: s_add_u32 s8, s2, s14 +; GCN-IR-NEXT: s_addc_u32 s9, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s1, 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0 ; GCN-IR-NEXT: .LBB7_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s0, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[0:1] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[10:11], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s0, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s0, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s0, 31 +; GCN-IR-NEXT: s_sub_u32 s2, s15, s12 +; GCN-IR-NEXT: s_subb_u32 s2, s16, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s2, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s0, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[2:3] +; GCN-IR-NEXT: s_and_b32 s2, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1] -; GCN-IR-NEXT: s_and_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[8:9], 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] +; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_3 ; GCN-IR-NEXT: .LBB7_4: ; %Flow3 -; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s1 +; GCN-IR-NEXT: s_lshl_b64 s[4:5], s[6:7], 1 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; 
GCN-IR-NEXT: s_branch .LBB7_6 ; GCN-IR-NEXT: .LBB7_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s9 @@ -883,10 +873,10 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[12:13] ; GCN-IR-NEXT: .LBB7_6: ; %udiv-end -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 -; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4 +; GCN-IR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = lshr i48 %x, 24 %2 = lshr i48 %y, 24 @@ -908,58 +898,57 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: s_subb_u32 s5, 0, s3 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s5, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, 
v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, 
vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -1092,7 +1081,6 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1105,18 +1093,18 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v9 +; GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: 
v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1138,7 +1126,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1189,22 +1177,21 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_addc_u32_e64 v5, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v2, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v3, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; 
GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v5, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v4 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1215,10 +1202,10 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1251,12 +1238,12 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: .LBB9_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v3, v5, v1 -; GCN-IR-NEXT: v_or_b32_e32 v2, v4, v0 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v0 ; GCN-IR-NEXT: .LBB9_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, v3 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v2 ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = udiv i64 32768, %x ret i64 %result @@ -1355,8 +1342,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: 
v_trunc_f32_e32 v1, v1 @@ -1365,25 +1352,24 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 ; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 +; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1397,7 +1383,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; 
GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc @@ -1405,7 +1391,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 @@ -1421,7 +1407,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, 24 @@ -1536,7 +1522,6 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_madak_f32 v2, 0, v2, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1550,16 +1535,16 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 ; GCN-NEXT: v_mul_lo_u32 v6, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v5 ; GCN-NEXT: v_mul_hi_u32 v5, v3, v5 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; 
GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1571,16 +1556,16 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v2, v4 ; GCN-NEXT: v_mul_hi_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v2, v4 +; GCN-NEXT: v_mul_hi_u32 v9, v3, v4 ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v9, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 ; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1596,7 +1581,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v8, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 17cf533d131f..6b629ca9582f 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -346,9 +346,8 @@ define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addr } ; 
FUNC-LABEL: {{^}}test_udiv24_u16_u23_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { @@ -365,9 +364,8 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 a } ; FUNC-LABEL: {{^}}test_udiv24_u23_u16_i32: -; SI-DAG: v_rcp_iflag_f32 -; SI-DAG: s_mov_b32 [[MASK:s[0-9]+]], 0x7fffff{{$}} -; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], +; SI: v_rcp_iflag_f32 +; SI: v_and_b32_e32 v{{[0-9]+}}, 0x7fffff, ; EG: RECIP_IEEE define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index ef6e67f64a26..35079f42ac78 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -17,72 +17,71 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s4, s8 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_mov_b32 s5, s9 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, 
v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s10, v2 -; GCN-NEXT: v_mul_hi_u32 v6, s11, v2 -; GCN-NEXT: v_mul_lo_u32 v2, s11, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: 
v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 ; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 @@ -161,9 +160,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 % ; GCN-IR-NEXT: s_add_u32 s16, s4, -1 ; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] -; GCN-IR-NEXT: s_mov_b32 s15, s11 ; GCN-IR-NEXT: s_add_u32 s10, s6, s14 -; GCN-IR-NEXT: s_addc_u32 s11, s7, s11 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while @@ -229,7 +227,6 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc ; GCN-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; GCN-NEXT: v_rcp_f32_e32 v4, v4 -; GCN-NEXT: v_mov_b32_e32 v13, 0 ; GCN-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; GCN-NEXT: v_trunc_f32_e32 v5, v5 @@ -245,7 +242,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v10, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v11, v4, v9 ; GCN-NEXT: v_mul_hi_u32 v12, v4, v8 -; GCN-NEXT: v_mul_hi_u32 v14, v5, v8 +; GCN-NEXT: v_mul_hi_u32 v13, v5, v8 ; GCN-NEXT: v_mul_lo_u32 v8, v5, v8 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc @@ -253,7 +250,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, 
v9, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v8 @@ -275,7 +272,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v7, v5, v7 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 @@ -291,7 +288,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 @@ -332,35 +329,33 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v8, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 +; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v10, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e32 v5, vcc, v8, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[5:6] -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GCN-IR-NEXT: v_min_u32_e32 v9, v4, v5 +; GCN-IR-NEXT: v_sub_i32_e64 
v5, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_subb_u32_e64 v6, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[5:6] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[5:6] ; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v9 ; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] ; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] ; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, 1, v5 -; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v6, vcc +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v5 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v6, vcc ; GCN-IR-NEXT: v_sub_i32_e64 v4, s[4:5], 63, v5 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[0:1], v4 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 @@ -371,36 +366,36 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v7, v8 -; GCN-IR-NEXT: v_not_b32_e32 v6, v9 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v10 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v6, v11, vcc -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v10 +; GCN-IR-NEXT: v_not_b32_e32 v6, 0 +; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v7, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: .LBB1_3: ; 
%udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[12:13], v[12:13], 1 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v6, 31, v5 -; GCN-IR-NEXT: v_or_b32_e32 v12, v12, v6 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v6 ; GCN-IR-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 -; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v12 -; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v13, vcc -; GCN-IR-NEXT: v_or_b32_e32 v4, v10, v4 -; GCN-IR-NEXT: v_ashrrev_i32_e32 v10, 31, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v14, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v6, vcc, v15, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v12, v4 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v12, 31, v6 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v8 -; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 -; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v10 -; GCN-IR-NEXT: v_and_b32_e32 v11, v10, v3 -; GCN-IR-NEXT: v_and_b32_e32 v10, v10, v2 +; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v12 +; GCN-IR-NEXT: v_and_b32_e32 v13, v12, v3 +; GCN-IR-NEXT: v_and_b32_e32 v12, v12, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v11, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v12 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v13, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow @@ -731,58 +726,57 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: s_mov_b32 s8, s4 ; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: 
s_mov_b32 s9, s5 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 -; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0 -; GCN-NEXT: v_trunc_f32_e32 v2, v2 -; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 +; GCN-NEXT: v_trunc_f32_e32 v1, v1 +; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 +; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 -; GCN-NEXT: v_mul_lo_u32 v6, s1, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 -; GCN-NEXT: v_mul_lo_u32 v6, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v2, v5 -; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 -; GCN-NEXT: v_mul_hi_u32 v9, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s0, v2 -; GCN-NEXT: v_mul_hi_u32 v4, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 ; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_mul_lo_u32 v4, s0, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_lo_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v0, v4 -; GCN-NEXT: v_mul_hi_u32 v9, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v2, v4 -; GCN-NEXT: v_mul_hi_u32 v5, v2, v3 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc -; GCN-NEXT: v_mul_lo_u32 v3, v2, v3 -; 
GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 +; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v1, v4 +; GCN-NEXT: v_mul_lo_u32 v4, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v8, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_mul_lo_u32 v3, s0, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 +; GCN-NEXT: v_mul_hi_u32 v8, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v1, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v2 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc ; GCN-NEXT: 
v_mul_lo_u32 v2, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 ; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 @@ -915,8 +909,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x41c00000 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_movk_i32 s4, 0xffe8 -; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -925,63 +919,62 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v4 -; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v3, v0, s4 +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 ; GCN-NEXT: v_mul_lo_u32 v4, v1, s4 -; GCN-NEXT: v_mul_lo_u32 v5, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v0, s4 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 
+; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 +; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_hi_u32 v2, v0, s4 +; GCN-NEXT: v_mul_lo_u32 v3, v1, s4 +; GCN-NEXT: v_mul_lo_u32 v4, v0, s4 ; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_mul_lo_u32 v4, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v5 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 -; GCN-NEXT: v_mul_hi_u32 v8, v1, v3 -; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc -; GCN-NEXT: v_mul_lo_u32 v7, v1, v5 -; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v5, vcc -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v2, vcc -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 -; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN-NEXT: v_mul_hi_u32 v5, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v6, s3, v1 +; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v5, v0, v4 +; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 +; GCN-NEXT: v_add_i32_e32 v3, vcc, 
v5, v3 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 +; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc +; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc +; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s3, v1 ; GCN-NEXT: v_mul_lo_u32 v1, s3, v1 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc -; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 +; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 ; GCN-NEXT: v_mul_hi_u32 v0, s3, v0 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc ; GCN-NEXT: v_mul_lo_u32 v1, v1, 24 @@ -1111,7 +1104,6 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_subb_u32_e32 v5, vcc, 0, v1, vcc ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 -; GCN-NEXT: v_mov_b32_e32 v11, 0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1124,18 +1116,18 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v9, v4, v2 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; GCN-NEXT: v_mul_lo_u32 v7, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v8, v2, v9 +; GCN-NEXT: v_mul_hi_u32 v7, v2, v9 +; 
GCN-NEXT: v_mul_lo_u32 v8, v2, v6 ; GCN-NEXT: v_mul_hi_u32 v10, v2, v6 -; GCN-NEXT: v_mul_hi_u32 v12, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v11, v3, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v10, vcc ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6 @@ -1157,7 +1149,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v5, v3, v5 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v6, v11, vcc +; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -1203,26 +1195,25 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v6, v2, v3 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v6 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc +; GCN-IR-NEXT: v_add_i32_e32 v3, vcc, 0xffffffd0, v6 +; GCN-IR-NEXT: v_addc_u32_e64 v4, s[6:7], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 -; GCN-IR-NEXT: v_mov_b32_e32 v4, s8 +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 ; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] +; 
GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[3:4] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v5, 0, s[4:5] ; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v5, v7 +; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0x8000 ; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] +; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 1, v3 +; GCN-IR-NEXT: v_addc_u32_e32 v8, vcc, 0, v4, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v3 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8] ; GCN-IR-NEXT: v_lshl_b64 v[2:3], s[8:9], v2 ; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1233,10 +1224,10 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v12, vcc, -1, v0 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0x8000 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc -; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 +; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v7 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 -; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 @@ -1268,15 +1259,15 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB8_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v3 -; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v2 +; GCN-IR-NEXT: v_lshl_b64 v[6:7], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v2, v5, v7 +; GCN-IR-NEXT: v_or_b32_e32 v5, v4, v6 ; GCN-IR-NEXT: .LBB8_6: ; %Flow4 
; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v5 -; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v4 -; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v4 +; GCN-IR-NEXT: v_mul_lo_u32 v2, v0, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v3, v0, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, v5 +; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, v5 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 0x8000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll index 3c7b49d0f501..17b357de1e78 100644 --- a/llvm/test/CodeGen/AMDGPU/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll @@ -31,9 +31,8 @@ define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_movk_i32 s4, 0xff -; GFX10-NEXT: v_and_b32_e32 v1, s4, v1 -; GFX10-NEXT: v_and_b32_e32 v0, s4, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX10-NEXT: v_sub_nc_u16 v0, v0, v1 clamp ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll index 940dc967edc0..129b93e5b001 100644 --- a/llvm/test/CodeGen/AMDGPU/v_pack.ll +++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll @@ -173,13 +173,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(half addrspace(1)* %in0, half addrspa ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_waitcnt_depctr 0xffe3 -; GISEL-NEXT: s_movk_i32 s0, 0x7fff -; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 -; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_and_b32_e32 v0, s0, v0 -; GISEL-NEXT: v_and_b32_sdwa v1, s0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GISEL-NEXT: v_and_or_b32 v0, 0xffff, 
v0, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff +; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v1 +; GISEL-NEXT: v_add_f16_e32 v2, 2.0, v2 +; GISEL-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GISEL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 ; GISEL-NEXT: ;;#ASMSTART ; GISEL-NEXT: ; use v0 ; GISEL-NEXT: ;;#ASMEND @@ -232,7 +231,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa ; GISEL-NEXT: s_mov_b32 s0, 0x8000 ; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1 ; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2 -; GISEL-NEXT: v_add_f16_e64 v0, s0, -v0 +; GISEL-NEXT: v_add_f16_e64 v0, 0x8000, -v0 ; GISEL-NEXT: v_add_f16_sdwa v1, s0, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1 ; GISEL-NEXT: ;;#ASMSTART diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index 8d4d929e029d..ee607351d5ba 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -967,13 +967,12 @@ define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_dwordx2 v[1:2], v[0:1], off -; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX10-NEXT: v_and_b32_e32 v4, v0, v1 -; GFX10-NEXT: v_and_b32_e32 v3, v0, v2 -; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v3 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v0 +; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -1384,7 +1383,7 @@ define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> ; GFX10-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; GFX10-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-NEXT: v_lshl_or_b32 v0, v5, 16, v1 diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index d85733d85363..a5e055290d37 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -190,8 +190,8 @@ define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, [8 x i ; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039 ; SI: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s[[[K_LO]]:[[K_HI]]] -; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] -; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] +; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3039 +; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0xf237b define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index 01820ea1854b..a0e46e61c6c1 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -51,9 +51,8 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, ; GCN: s_load_dword [[A:s[0-9]+]] ; GCN: s_load_dword [[B:s[0-9]+]] -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], [[MASK]] -; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], [[MASK]] +; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 
0xffff{{$}} +; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}} ; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]] ; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]