[AMDGPU] Aggressively fold immediates in SIFoldOperands

Previously SIFoldOperands::foldInstOperand would only fold a
non-inlinable immediate into a single user, so as not to increase code
size by adding the same 32-bit literal operand to many instructions.

This patch removes that restriction, so that a non-inlinable immediate
will be folded into any number of users. The rationale is:
- It reduces the number of registers used for holding constant values,
  which might increase occupancy. (On the other hand, many of these
  registers are SGPRs which no longer affect occupancy on GFX10+.)
- It reduces ALU stalls between the instruction that loads a constant
  into a register, and the instruction that uses it.
- The above benefits are expected to outweigh any increase in code size.

Differential Revision: https://reviews.llvm.org/D114643
This commit is contained in:
Jay Foad 2022-05-16 15:48:11 +01:00
parent aa568e082b
commit 3eb2281bc0
135 changed files with 10543 additions and 11599 deletions

View File

@ -146,30 +146,6 @@ static unsigned macToMad(unsigned Opc) {
return AMDGPU::INSTRUCTION_LIST_END;
}
// Reports whether OpToFold counts as an inline constant for operand OpNo of
// UseMI, taking into account folds that replace the use instruction's opcode.
static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
                                     const MachineInstr &UseMI,
                                     unsigned OpNo,
                                     const MachineOperand &OpToFold) {
  // Common case: the operand is already an inline constant for UseMI as-is.
  if (TII->isInlineConstant(UseMI, OpNo, OpToFold))
    return true;

  const unsigned UseOpc = UseMI.getOpcode();
  const unsigned MadOpc = macToMad(UseOpc);

  // No mad counterpart for this opcode, so there is no special case to apply.
  if (MadOpc == AMDGPU::INSTRUCTION_LIST_END)
    return false;

  // A mac is rewritten to a mad only when the fold targets src2; any other
  // operand keeps the answer from the direct check above.
  const int Src2Idx = AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src2);
  if (static_cast<int>(OpNo) != Src2Idx)
    return false;

  // Legality must be judged against the operand type of the final (mad)
  // instruction rather than the original mac.
  const MCInstrDesc &MadDesc = TII->get(MadOpc);
  return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
static bool frameIndexMayFold(const SIInstrInfo *TII,
@ -1267,59 +1243,13 @@ bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
}
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
for (auto &Use :
make_early_inc_range(MRI->use_nodbg_operands(Dst.getReg()))) {
MachineInstr *UseMI = Use.getParent();
unsigned OpNo = UseMI->getOperandNo(&Use);
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
//
// The legality of the inline immediate must be checked based on the use
// operand, not the defining instruction, because 32-bit instructions
// with 32-bit inline immediate sources may be used to materialize
// constants used in 16-bit operands.
//
// e.g. it is unsafe to fold:
// s_mov_b32 s0, 1.0 // materializes 0x3f800000
// v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &Use;
NonInlineUseOpNo = OpNo;
}
}
}
if (NumLiteralUses == 1) {
MachineInstr *UseMI = NonInlineUse->getParent();
foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
}
} else {
// Folding register.
SmallVector <MachineOperand *, 4> UsesToProcess;
for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
UsesToProcess.push_back(&Use);
for (auto U : UsesToProcess) {
MachineInstr *UseMI = U->getParent();
foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U),
FoldList, CopiesToReplace);
}
SmallVector<MachineOperand *, 4> UsesToProcess;
for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
UsesToProcess.push_back(&Use);
for (auto U : UsesToProcess) {
MachineInstr *UseMI = U->getParent();
foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
CopiesToReplace);
}
if (CopiesToReplace.empty() && FoldList.empty())

View File

@ -127,9 +127,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_movk_i32 s4, 0xffc0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_add_u16_e32 v1, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0xffffffc0
; GFX8-NEXT: v_add_u16_e32 v1, 0xffc0, v0
; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
@ -209,14 +208,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_splat:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_mov_b32 s1, 0xffc0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -243,13 +240,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_lo:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, 0xffc0
; GFX8-NEXT: s_add_i32 s1, s1, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -276,13 +272,12 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
;
; GFX8-LABEL: s_add_v2i16_neg_inline_imm_hi:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, 4
; GFX8-NEXT: s_add_i32 s1, s1, 0xffc0
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -310,15 +305,14 @@ define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
;
; GFX8-LABEL: s_add_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s4
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -349,15 +343,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
; GFX8-LABEL: s_add_v2i16_fneg_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s4
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -391,15 +384,14 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
; GFX8-LABEL: s_add_v2i16_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s4
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -422,9 +414,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x half> inreg %b) {
; GFX9-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0x80008000
; GFX9-NEXT: s_xor_b32 s0, s0, s2
; GFX9-NEXT: s_xor_b32 s1, s1, s2
; GFX9-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX9-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_add_i32 s0, s0, s1
@ -434,26 +425,23 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
;
; GFX8-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0x80008000
; GFX8-NEXT: s_xor_b32 s0, s0, s2
; GFX8-NEXT: s_xor_b32 s1, s1, s2
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX8-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: s_add_i32 s2, s2, s4
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_add_v2i16_fneg_lhs_fneg_rhs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0x80008000
; GFX10-NEXT: s_xor_b32 s0, s0, s2
; GFX10-NEXT: s_xor_b32 s1, s1, s2
; GFX10-NEXT: s_xor_b32 s0, s0, 0x80008000
; GFX10-NEXT: s_xor_b32 s1, s1, 0x80008000
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_add_i32 s0, s0, s1

View File

@ -967,7 +967,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: uaddo_i16_sv:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s1, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_and_b32_e32 v1, s1, v0
@ -980,7 +980,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
; GFX8-LABEL: uaddo_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s1, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_and_b32_e32 v1, s1, v0
@ -992,8 +992,7 @@ define amdgpu_ps i16 @uaddo_i16_sv(i16 inreg %a, i16 %b) {
;
; GFX9-LABEL: uaddo_i16_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s1, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_add_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]

View File

@ -429,13 +429,12 @@ define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_andn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1
define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_andn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inr
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_andn2_v2i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_andn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s3, s4, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s1
; GFX6-NEXT: s_xor_b32 s2, s2, -1
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_andn2_v2i16_multi_foldable_use:
@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_andn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1
; GFX6-LABEL: s_andn2_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_andn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inr
; GFX6-LABEL: s_andn2_v4i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
; GFX6-LABEL: s_andn2_v4i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
define amdgpu_ps { i64, i64 } @s_andn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
; GFX6-LABEL: s_andn2_v4i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s14, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s1, s2, s14
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s14
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, s14
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, s14
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_lshl_b32 s4, s11, 16
; GFX6-NEXT: s_and_b32 s5, s10, s14
; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
; GFX6-NEXT: s_or_b32 s4, s4, s5
; GFX6-NEXT: s_lshl_b32 s5, s13, 16
; GFX6-NEXT: s_and_b32 s6, s12, s14
; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
; GFX6-NEXT: s_or_b32 s5, s5, s6
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, s6

View File

@ -794,24 +794,22 @@ define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
; GFX6-LABEL: s_ashr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s1, s1
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_ashr_i32 s1, s1, s3
; GFX6-NEXT: s_ashr_i32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ashr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s2, s0
; GFX8-NEXT: s_bfe_i32 s0, s0, s3
; GFX8-NEXT: s_sext_i32_i16 s4, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, s3
; GFX8-NEXT: s_ashr_i32 s2, s2, s4
; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s3, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
; GFX8-NEXT: s_ashr_i32 s2, s2, s3
; GFX8-NEXT: s_ashr_i32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s1, s2, 0xffff
@ -886,12 +884,12 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: ashr_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_ashrrev_i32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_ashrrev_i32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
@ -994,45 +992,42 @@ define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
; GFX6-LABEL: s_ashr_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s1, s1
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_ashr_i32 s1, s1, s5
; GFX6-NEXT: s_ashr_i32 s0, s0, s4
; GFX6-NEXT: s_sext_i32_i16 s2, s2
; GFX6-NEXT: s_sext_i32_i16 s3, s3
; GFX6-NEXT: s_and_b32 s1, s1, s8
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_ashr_i32 s2, s2, s6
; GFX6-NEXT: s_ashr_i32 s3, s3, s7
; GFX6-NEXT: s_and_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s8
; GFX6-NEXT: s_and_b32 s2, s3, s8
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ashr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s5, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s4, s0
; GFX8-NEXT: s_bfe_i32 s0, s0, s5
; GFX8-NEXT: s_sext_i32_i16 s6, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, s5
; GFX8-NEXT: s_sext_i32_i16 s7, s2
; GFX8-NEXT: s_bfe_i32 s2, s2, s5
; GFX8-NEXT: s_sext_i32_i16 s8, s3
; GFX8-NEXT: s_bfe_i32 s3, s3, s5
; GFX8-NEXT: s_ashr_i32 s4, s4, s7
; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s5, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s6, s2
; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s7, s3
; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
; GFX8-NEXT: s_ashr_i32 s4, s4, s6
; GFX8-NEXT: s_ashr_i32 s0, s0, s2
; GFX8-NEXT: s_ashr_i32 s2, s6, s8
; GFX8-NEXT: s_ashr_i32 s2, s5, s7
; GFX8-NEXT: s_ashr_i32 s1, s1, s3
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s4, s4, s3
; GFX8-NEXT: s_and_b32 s3, s4, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s2, s2, s3
; GFX8-NEXT: s_or_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s0, s0, s3
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;
@ -1191,79 +1186,76 @@ define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
; GFX6-LABEL: s_ashr_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sext_i32_i16 s1, s1
; GFX6-NEXT: s_mov_b32 s16, 0xffff
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: s_ashr_i32 s1, s1, s9
; GFX6-NEXT: s_ashr_i32 s0, s0, s8
; GFX6-NEXT: s_sext_i32_i16 s2, s2
; GFX6-NEXT: s_sext_i32_i16 s3, s3
; GFX6-NEXT: s_and_b32 s1, s1, s16
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_ashr_i32 s2, s2, s10
; GFX6-NEXT: s_ashr_i32 s3, s3, s11
; GFX6-NEXT: s_sext_i32_i16 s5, s5
; GFX6-NEXT: s_and_b32 s0, s0, s16
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_sext_i32_i16 s4, s4
; GFX6-NEXT: s_ashr_i32 s5, s5, s13
; GFX6-NEXT: s_sext_i32_i16 s7, s7
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s16
; GFX6-NEXT: s_and_b32 s2, s3, s16
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_ashr_i32 s4, s4, s12
; GFX6-NEXT: s_sext_i32_i16 s6, s6
; GFX6-NEXT: s_ashr_i32 s7, s7, s15
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_and_b32 s3, s5, s16
; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
; GFX6-NEXT: s_ashr_i32 s6, s6, s14
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s2, s4, s16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_and_b32 s4, s7, s16
; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_and_b32 s3, s6, s16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_ashr_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s9, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s8, s0
; GFX8-NEXT: s_bfe_i32 s0, s0, s9
; GFX8-NEXT: s_sext_i32_i16 s10, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, s9
; GFX8-NEXT: s_sext_i32_i16 s12, s3
; GFX8-NEXT: s_bfe_i32 s3, s3, s9
; GFX8-NEXT: s_sext_i32_i16 s13, s4
; GFX8-NEXT: s_bfe_i32 s4, s4, s9
; GFX8-NEXT: s_sext_i32_i16 s14, s5
; GFX8-NEXT: s_bfe_i32 s5, s5, s9
; GFX8-NEXT: s_sext_i32_i16 s16, s7
; GFX8-NEXT: s_bfe_i32 s7, s7, s9
; GFX8-NEXT: s_sext_i32_i16 s11, s2
; GFX8-NEXT: s_bfe_i32 s2, s2, s9
; GFX8-NEXT: s_sext_i32_i16 s15, s6
; GFX8-NEXT: s_bfe_i32 s6, s6, s9
; GFX8-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s9, s1
; GFX8-NEXT: s_bfe_i32 s1, s1, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s12, s4
; GFX8-NEXT: s_bfe_i32 s4, s4, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s13, s5
; GFX8-NEXT: s_bfe_i32 s5, s5, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s10, s2
; GFX8-NEXT: s_bfe_i32 s2, s2, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s14, s6
; GFX8-NEXT: s_bfe_i32 s6, s6, 0x100010
; GFX8-NEXT: s_ashr_i32 s0, s0, s4
; GFX8-NEXT: s_ashr_i32 s4, s10, s14
; GFX8-NEXT: s_ashr_i32 s4, s9, s13
; GFX8-NEXT: s_ashr_i32 s1, s1, s5
; GFX8-NEXT: s_ashr_i32 s3, s3, s7
; GFX8-NEXT: s_mov_b32 s7, 0xffff
; GFX8-NEXT: s_ashr_i32 s5, s11, s15
; GFX8-NEXT: s_sext_i32_i16 s11, s3
; GFX8-NEXT: s_bfe_i32 s3, s3, 0x100010
; GFX8-NEXT: s_sext_i32_i16 s15, s7
; GFX8-NEXT: s_bfe_i32 s7, s7, 0x100010
; GFX8-NEXT: s_ashr_i32 s5, s10, s14
; GFX8-NEXT: s_ashr_i32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s4, s4, s7
; GFX8-NEXT: s_ashr_i32 s8, s8, s13
; GFX8-NEXT: s_ashr_i32 s6, s12, s16
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
; GFX8-NEXT: s_ashr_i32 s8, s8, s12
; GFX8-NEXT: s_ashr_i32 s6, s11, s15
; GFX8-NEXT: s_ashr_i32 s3, s3, s7
; GFX8-NEXT: s_or_b32 s1, s1, s4
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s4, s5, s7
; GFX8-NEXT: s_and_b32 s4, s5, 0xffff
; GFX8-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NEXT: s_and_b32 s8, s8, s7
; GFX8-NEXT: s_and_b32 s7, s8, 0xffff
; GFX8-NEXT: s_or_b32 s2, s2, s4
; GFX8-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NEXT: s_and_b32 s4, s6, s7
; GFX8-NEXT: s_or_b32 s0, s0, s8
; GFX8-NEXT: s_and_b32 s4, s6, 0xffff
; GFX8-NEXT: s_or_b32 s0, s0, s7
; GFX8-NEXT: s_or_b32 s3, s3, s4
; GFX8-NEXT: ; return to shader part epilog
;

View File

@ -111,9 +111,8 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
;
; GFX10-LABEL: s_bswap_v2i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0x10203
; GFX10-NEXT: v_perm_b32 v0, 0, s0, s2
; GFX10-NEXT: v_perm_b32 v1, 0, s1, s2
; GFX10-NEXT: v_perm_b32 v0, 0, s0, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, s1, 0x10203
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@ -154,9 +153,8 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0x10203
; GFX10-NEXT: v_perm_b32 v0, 0, v0, s4
; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4
; GFX10-NEXT: v_perm_b32 v0, 0, v0, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203
; GFX10-NEXT: s_setpc_b64 s[30:31]
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
ret <2 x i32> %bswap
@ -200,9 +198,8 @@ define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
;
; GFX10-LABEL: s_bswap_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0x10203
; GFX10-NEXT: v_perm_b32 v0, 0, s1, s2
; GFX10-NEXT: v_perm_b32 v1, 0, s0, s2
; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: ; return to shader part epilog
@ -246,9 +243,8 @@ define i64 @v_bswap_i64(i64 %src) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0x10203
; GFX10-NEXT: v_perm_b32 v2, 0, v1, s4
; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4
; GFX10-NEXT: v_perm_b32 v2, 0, v1, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%bswap = call i64 @llvm.bswap.i64(i64 %src)
@ -313,11 +309,10 @@ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
;
; GFX10-LABEL: s_bswap_v2i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s4, 0x10203
; GFX10-NEXT: v_perm_b32 v0, 0, s1, s4
; GFX10-NEXT: v_perm_b32 v1, 0, s0, s4
; GFX10-NEXT: v_perm_b32 v2, 0, s3, s4
; GFX10-NEXT: v_perm_b32 v3, 0, s2, s4
; GFX10-NEXT: v_perm_b32 v0, 0, s1, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, s0, 0x10203
; GFX10-NEXT: v_perm_b32 v2, 0, s3, 0x10203
; GFX10-NEXT: v_perm_b32 v3, 0, s2, 0x10203
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: v_readfirstlane_b32 s1, v1
; GFX10-NEXT: v_readfirstlane_b32 s2, v2
@ -376,11 +371,10 @@ define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0x10203
; GFX10-NEXT: v_perm_b32 v4, 0, v1, s4
; GFX10-NEXT: v_perm_b32 v5, 0, v3, s4
; GFX10-NEXT: v_perm_b32 v1, 0, v0, s4
; GFX10-NEXT: v_perm_b32 v3, 0, v2, s4
; GFX10-NEXT: v_perm_b32 v4, 0, v1, 0x10203
; GFX10-NEXT: v_perm_b32 v5, 0, v3, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, v0, 0x10203
; GFX10-NEXT: v_perm_b32 v3, 0, v2, 0x10203
; GFX10-NEXT: v_mov_b32_e32 v0, v4
; GFX10-NEXT: v_mov_b32_e32 v2, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -457,12 +451,11 @@ define i16 @v_bswap_i16(i16 %src) {
define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX7-LABEL: s_bswap_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s3, 0x80008
; GFX7-NEXT: s_lshl_b32 s2, s0, 8
; GFX7-NEXT: s_bfe_u32 s0, s0, s3
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX7-NEXT: s_or_b32 s0, s0, s2
; GFX7-NEXT: s_lshl_b32 s2, s1, 8
; GFX7-NEXT: s_bfe_u32 s1, s1, s3
; GFX7-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_bfe_u32 s1, s1, 0x100000
; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
@ -647,9 +640,8 @@ define i64 @v_bswap_i48(i64 %src) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0x10203
; GFX10-NEXT: v_perm_b32 v1, 0, v1, s4
; GFX10-NEXT: v_perm_b32 v2, 0, v0, s4
; GFX10-NEXT: v_perm_b32 v1, 0, v1, 0x10203
; GFX10-NEXT: v_perm_b32 v2, 0, v0, 0x10203
; GFX10-NEXT: v_lshrrev_b64 v[0:1], 16, v[1:2]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%trunc = trunc i64 %src to i48

View File

@ -901,30 +901,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-NEXT: v_and_or_b32 v3, v3, v8, s4
; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6
; GFX10-NEXT: v_and_or_b32 v2, v2, v8, v7
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v3, s4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v7
; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-NEXT: v_and_or_b32 v2, v4, v8, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v4, v2
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-NEXT: v_pk_add_f16 v0, v2, v0
; GFX10-NEXT: v_and_or_b32 v2, v5, v8, s4
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_pk_add_f16 v1, v2, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-NEXT: v_and_or_b32 v0, v0, v8, v3
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_3xhalf_add_mul_rhs:
@ -934,23 +933,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX10-CONTRACT-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-CONTRACT-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v6
; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v2, v9, v7
; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v4, v9, v8
; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v2, v7
; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v4, v8
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, v3, v9, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, v5, v9, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v2, 0xffff, v3, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v4, 0xffff, v5, s4
; GFX10-CONTRACT-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-CONTRACT-NEXT: v_pk_fma_f16 v1, v1, v2, v4
; GFX10-CONTRACT-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, v1, v9, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, v0, v9, v3
; GFX10-CONTRACT-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-CONTRACT-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-DENORM-LABEL: test_3xhalf_add_mul_rhs:
@ -959,30 +957,29 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-DENORM-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-DENORM-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-DENORM-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v3, v3, v8, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6
; GFX10-DENORM-NEXT: v_and_or_b32 v2, v2, v8, v7
; GFX10-DENORM-NEXT: v_and_or_b32 v3, 0xffff, v3, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v2, v7
; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-DENORM-NEXT: v_and_or_b32 v2, v4, v8, v2
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v6
; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v4, v2
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-DENORM-NEXT: v_pk_add_f16 v0, v2, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v2, v5, v8, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v2, 0xffff, v5, s4
; GFX10-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-DENORM-NEXT: v_pk_add_f16 v1, v2, v1
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v1, v8, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v0, v8, v3
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-UNSAFE-LABEL: test_3xhalf_add_mul_rhs:
@ -992,23 +989,22 @@ define <3 x half> @test_3xhalf_add_mul_rhs(<3 x half> %x, <3 x half> %y, <3 x ha
; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX10-UNSAFE-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-UNSAFE-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v6
; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v2, v9, v7
; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v4, v9, v8
; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v6
; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v2, v7
; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v4, v8
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v0, v0, v2, v4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, v3, v9, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, v5, v9, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v2, 0xffff, v3, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v4, 0xffff, v5, s4
; GFX10-UNSAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-UNSAFE-NEXT: v_pk_fma_f16 v1, v1, v2, v4
; GFX10-UNSAFE-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, v1, v9, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, v0, v9, v3
; GFX10-UNSAFE-NEXT: v_and_or_b32 v1, 0xffff, v1, s4
; GFX10-UNSAFE-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
; GFX10-UNSAFE-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul <3 x half> %x, %y

View File

@ -113,10 +113,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul(<4 x half> %x,
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul:
; GFX10-DENORM: ; %bb.0: ; %entry
; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000
; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0
; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@ -155,10 +154,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul(<4 x half> %x,
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul:
; GFX10-DENORM: ; %bb.0: ; %entry
; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000
; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, 0x80008000, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v0
; GFX10-DENORM-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX10-DENORM-NEXT: v_xor_b32_e32 v3, s0, v3
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v10, v1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@ -198,10 +196,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_ext_neg_mul2(<4 x float> %
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_ext_neg_mul2:
; GFX10-DENORM: ; %bb.0: ; %entry
; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000
; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6
; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4
; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6
; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@ -240,10 +237,9 @@ define amdgpu_vs <4 x float> @test_v4f16_to_v4f32_sub_neg_ext_mul2(<4 x float> %
;
; GFX10-DENORM-LABEL: test_v4f16_to_v4f32_sub_neg_ext_mul2:
; GFX10-DENORM: ; %bb.0: ; %entry
; GFX10-DENORM-NEXT: s_mov_b32 s0, 0x80008000
; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, 0x80008000, v6
; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, 0x80008000, v7
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v8, v4
; GFX10-DENORM-NEXT: v_xor_b32_e32 v6, s0, v6
; GFX10-DENORM-NEXT: v_xor_b32_e32 v7, s0, v7
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-DENORM-NEXT: v_cvt_f32_f16_e32 v9, v5
; GFX10-DENORM-NEXT: v_cvt_f32_f16_sdwa v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1

View File

@ -488,10 +488,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4
; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@ -511,10 +510,9 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4
; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5
; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul <4 x half> %x, %y
@ -567,10 +565,9 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-NEXT: v_add_f16_e64 v2, v4, -v0
; GFX10-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e64 v3, v5, -v1
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@ -590,10 +587,9 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v4, -v0
; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v4, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v5, -v1
; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v5, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
.entry:
%a = fmul <4 x half> %x, %y

View File

@ -90,9 +90,8 @@ define half @test_f16_sub_ext_neg_mul(half %x, half %y, half %z) {
; GFX10-CONTRACT: ; %bb.0: ; %entry
; GFX10-CONTRACT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-CONTRACT-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CONTRACT-NEXT: s_mov_b32 s4, 0x8000
; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, s4, v2
; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; GFX10-CONTRACT-NEXT: v_xor_b32_e32 v2, 0x8000, v2
; GFX10-CONTRACT-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX10-CONTRACT-NEXT: s_setpc_b64 s[30:31]
;
@ -278,10 +277,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-NEXT: v_add_f16_e64 v2, v0, -v4
; GFX10-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_add_f16_e64 v3, v1, -v5
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@ -301,10 +299,9 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX10-DENORM-NEXT: v_add_f16_e64 v2, v0, -v4
; GFX10-DENORM-NEXT: v_add_f16_sdwa v0, v0, -v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_add_f16_e64 v3, v1, -v5
; GFX10-DENORM-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DENORM-NEXT: v_add_f16_sdwa v1, v1, -v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, v2, v4, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, v3, v4, v1
; GFX10-DENORM-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-DENORM-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
entry:
%a = fmul <4 x half> %x, %y

View File

@ -763,29 +763,26 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
; GCN-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s7, 0x80008
; GCN-NEXT: s_movk_i32 s2, 0xff
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s8, s0, s7
; GCN-NEXT: s_and_b32 s6, s0, s2
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_or_b32 s6, s6, s8
; GCN-NEXT: s_mov_b32 s8, 0x80010
; GCN-NEXT: s_lshr_b32 s3, s0, 24
; GCN-NEXT: s_bfe_u32 s0, s0, s8
; GCN-NEXT: s_bfe_u32 s6, s0, 0x80008
; GCN-NEXT: s_lshr_b32 s2, s0, 24
; GCN-NEXT: s_and_b32 s5, s0, 0xff
; GCN-NEXT: s_lshl_b32 s6, s6, 8
; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010
; GCN-NEXT: s_or_b32 s5, s5, s6
; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: s_or_b32 s0, s6, s0
; GCN-NEXT: s_lshl_b32 s3, s3, 24
; GCN-NEXT: s_or_b32 s0, s0, s3
; GCN-NEXT: s_bfe_u32 s3, s1, s7
; GCN-NEXT: s_lshr_b32 s5, s1, 24
; GCN-NEXT: s_and_b32 s2, s1, s2
; GCN-NEXT: s_lshl_b32 s3, s3, 8
; GCN-NEXT: s_bfe_u32 s1, s1, s8
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_or_b32 s0, s5, s0
; GCN-NEXT: s_lshl_b32 s2, s2, 24
; GCN-NEXT: s_bfe_u32 s5, s1, 0x80008
; GCN-NEXT: s_lshr_b32 s3, s1, 24
; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: s_and_b32 s2, s1, 0xff
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010
; GCN-NEXT: s_or_b32 s2, s2, s5
; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: s_lshl_b32 s2, s5, 24
; GCN-NEXT: s_lshl_b32 s2, s3, 24
; GCN-NEXT: s_or_b32 s1, s1, s2
; GCN-NEXT: s_lshr_b32 s2, s4, 2
; GCN-NEXT: s_cmp_eq_u32 s2, 1
@ -798,32 +795,29 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_sgpr_idx(<8 x i8> addrspace(4)* in
; GFX10-LABEL: extractelement_sgpr_v8i8_sgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_movk_i32 s2, 0xff
; GFX10-NEXT: s_mov_b32 s5, 0x80010
; GFX10-NEXT: s_lshr_b32 s6, s4, 2
; GFX10-NEXT: s_lshr_b32 s2, s4, 2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_bfe_u32 s10, s0, s3
; GFX10-NEXT: s_bfe_u32 s3, s1, s3
; GFX10-NEXT: s_lshr_b32 s7, s0, 24
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_and_b32 s9, s0, s2
; GFX10-NEXT: s_bfe_u32 s0, s0, s5
; GFX10-NEXT: s_and_b32 s2, s1, s2
; GFX10-NEXT: s_bfe_u32 s1, s1, s5
; GFX10-NEXT: s_lshl_b32 s5, s10, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_bfe_u32 s7, s0, 0x80008
; GFX10-NEXT: s_bfe_u32 s9, s1, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s0, 24
; GFX10-NEXT: s_lshr_b32 s5, s1, 24
; GFX10-NEXT: s_and_b32 s6, s0, 0xff
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX10-NEXT: s_and_b32 s8, s1, 0xff
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX10-NEXT: s_lshl_b32 s7, s7, 8
; GFX10-NEXT: s_lshl_b32 s9, s9, 8
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: s_or_b32 s5, s9, s5
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s7, s7, 24
; GFX10-NEXT: s_lshl_b32 s8, s8, 24
; GFX10-NEXT: s_or_b32 s0, s5, s0
; GFX10-NEXT: s_or_b32 s1, s2, s1
; GFX10-NEXT: s_or_b32 s0, s0, s7
; GFX10-NEXT: s_or_b32 s1, s1, s8
; GFX10-NEXT: s_cmp_eq_u32 s6, 1
; GFX10-NEXT: s_or_b32 s6, s6, s7
; GFX10-NEXT: s_or_b32 s7, s8, s9
; GFX10-NEXT: s_lshl_b32 s3, s3, 24
; GFX10-NEXT: s_lshl_b32 s5, s5, 24
; GFX10-NEXT: s_or_b32 s0, s6, s0
; GFX10-NEXT: s_or_b32 s1, s7, s1
; GFX10-NEXT: s_or_b32 s0, s0, s3
; GFX10-NEXT: s_or_b32 s1, s1, s5
; GFX10-NEXT: s_cmp_eq_u32 s2, 1
; GFX10-NEXT: s_cselect_b32 s0, s1, s0
; GFX10-NEXT: s_and_b32 s1, s4, 3
; GFX10-NEXT: s_lshl_b32 s1, s1, 3
@ -934,7 +928,6 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_mov_b32 s1, 16
; GFX10-NEXT: s_movk_i32 s3, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@ -942,9 +935,9 @@ define amdgpu_ps i8 @extractelement_vgpr_v8i8_sgpr_idx(<8 x i8> addrspace(1)* %p
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v5
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v4
; GFX10-NEXT: s_lshr_b32 s0, s2, 2
; GFX10-NEXT: v_or3_b32 v0, v0, v6, v2
@ -1063,7 +1056,6 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: s_mov_b32 s5, 16
; GFX10-NEXT: s_movk_i32 s6, 0xff
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
@ -1071,9 +1063,9 @@ define i8 @extractelement_vgpr_v8i8_vgpr_idx(<8 x i8> addrspace(1)* %ptr, i32 %i
; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v8, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s6, v4
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, s6, v6
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 2, v2
; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
@ -1093,37 +1085,34 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
; GCN-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s6, 0x80008
; GCN-NEXT: s_movk_i32 s2, 0xff
; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s7, s0, s6
; GCN-NEXT: s_and_b32 s5, s0, s2
; GCN-NEXT: s_lshl_b32 s7, s7, 8
; GCN-NEXT: s_or_b32 s5, s5, s7
; GCN-NEXT: s_mov_b32 s7, 0x80010
; GCN-NEXT: s_lshr_b32 s3, s0, 24
; GCN-NEXT: s_bfe_u32 s0, s0, s7
; GCN-NEXT: s_bfe_u32 s5, s0, 0x80008
; GCN-NEXT: s_lshr_b32 s2, s0, 24
; GCN-NEXT: s_and_b32 s4, s0, 0xff
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: s_or_b32 s0, s5, s0
; GCN-NEXT: s_lshl_b32 s3, s3, 24
; GCN-NEXT: s_or_b32 s0, s0, s3
; GCN-NEXT: s_bfe_u32 s3, s1, s6
; GCN-NEXT: s_lshr_b32 s4, s1, 24
; GCN-NEXT: s_and_b32 s2, s1, s2
; GCN-NEXT: s_lshl_b32 s3, s3, 8
; GCN-NEXT: s_bfe_u32 s1, s1, s7
; GCN-NEXT: s_or_b32 s2, s2, s3
; GCN-NEXT: s_or_b32 s0, s4, s0
; GCN-NEXT: s_lshl_b32 s2, s2, 24
; GCN-NEXT: s_bfe_u32 s4, s1, 0x80008
; GCN-NEXT: s_lshr_b32 s3, s1, 24
; GCN-NEXT: s_or_b32 s0, s0, s2
; GCN-NEXT: s_and_b32 s2, s1, 0xff
; GCN-NEXT: s_lshl_b32 s4, s4, 8
; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010
; GCN-NEXT: s_or_b32 s2, s2, s4
; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_or_b32 s1, s2, s1
; GCN-NEXT: s_lshl_b32 s2, s4, 24
; GCN-NEXT: s_lshl_b32 s2, s3, 24
; GCN-NEXT: s_or_b32 s1, s1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
@ -1131,33 +1120,30 @@ define amdgpu_ps i8 @extractelement_sgpr_v8i8_vgpr_idx(<8 x i8> addrspace(4)* in
; GFX10-LABEL: extractelement_sgpr_v8i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_movk_i32 s2, 0xff
; GFX10-NEXT: s_mov_b32 s4, 0x80010
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_bfe_u32 s8, s0, s3
; GFX10-NEXT: s_bfe_u32 s3, s1, s3
; GFX10-NEXT: s_lshr_b32 s6, s1, 24
; GFX10-NEXT: s_and_b32 s7, s0, s2
; GFX10-NEXT: s_and_b32 s2, s1, s2
; GFX10-NEXT: s_bfe_u32 s1, s1, s4
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_bfe_u32 s7, s1, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s1, 24
; GFX10-NEXT: s_and_b32 s6, s1, 0xff
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX10-NEXT: s_lshl_b32 s7, s7, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: s_or_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s3, s6, 24
; GFX10-NEXT: s_or_b32 s1, s2, s1
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_bfe_u32 s0, s0, s4
; GFX10-NEXT: s_lshl_b32 s4, s8, 8
; GFX10-NEXT: s_or_b32 s6, s6, s7
; GFX10-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 24
; GFX10-NEXT: s_or_b32 s1, s6, s1
; GFX10-NEXT: s_lshr_b32 s2, s0, 24
; GFX10-NEXT: s_and_b32 s4, s0, 0xff
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_or_b32 s1, s1, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_or_b32 s3, s7, s4
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshl_b32 s2, s5, 24
; GFX10-NEXT: s_lshl_b32 s2, s2, 24
; GFX10-NEXT: s_or_b32 s0, s3, s0
; GFX10-NEXT: s_or_b32 s0, s0, s2
; GFX10-NEXT: v_cndmask_b32_e32 v1, s0, v2, vcc_lo
@ -2089,45 +2075,42 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
; GCN-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s11, 0x80008
; GCN-NEXT: s_movk_i32 s9, 0xff
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s12, s0, s11
; GCN-NEXT: s_and_b32 s10, s0, s9
; GCN-NEXT: s_lshl_b32 s12, s12, 8
; GCN-NEXT: s_or_b32 s10, s10, s12
; GCN-NEXT: s_mov_b32 s12, 0x80010
; GCN-NEXT: s_bfe_u32 s10, s0, 0x80008
; GCN-NEXT: s_lshr_b32 s5, s0, 24
; GCN-NEXT: s_bfe_u32 s0, s0, s12
; GCN-NEXT: s_and_b32 s9, s0, 0xff
; GCN-NEXT: s_lshl_b32 s10, s10, 8
; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010
; GCN-NEXT: s_or_b32 s9, s9, s10
; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: s_or_b32 s0, s10, s0
; GCN-NEXT: s_or_b32 s0, s9, s0
; GCN-NEXT: s_lshl_b32 s5, s5, 24
; GCN-NEXT: s_bfe_u32 s10, s1, s11
; GCN-NEXT: s_bfe_u32 s9, s1, 0x80008
; GCN-NEXT: s_lshr_b32 s6, s1, 24
; GCN-NEXT: s_or_b32 s0, s0, s5
; GCN-NEXT: s_and_b32 s5, s1, s9
; GCN-NEXT: s_lshl_b32 s10, s10, 8
; GCN-NEXT: s_bfe_u32 s1, s1, s12
; GCN-NEXT: s_or_b32 s5, s5, s10
; GCN-NEXT: s_and_b32 s5, s1, 0xff
; GCN-NEXT: s_lshl_b32 s9, s9, 8
; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010
; GCN-NEXT: s_or_b32 s5, s5, s9
; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_or_b32 s1, s5, s1
; GCN-NEXT: s_lshl_b32 s5, s6, 24
; GCN-NEXT: s_bfe_u32 s6, s2, s11
; GCN-NEXT: s_bfe_u32 s6, s2, 0x80008
; GCN-NEXT: s_lshr_b32 s7, s2, 24
; GCN-NEXT: s_or_b32 s1, s1, s5
; GCN-NEXT: s_and_b32 s5, s2, s9
; GCN-NEXT: s_and_b32 s5, s2, 0xff
; GCN-NEXT: s_lshl_b32 s6, s6, 8
; GCN-NEXT: s_bfe_u32 s2, s2, s12
; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010
; GCN-NEXT: s_or_b32 s5, s5, s6
; GCN-NEXT: s_lshl_b32 s2, s2, 16
; GCN-NEXT: s_or_b32 s2, s5, s2
; GCN-NEXT: s_lshl_b32 s5, s7, 24
; GCN-NEXT: s_bfe_u32 s6, s3, s11
; GCN-NEXT: s_bfe_u32 s6, s3, 0x80008
; GCN-NEXT: s_lshr_b32 s8, s3, 24
; GCN-NEXT: s_or_b32 s2, s2, s5
; GCN-NEXT: s_and_b32 s5, s3, s9
; GCN-NEXT: s_and_b32 s5, s3, 0xff
; GCN-NEXT: s_lshl_b32 s6, s6, 8
; GCN-NEXT: s_bfe_u32 s3, s3, s12
; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010
; GCN-NEXT: s_or_b32 s5, s5, s6
; GCN-NEXT: s_lshl_b32 s3, s3, 16
; GCN-NEXT: s_or_b32 s3, s5, s3
@ -2148,50 +2131,47 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_sgpr_idx(<16 x i8> addrspace(4)*
; GFX10-LABEL: extractelement_sgpr_v16i8_sgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s6, 0x80008
; GFX10-NEXT: s_movk_i32 s5, 0xff
; GFX10-NEXT: s_mov_b32 s7, 0x80010
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_bfe_u32 s13, s0, s6
; GFX10-NEXT: s_lshr_b32 s8, s0, 24
; GFX10-NEXT: s_and_b32 s12, s0, s5
; GFX10-NEXT: s_bfe_u32 s0, s0, s7
; GFX10-NEXT: s_lshl_b32 s13, s13, 8
; GFX10-NEXT: s_bfe_u32 s15, s1, s6
; GFX10-NEXT: s_bfe_u32 s17, s2, s6
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_or_b32 s12, s12, s13
; GFX10-NEXT: s_bfe_u32 s6, s3, s6
; GFX10-NEXT: s_lshr_b32 s9, s1, 24
; GFX10-NEXT: s_lshr_b32 s10, s2, 24
; GFX10-NEXT: s_lshr_b32 s11, s3, 24
; GFX10-NEXT: s_and_b32 s14, s1, s5
; GFX10-NEXT: s_bfe_u32 s1, s1, s7
; GFX10-NEXT: s_and_b32 s16, s2, s5
; GFX10-NEXT: s_lshl_b32 s8, s8, 24
; GFX10-NEXT: s_lshl_b32 s15, s15, 8
; GFX10-NEXT: s_lshl_b32 s17, s17, 8
; GFX10-NEXT: s_or_b32 s0, s12, s0
; GFX10-NEXT: s_bfe_u32 s2, s2, s7
; GFX10-NEXT: s_and_b32 s5, s3, s5
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_bfe_u32 s3, s3, s7
; GFX10-NEXT: s_bfe_u32 s10, s0, 0x80008
; GFX10-NEXT: s_bfe_u32 s12, s1, 0x80008
; GFX10-NEXT: s_lshr_b32 s6, s1, 24
; GFX10-NEXT: s_and_b32 s9, s0, 0xff
; GFX10-NEXT: s_and_b32 s11, s1, 0xff
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX10-NEXT: s_lshl_b32 s10, s10, 8
; GFX10-NEXT: s_lshl_b32 s12, s12, 8
; GFX10-NEXT: s_lshr_b32 s5, s0, 24
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: s_or_b32 s13, s14, s15
; GFX10-NEXT: s_or_b32 s0, s0, s8
; GFX10-NEXT: s_or_b32 s8, s16, s17
; GFX10-NEXT: s_or_b32 s9, s9, s10
; GFX10-NEXT: s_or_b32 s10, s11, s12
; GFX10-NEXT: s_bfe_u32 s14, s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_lshl_b32 s6, s6, 24
; GFX10-NEXT: s_or_b32 s1, s10, s1
; GFX10-NEXT: s_lshr_b32 s7, s2, 24
; GFX10-NEXT: s_and_b32 s13, s2, 0xff
; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX10-NEXT: s_lshl_b32 s5, s5, 24
; GFX10-NEXT: s_lshl_b32 s14, s14, 8
; GFX10-NEXT: s_or_b32 s0, s9, s0
; GFX10-NEXT: s_or_b32 s1, s1, s6
; GFX10-NEXT: s_bfe_u32 s6, s3, 0x80008
; GFX10-NEXT: s_lshr_b32 s8, s3, 24
; GFX10-NEXT: s_lshl_b32 s2, s2, 16
; GFX10-NEXT: s_or_b32 s5, s5, s6
; GFX10-NEXT: s_or_b32 s11, s13, s14
; GFX10-NEXT: s_or_b32 s0, s0, s5
; GFX10-NEXT: s_lshl_b32 s5, s7, 24
; GFX10-NEXT: s_and_b32 s7, s3, 0xff
; GFX10-NEXT: s_lshl_b32 s6, s6, 8
; GFX10-NEXT: s_bfe_u32 s3, s3, 0x80010
; GFX10-NEXT: s_or_b32 s2, s11, s2
; GFX10-NEXT: s_or_b32 s6, s7, s6
; GFX10-NEXT: s_lshl_b32 s3, s3, 16
; GFX10-NEXT: s_lshl_b32 s9, s9, 24
; GFX10-NEXT: s_or_b32 s1, s13, s1
; GFX10-NEXT: s_or_b32 s2, s8, s2
; GFX10-NEXT: s_lshl_b32 s8, s10, 24
; GFX10-NEXT: s_or_b32 s3, s5, s3
; GFX10-NEXT: s_lshl_b32 s5, s11, 24
; GFX10-NEXT: s_or_b32 s2, s2, s5
; GFX10-NEXT: s_or_b32 s3, s6, s3
; GFX10-NEXT: s_lshl_b32 s5, s8, 24
; GFX10-NEXT: s_lshr_b32 s6, s4, 2
; GFX10-NEXT: s_or_b32 s1, s1, s9
; GFX10-NEXT: s_or_b32 s2, s2, s8
; GFX10-NEXT: s_or_b32 s3, s3, s5
; GFX10-NEXT: s_cmp_eq_u32 s6, 1
; GFX10-NEXT: s_cselect_b32 s0, s1, s0
@ -2371,37 +2351,35 @@ define amdgpu_ps i8 @extractelement_vgpr_v16i8_sgpr_idx(<16 x i8> addrspace(1)*
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: v_mov_b32_e32 v5, 8
; GFX10-NEXT: v_mov_b32_e32 v4, 8
; GFX10-NEXT: s_mov_b32 s1, 16
; GFX10-NEXT: s_movk_i32 s3, 0xff
; GFX10-NEXT: v_mov_b32_e32 v4, 0xff
; GFX10-NEXT: v_mov_b32_e32 v6, 16
; GFX10-NEXT: v_mov_b32_e32 v5, 16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v0, v0, s3, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v11, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v2
; GFX10-NEXT: v_lshlrev_b32_sdwa v10, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v6
; GFX10-NEXT: v_and_or_b32 v1, v1, 0xff, v11
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX10-NEXT: v_and_or_b32 v1, v1, s3, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: s_lshr_b32 s0, s2, 2
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v11, v7
; GFX10-NEXT: v_or3_b32 v1, v1, v13, v8
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v2, 0xff, v2, v13
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v10, v6
; GFX10-NEXT: v_or3_b32 v1, v1, v12, v7
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1
; GFX10-NEXT: v_or3_b32 v2, v2, v15, v9
; GFX10-NEXT: v_and_or_b32 v4, v3, v4, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v10
; GFX10-NEXT: v_or3_b32 v2, v2, v14, v8
; GFX10-NEXT: v_and_or_b32 v4, 0xff, v3, v4
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v9
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 2
; GFX10-NEXT: v_or3_b32 v1, v4, v3, v5
@ -2583,43 +2561,41 @@ define i8 @extractelement_vgpr_v16i8_vgpr_idx(<16 x i8> addrspace(1)* %ptr, i32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v1, 8
; GFX10-NEXT: v_mov_b32_e32 v0, 8
; GFX10-NEXT: s_mov_b32 s5, 16
; GFX10-NEXT: s_movk_i32 s6, 0xff
; GFX10-NEXT: v_mov_b32_e32 v0, 0xff
; GFX10-NEXT: v_mov_b32_e32 v7, 16
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 2, v2
; GFX10-NEXT: v_mov_b32_e32 v1, 16
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 2, v2
; GFX10-NEXT: v_and_b32_e32 v2, 3, v2
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v4
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v16, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v3, v3, s6, v13
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 24, v4
; GFX10-NEXT: v_lshlrev_b32_sdwa v12, s4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshlrev_b32_sdwa v14, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 24, v5
; GFX10-NEXT: v_lshlrev_b32_sdwa v13, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v15, s5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v16, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v3, v3, 0xff, v12
; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8
; GFX10-NEXT: v_and_or_b32 v4, v4, 0xff, v14
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 24, v9
; GFX10-NEXT: v_and_or_b32 v4, v4, s6, v15
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 24, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v17, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v5, 0xff, v5, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 24, v10
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v6
; GFX10-NEXT: v_lshlrev_b32_sdwa v18, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-NEXT: v_and_or_b32 v5, v5, v0, v17
; GFX10-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX10-NEXT: v_or3_b32 v3, v3, v14, v9
; GFX10-NEXT: v_or3_b32 v4, v4, v16, v10
; GFX10-NEXT: v_lshlrev_b32_sdwa v7, v7, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v6, v0, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v12
; GFX10-NEXT: v_or3_b32 v5, v5, v18, v11
; GFX10-NEXT: v_or3_b32 v3, v3, v13, v8
; GFX10-NEXT: v_or3_b32 v4, v4, v15, v9
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, 0xff, v6, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 24, v11
; GFX10-NEXT: v_or3_b32 v5, v5, v17, v10
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v8
; GFX10-NEXT: v_or3_b32 v0, v0, v7, v1
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v7
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v8
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v2
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
@ -2633,47 +2609,46 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
; GCN-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GCN-NEXT: s_mov_b32 s10, 0x80008
; GCN-NEXT: s_movk_i32 s8, 0xff
; GCN-NEXT: v_lshrrev_b32_e32 v1, 2, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bfe_u32 s11, s0, s10
; GCN-NEXT: s_and_b32 s9, s0, s8
; GCN-NEXT: s_lshl_b32 s11, s11, 8
; GCN-NEXT: s_or_b32 s9, s9, s11
; GCN-NEXT: s_mov_b32 s11, 0x80010
; GCN-NEXT: s_bfe_u32 s9, s0, 0x80008
; GCN-NEXT: s_lshr_b32 s4, s0, 24
; GCN-NEXT: s_bfe_u32 s0, s0, s11
; GCN-NEXT: s_and_b32 s8, s0, 0xff
; GCN-NEXT: s_lshl_b32 s9, s9, 8
; GCN-NEXT: s_bfe_u32 s0, s0, 0x80010
; GCN-NEXT: s_or_b32 s8, s8, s9
; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: s_or_b32 s0, s9, s0
; GCN-NEXT: s_or_b32 s0, s8, s0
; GCN-NEXT: s_lshl_b32 s4, s4, 24
; GCN-NEXT: s_bfe_u32 s9, s1, s10
; GCN-NEXT: s_bfe_u32 s8, s1, 0x80008
; GCN-NEXT: s_lshr_b32 s5, s1, 24
; GCN-NEXT: s_or_b32 s0, s0, s4
; GCN-NEXT: s_and_b32 s4, s1, s8
; GCN-NEXT: s_lshl_b32 s9, s9, 8
; GCN-NEXT: s_bfe_u32 s1, s1, s11
; GCN-NEXT: s_or_b32 s4, s4, s9
; GCN-NEXT: s_and_b32 s4, s1, 0xff
; GCN-NEXT: s_lshl_b32 s8, s8, 8
; GCN-NEXT: s_bfe_u32 s1, s1, 0x80010
; GCN-NEXT: s_or_b32 s4, s4, s8
; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_or_b32 s1, s4, s1
; GCN-NEXT: s_lshl_b32 s4, s5, 24
; GCN-NEXT: s_bfe_u32 s5, s2, s10
; GCN-NEXT: s_bfe_u32 s5, s2, 0x80008
; GCN-NEXT: s_lshr_b32 s6, s2, 24
; GCN-NEXT: s_or_b32 s1, s1, s4
; GCN-NEXT: s_and_b32 s4, s2, s8
; GCN-NEXT: s_and_b32 s4, s2, 0xff
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_bfe_u32 s2, s2, s11
; GCN-NEXT: s_bfe_u32 s2, s2, 0x80010
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_lshl_b32 s2, s2, 16
; GCN-NEXT: s_or_b32 s2, s4, s2
; GCN-NEXT: s_lshl_b32 s4, s6, 24
; GCN-NEXT: s_bfe_u32 s5, s3, s10
; GCN-NEXT: s_bfe_u32 s5, s3, 0x80008
; GCN-NEXT: s_lshr_b32 s7, s3, 24
; GCN-NEXT: s_or_b32 s2, s2, s4
; GCN-NEXT: s_and_b32 s4, s3, s8
; GCN-NEXT: s_and_b32 s4, s3, 0xff
; GCN-NEXT: s_lshl_b32 s5, s5, 8
; GCN-NEXT: s_bfe_u32 s3, s3, s11
; GCN-NEXT: s_bfe_u32 s3, s3, 0x80010
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_lshl_b32 s3, s3, 16
; GCN-NEXT: s_or_b32 s3, s4, s3
@ -2687,9 +2662,7 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, v0, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
@ -2697,58 +2670,55 @@ define amdgpu_ps i8 @extractelement_sgpr_v16i8_vgpr_idx(<16 x i8> addrspace(4)*
; GFX10-LABEL: extractelement_sgpr_v16i8_vgpr_idx:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
; GFX10-NEXT: s_mov_b32 s5, 0x80008
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: s_mov_b32 s6, 0x80010
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 3, v0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_bfe_u32 s12, s0, s5
; GFX10-NEXT: s_bfe_u32 s14, s1, s5
; GFX10-NEXT: s_lshr_b32 s8, s1, 24
; GFX10-NEXT: s_and_b32 s11, s0, s4
; GFX10-NEXT: s_and_b32 s13, s1, s4
; GFX10-NEXT: s_bfe_u32 s1, s1, s6
; GFX10-NEXT: s_lshl_b32 s12, s12, 8
; GFX10-NEXT: s_lshl_b32 s14, s14, 8
; GFX10-NEXT: s_bfe_u32 s9, s0, 0x80008
; GFX10-NEXT: s_bfe_u32 s11, s1, 0x80008
; GFX10-NEXT: s_lshr_b32 s5, s1, 24
; GFX10-NEXT: s_and_b32 s8, s0, 0xff
; GFX10-NEXT: s_and_b32 s10, s1, 0xff
; GFX10-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX10-NEXT: s_lshl_b32 s9, s9, 8
; GFX10-NEXT: s_lshl_b32 s11, s11, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: s_or_b32 s11, s11, s12
; GFX10-NEXT: s_or_b32 s12, s13, s14
; GFX10-NEXT: s_lshl_b32 s8, s8, 24
; GFX10-NEXT: s_or_b32 s1, s12, s1
; GFX10-NEXT: s_lshr_b32 s7, s0, 24
; GFX10-NEXT: s_bfe_u32 s0, s0, s6
; GFX10-NEXT: s_or_b32 s1, s1, s8
; GFX10-NEXT: s_or_b32 s8, s8, s9
; GFX10-NEXT: s_or_b32 s9, s10, s11
; GFX10-NEXT: s_lshl_b32 s5, s5, 24
; GFX10-NEXT: s_or_b32 s1, s9, s1
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
; GFX10-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX10-NEXT: s_or_b32 s1, s1, s5
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_bfe_u32 s16, s2, s5
; GFX10-NEXT: s_bfe_u32 s13, s2, 0x80008
; GFX10-NEXT: v_mov_b32_e32 v2, s1
; GFX10-NEXT: s_lshl_b32 s7, s7, 24
; GFX10-NEXT: s_or_b32 s0, s11, s0
; GFX10-NEXT: s_lshr_b32 s9, s2, 24
; GFX10-NEXT: s_and_b32 s15, s2, s4
; GFX10-NEXT: s_lshl_b32 s16, s16, 8
; GFX10-NEXT: s_bfe_u32 s2, s2, s6
; GFX10-NEXT: s_or_b32 s0, s0, s7
; GFX10-NEXT: s_or_b32 s7, s15, s16
; GFX10-NEXT: s_lshl_b32 s4, s4, 24
; GFX10-NEXT: s_or_b32 s0, s8, s0
; GFX10-NEXT: s_lshr_b32 s6, s2, 24
; GFX10-NEXT: s_and_b32 s12, s2, 0xff
; GFX10-NEXT: s_bfe_u32 s2, s2, 0x80010
; GFX10-NEXT: s_lshl_b32 s13, s13, 8
; GFX10-NEXT: s_or_b32 s0, s0, s4
; GFX10-NEXT: s_lshl_b32 s2, s2, 16
; GFX10-NEXT: s_bfe_u32 s5, s3, s5
; GFX10-NEXT: s_or_b32 s10, s12, s13
; GFX10-NEXT: s_bfe_u32 s5, s3, 0x80008
; GFX10-NEXT: v_cndmask_b32_e32 v2, s0, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1
; GFX10-NEXT: s_or_b32 s2, s7, s2
; GFX10-NEXT: s_lshl_b32 s7, s9, 24
; GFX10-NEXT: s_and_b32 s4, s3, s4
; GFX10-NEXT: s_or_b32 s2, s10, s2
; GFX10-NEXT: s_lshl_b32 s4, s6, 24
; GFX10-NEXT: s_and_b32 s6, s3, 0xff
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_bfe_u32 s1, s3, s6
; GFX10-NEXT: s_or_b32 s2, s2, s7
; GFX10-NEXT: s_lshr_b32 s10, s3, 24
; GFX10-NEXT: s_or_b32 s3, s4, s5
; GFX10-NEXT: s_bfe_u32 s1, s3, 0x80010
; GFX10-NEXT: s_or_b32 s2, s2, s4
; GFX10-NEXT: s_lshr_b32 s7, s3, 24
; GFX10-NEXT: s_or_b32 s3, s6, s5
; GFX10-NEXT: s_lshl_b32 s1, s1, 16
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1
; GFX10-NEXT: s_or_b32 s0, s3, s1
; GFX10-NEXT: s_lshl_b32 s1, s10, 24
; GFX10-NEXT: s_lshl_b32 s1, s7, 24
; GFX10-NEXT: s_or_b32 s3, s0, s1
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, s3, vcc_lo
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v0, v1

View File

@ -901,19 +901,17 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000
; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2|
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4
; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3|
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> %a, %b, !fpmath !0
@ -1334,19 +1332,17 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-IEEE-NEXT: s_mov_b32 s4, 0x6f800000
; GFX10-IEEE-NEXT: s_mov_b32 s5, 0x2f800000
; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s6, |v0|, s4
; GFX10-IEEE-NEXT: v_cmp_gt_f32_e64 s4, |v1|, s4
; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, s5, s6
; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, s5, s4
; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v0|
; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s4
; GFX10-IEEE-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v1|
; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-IEEE-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x2f800000, s4
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0
; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -1492,19 +1488,17 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-FLUSH-NEXT: s_mov_b32 s4, 0x6f800000
; GFX10-FLUSH-NEXT: s_mov_b32 s5, 0x2f800000
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s6, |v2|, s4
; GFX10-FLUSH-NEXT: v_cmp_gt_f32_e64 s4, |v3|, s4
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, s5, s6
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, s5, s4
; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v2|
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x2f800000, s4
; GFX10-FLUSH-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |v3|
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x2f800000, s4
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v5, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0

View File

@ -96,12 +96,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 4
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -156,12 +155,11 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, s32
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -327,14 +325,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x104
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -396,16 +393,16 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -548,14 +545,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x4004
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -619,16 +615,16 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v2, v0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)

View File

@ -325,10 +325,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float add
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s2, 0x80000000
; GFX10-NEXT: v_sub_f32_e32 v1, s2, v1
; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX10-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
; GFX10-NEXT: v_med3_f32 v1, v1, |v2|, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@ -447,11 +445,9 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float add
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dword v3, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 s2, 0x80000000
; GFX10-NEXT: v_sub_f32_e64 v1, s2, |v1|
; GFX10-NEXT: v_sub_f32_e64 v2, s2, |v2|
; GFX10-NEXT: v_sub_f32_e64 v3, s2, |v3|
; GFX10-NEXT: v_sub_f32_e64 v1, 0x80000000, |v1|
; GFX10-NEXT: v_sub_f32_e64 v2, 0x80000000, |v2|
; GFX10-NEXT: v_sub_f32_e64 v3, 0x80000000, |v3|
; GFX10-NEXT: v_med3_f32 v1, v1, v2, v3
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm

View File

@ -473,9 +473,8 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0x80008000
; GFX10-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX10-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX10-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX10-NEXT: v_log_f16_e32 v2, v0
; GFX10-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1261,10 +1261,9 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(float addrspace(1)*
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, s0
; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float undef, float 8.0, i1 false)

View File

@ -505,15 +505,14 @@ define amdgpu_ps float @atomic_add_i32_3d(<8 x i32> inreg %rsrc, i32 %data, i16
;
; GFX10-LABEL: atomic_add_i32_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -549,15 +548,14 @@ define amdgpu_ps float @atomic_add_i32_cube(<8 x i32> inreg %rsrc, i32 %data, i1
;
; GFX10-LABEL: atomic_add_i32_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -632,15 +630,14 @@ define amdgpu_ps float @atomic_add_i32_2darray(<8 x i32> inreg %rsrc, i32 %data,
;
; GFX10-LABEL: atomic_add_i32_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -676,15 +673,14 @@ define amdgpu_ps float @atomic_add_i32_2dmsaa(<8 x i32> inreg %rsrc, i32 %data,
;
; GFX10-LABEL: atomic_add_i32_2dmsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, s8
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -720,14 +716,13 @@ define amdgpu_ps float @atomic_add_i32_2darraymsaa(<8 x i32> inreg %rsrc, i32 %d
;
; GFX10-LABEL: atomic_add_i32_2darraymsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v4
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
@ -1279,15 +1274,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_3d(<8 x i32> inreg %rsrc, i64 %data
;
; GFX10-LABEL: atomic_add_i64_3d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -1323,15 +1317,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_cube(<8 x i32> inreg %rsrc, i64 %da
;
; GFX10-LABEL: atomic_add_i64_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -1406,15 +1399,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darray(<8 x i32> inreg %rsrc, i64
;
; GFX10-LABEL: atomic_add_i64_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -1450,15 +1442,14 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2dmsaa(<8 x i32> inreg %rsrc, i64 %
;
; GFX10-LABEL: atomic_add_i64_2dmsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v5, s8
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -1494,14 +1485,13 @@ define amdgpu_ps <2 x float> @atomic_add_i64_2darraymsaa(<8 x i32> inreg %rsrc,
;
; GFX10-LABEL: atomic_add_i64_2darraymsaa:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v5
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v4, v5
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7

View File

@ -86,7 +86,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -100,8 +99,8 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -143,7 +142,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -157,8 +155,8 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -252,7 +250,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -266,8 +263,8 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_cl v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -309,7 +306,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -323,8 +319,8 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -366,7 +362,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -380,8 +375,8 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v3, v2
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -423,7 +418,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -437,8 +431,8 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v4, v3
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -481,7 +475,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -495,9 +488,9 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_b_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -540,7 +533,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10NSA-NEXT: s_mov_b32 s14, exec_lo
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10NSA-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10NSA-NEXT: s_mov_b32 s2, s4
; GFX10NSA-NEXT: s_mov_b32 s4, s6
@ -554,9 +546,9 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10NSA-NEXT: s_mov_b32 s7, s9
; GFX10NSA-NEXT: s_mov_b32 s9, s11
; GFX10NSA-NEXT: s_mov_b32 s11, s13
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v5, s12
; GFX10NSA-NEXT: v_and_or_b32 v2, v2, v5, v3
; GFX10NSA-NEXT: v_and_or_b32 v3, v4, v5, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10NSA-NEXT: v_and_or_b32 v3, 0xffff, v4, s12
; GFX10NSA-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10NSA-NEXT: image_gather4_c_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10NSA-NEXT: s_waitcnt vmcnt(0)
@ -592,7 +584,6 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10NSA-LABEL: gather4_l_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_mov_b32 s2, s4
@ -601,8 +592,8 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10NSA-NEXT: s_mov_b32 s8, s10
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
; GFX10NSA-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, v2, v3, s12
; GFX10NSA-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v2, s12
; GFX10NSA-NEXT: s_mov_b32 s1, s3
; GFX10NSA-NEXT: s_mov_b32 s3, s5
; GFX10NSA-NEXT: s_mov_b32 s5, s7
@ -643,7 +634,6 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10NSA-LABEL: gather4_c_l_2d:
; GFX10NSA: ; %bb.0: ; %main_body
; GFX10NSA-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10NSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10NSA-NEXT: s_mov_b32 s0, s2
; GFX10NSA-NEXT: s_mov_b32 s2, s4
@ -652,8 +642,8 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10NSA-NEXT: s_mov_b32 s8, s10
; GFX10NSA-NEXT: s_mov_b32 s10, s12
; GFX10NSA-NEXT: s_lshl_b32 s12, s0, 16
; GFX10NSA-NEXT: v_and_or_b32 v1, v1, v4, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, v3, v4, s12
; GFX10NSA-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10NSA-NEXT: v_and_or_b32 v2, 0xffff, v3, s12
; GFX10NSA-NEXT: s_mov_b32 s1, s3
; GFX10NSA-NEXT: s_mov_b32 s3, s5
; GFX10NSA-NEXT: s_mov_b32 s5, s7

View File

@ -564,15 +564,14 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
; GFX10-NEXT: s_mov_b32 s5, s7
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_mov_b32 s7, s9
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: image_load v[0:1], v0, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm d16
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_lshl_b32 s0, s0, 16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v2
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: ; return to shader part epilog
%v = call <3 x half> @llvm.amdgcn.image.load.1d.v3f16.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
ret <3 x half> %v

View File

@ -24,14 +24,13 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc,
;
; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v4, v3
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s5, s7
@ -79,17 +78,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe(<8 x i32> inreg %r
; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v8, v5
; GFX10-NEXT: v_mov_b32_e32 v9, v5
; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1
; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6
@ -148,17 +146,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw_tfe_lwe(<8 x i32> inre
; GFX10-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: v_mov_b32_e32 v6, v5
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v8, v5
; GFX10-NEXT: v_mov_b32_e32 v9, v5
; GFX10-NEXT: v_and_or_b32 v10, v0, v4, v1
; GFX10-NEXT: v_and_or_b32 v11, v2, v4, v3
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, v3
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s4, s6

View File

@ -24,15 +24,14 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s,
;
; GFX10-LABEL: load_3d_v4f32_xyzw:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
; GFX10-NEXT: s_mov_b32 s4, s6
; GFX10-NEXT: s_mov_b32 s6, s8
; GFX10-NEXT: s_lshl_b32 s8, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v3, s8
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v3
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -79,7 +78,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
@ -90,8 +88,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe(<8 x i32> inreg %rsrc, i32
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v8, v5
; GFX10-NEXT: v_mov_b32_e32 v9, v5
; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1
; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8
; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7
@ -148,7 +146,6 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-LABEL: load_3d_v4f32_xyzw_tfe_lwe:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: s_mov_b32 s0, s2
; GFX10-NEXT: s_mov_b32 s2, s4
@ -159,8 +156,8 @@ define amdgpu_ps <4 x float> @load_3d_v4f32_xyzw_tfe_lwe(<8 x i32> inreg %rsrc,
; GFX10-NEXT: v_mov_b32_e32 v7, v5
; GFX10-NEXT: v_mov_b32_e32 v8, v5
; GFX10-NEXT: v_mov_b32_e32 v9, v5
; GFX10-NEXT: v_and_or_b32 v10, v0, v3, v1
; GFX10-NEXT: v_and_or_b32 v11, v2, v3, s8
; GFX10-NEXT: v_and_or_b32 v10, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v11, 0xffff, v2, s8
; GFX10-NEXT: s_mov_b32 s1, s3
; GFX10-NEXT: s_mov_b32 s3, s5
; GFX10-NEXT: s_mov_b32 s5, s7

View File

@ -4,10 +4,9 @@
define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_d_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -19,11 +18,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -37,14 +35,13 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: v_mov_b32_e32 v11, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v3, v9, v11, s12
; GFX10-NEXT: v_and_or_b32 v2, v0, v11, v1
; GFX10-NEXT: v_and_or_b32 v4, v10, v11, v4
; GFX10-NEXT: v_and_or_b32 v5, v5, v11, s12
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v9, s12
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v4
; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v5, s12
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -56,10 +53,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_c_d_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -71,11 +67,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -87,10 +82,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -102,11 +96,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -118,10 +111,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_d_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -136,11 +128,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4
; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -152,10 +143,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -167,11 +157,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -183,10 +172,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s) {
; GFX10-LABEL: sample_c_cd_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -198,11 +186,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -214,10 +201,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -229,11 +215,10 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -245,10 +230,9 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_cl_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dsdv, float %s, float %clamp) {
; GFX10-LABEL: sample_c_cd_cl_1d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10-NEXT: s_lshl_b32 s12, s0, 16
; GFX10-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -263,11 +247,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GFX10-NEXT: v_and_or_b32 v4, v9, v0, v4
; GFX10-NEXT: v_and_or_b32 v3, v1, v0, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -284,11 +267,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v11, v4
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1
; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -305,11 +287,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-NEXT: v_mov_b32_e32 v11, v4
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v4, v10, v0, v1
; GFX10-NEXT: v_and_or_b32 v5, v11, v0, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX10-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
; GFX10-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog

View File

@ -49,15 +49,14 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; GCN-NEXT: v_and_b32_e32 v10, s4, v7
; GCN-NEXT: v_and_b32_e32 v8, s4, v8
; GCN-NEXT: v_and_b32_e32 v10, 0xffff, v7
; GCN-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GCN-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GCN-NEXT: v_and_or_b32 v5, v5, s4, v9
; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10
; GCN-NEXT: v_and_or_b32 v5, v5, 0xffff, v9
; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
@ -101,15 +100,14 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN: ; %bb.0:
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GCN-NEXT: v_and_b32_e32 v11, s4, v8
; GCN-NEXT: v_and_b32_e32 v9, s4, v9
; GCN-NEXT: v_and_b32_e32 v11, 0xffff, v8
; GCN-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT: v_alignbit_b32 v8, v9, v8, 16
; GCN-NEXT: v_and_or_b32 v6, v6, s4, v10
; GCN-NEXT: v_and_or_b32 v7, v7, s4, v11
; GCN-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
; GCN-NEXT: v_and_or_b32 v7, v7, 0xffff, v11
; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: ; return to shader part epilog
@ -202,21 +200,20 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr,
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_mov_b32 s0, 0xffff
; GFX1030-NEXT: v_mov_b32_e32 v13, v0
; GFX1030-NEXT: v_mov_b32_e32 v14, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX1030-NEXT: v_and_b32_e32 v1, s0, v7
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v7
; GFX1030-NEXT: v_mov_b32_e32 v15, v2
; GFX1030-NEXT: v_and_b32_e32 v2, s0, v8
; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v8
; GFX1030-NEXT: v_mov_b32_e32 v16, v3
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT: v_mov_b32_e32 v17, v4
; GFX1030-NEXT: v_alignbit_b32 v20, v2, v7, 16
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: v_and_or_b32 v18, v5, s0, v0
; GFX1030-NEXT: v_and_or_b32 v19, v6, s0, v1
; GFX1030-NEXT: v_and_or_b32 v18, v5, 0xffff, v0
; GFX1030-NEXT: v_and_or_b32 v19, v6, 0xffff, v1
; GFX1030-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v9
; GFX1030-NEXT: v_readfirstlane_b32 s5, v10
@ -246,16 +243,15 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_mov_b32 s0, 0xffff
; GFX1013-NEXT: v_lshrrev_b32_e32 v13, 16, v5
; GFX1013-NEXT: v_and_b32_e32 v14, s0, v7
; GFX1013-NEXT: v_and_b32_e32 v8, s0, v8
; GFX1013-NEXT: v_and_b32_e32 v14, 0xffff, v7
; GFX1013-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; GFX1013-NEXT: v_alignbit_b32 v7, v8, v7, 16
; GFX1013-NEXT: v_and_or_b32 v5, v5, s0, v13
; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v14
; GFX1013-NEXT: v_and_or_b32 v5, v5, 0xffff, v13
; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v14
; GFX1013-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v9
; GFX1013-NEXT: v_readfirstlane_b32 s5, v10
@ -371,21 +367,20 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <3 x float> %ray_origin, <3 x half> %ray_dir, <3 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1030: ; %bb.0:
; GFX1030-NEXT: s_mov_b32 s0, 0xffff
; GFX1030-NEXT: v_mov_b32_e32 v14, v0
; GFX1030-NEXT: v_mov_b32_e32 v15, v1
; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6
; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8
; GFX1030-NEXT: v_and_b32_e32 v1, 0xffff, v8
; GFX1030-NEXT: v_mov_b32_e32 v16, v2
; GFX1030-NEXT: v_and_b32_e32 v2, s0, v9
; GFX1030-NEXT: v_and_b32_e32 v2, 0xffff, v9
; GFX1030-NEXT: v_mov_b32_e32 v17, v3
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT: v_mov_b32_e32 v18, v4
; GFX1030-NEXT: v_mov_b32_e32 v19, v5
; GFX1030-NEXT: v_alignbit_b32 v22, v2, v8, 16
; GFX1030-NEXT: v_and_or_b32 v20, v6, s0, v0
; GFX1030-NEXT: v_and_or_b32 v21, v7, s0, v1
; GFX1030-NEXT: v_and_or_b32 v20, v6, 0xffff, v0
; GFX1030-NEXT: v_and_or_b32 v21, v7, 0xffff, v1
; GFX1030-NEXT: s_mov_b32 s1, exec_lo
; GFX1030-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT: v_readfirstlane_b32 s4, v10
@ -417,20 +412,19 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1013: ; %bb.0:
; GFX1013-NEXT: s_mov_b32 s0, 0xffff
; GFX1013-NEXT: v_mov_b32_e32 v16, v10
; GFX1013-NEXT: v_mov_b32_e32 v17, v11
; GFX1013-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; GFX1013-NEXT: v_and_b32_e32 v11, s0, v8
; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9
; GFX1013-NEXT: v_and_b32_e32 v11, 0xffff, v8
; GFX1013-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX1013-NEXT: v_mov_b32_e32 v18, v12
; GFX1013-NEXT: v_mov_b32_e32 v19, v13
; GFX1013-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX1013-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX1013-NEXT: v_alignbit_b32 v8, v9, v8, 16
; GFX1013-NEXT: s_mov_b32 s1, exec_lo
; GFX1013-NEXT: v_and_or_b32 v6, v6, s0, v10
; GFX1013-NEXT: v_and_or_b32 v7, v7, s0, v11
; GFX1013-NEXT: v_and_or_b32 v6, v6, 0xffff, v10
; GFX1013-NEXT: v_and_or_b32 v7, v7, 0xffff, v11
; GFX1013-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT: v_readfirstlane_b32 s4, v16
; GFX1013-NEXT: v_readfirstlane_b32 s5, v17

View File

@ -752,14 +752,13 @@ define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)*
; GFX6-LABEL: bfe_8_bfe_8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
; GFX6-NEXT: s_mov_b32 s4, 0x80000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_bfe_i32 s3, s3, s4
; GFX6-NEXT: s_bfe_i32 s3, s3, s4
; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX6-NEXT: s_bfe_i32 s3, s3, 0x80000
; GFX6-NEXT: v_mov_b32_e32 v0, s3
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0

View File

@ -66,17 +66,16 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: s_movk_i32 s5, 0xff
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1
; GFX10-NEXT: v_and_b32_e32 v1, s5, v2
; GFX10-NEXT: v_and_b32_e32 v2, s5, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_e32 v5, s5, v6
; GFX10-NEXT: v_and_b32_e32 v6, s5, v7
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2

View File

@ -66,17 +66,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: s_movk_i32 s5, 0xff
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_or_b32 v0, v0, s5, v1
; GFX10-NEXT: v_and_b32_e32 v1, s5, v2
; GFX10-NEXT: v_and_b32_e32 v2, s5, v3
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xff, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX10-NEXT: v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_and_b32_e32 v5, s5, v6
; GFX10-NEXT: v_and_b32_e32 v6, s5, v7
; GFX10-NEXT: v_and_b32_e32 v5, 0xff, v6
; GFX10-NEXT: v_and_b32_e32 v6, 0xff, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX10-NEXT: v_and_or_b32 v3, v4, s5, v3
; GFX10-NEXT: v_and_or_b32 v3, v4, 0xff, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX10-NEXT: v_or3_b32 v0, v0, v1, v2

View File

@ -115,9 +115,8 @@ define i24 @v_lshr_i24(i24 %value, i24 %amount) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = lshr i24 %value, %amount
@ -631,9 +630,8 @@ define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
; GFX6-LABEL: lshr_i16_sv:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s1, v0
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
@ -659,9 +657,8 @@ define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
; GFX6-LABEL: lshr_i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s1
; GFX6-NEXT: v_and_b32_e32 v0, s1, v0
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: ; return to shader part epilog
;
@ -757,9 +754,8 @@ define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s3
; GFX6-NEXT: s_lshr_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
@ -768,14 +764,13 @@ define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amou
;
; GFX8-LABEL: s_lshr_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s1
; GFX8-NEXT: s_lshr_b32 s1, s2, s4
; GFX8-NEXT: s_lshr_b32 s1, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -808,10 +803,10 @@ define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
@ -844,10 +839,10 @@ define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@ -944,13 +939,12 @@ define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s8
; GFX6-NEXT: s_and_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s5
; GFX6-NEXT: s_and_b32 s3, s3, s8
; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
; GFX6-NEXT: s_lshr_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s2, s2, s8
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s3, s7
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, s6
@ -961,36 +955,34 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX8-LABEL: s_lshr_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s6, 0xffff
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s8, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s2
; GFX8-NEXT: s_lshr_b32 s2, s4, s7
; GFX8-NEXT: s_lshr_b32 s2, s4, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, s3
; GFX8-NEXT: s_lshr_b32 s3, s5, s8
; GFX8-NEXT: s_lshr_b32 s3, s5, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: s_lshl_b32 s2, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s5, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_and_b32 s0, s0, s5
; GFX9-NEXT: s_lshr_b32 s6, s2, 16
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s2, 16
; GFX9-NEXT: s_lshr_b32 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s4, s6
; GFX9-NEXT: s_lshr_b32 s2, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_and_b32 s1, s1, s5
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_lshr_b32 s1, s1, s3
; GFX9-NEXT: s_lshr_b32 s2, s2, s4
@ -999,17 +991,16 @@ define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg
;
; GFX10-LABEL: s_lshr_v4i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s4, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, s4
; GFX10-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-NEXT: s_lshr_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s5, s6
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, s4
; GFX10-NEXT: s_lshr_b32 s4, s3, 16
; GFX10-NEXT: s_lshr_b32 s2, s4, s5
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshr_b32 s1, s1, s3
; GFX10-NEXT: s_lshr_b32 s3, s5, s4
; GFX10-NEXT: s_lshr_b32 s3, s4, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
@ -1124,21 +1115,20 @@ define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s16, 0xffff
; GFX6-NEXT: s_and_b32 s1, s1, s16
; GFX6-NEXT: s_and_b32 s0, s0, s16
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshr_b32 s1, s1, s9
; GFX6-NEXT: s_and_b32 s3, s3, s16
; GFX6-NEXT: s_and_b32 s3, s3, 0xffff
; GFX6-NEXT: s_lshr_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s2, s2, s16
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
; GFX6-NEXT: s_lshr_b32 s3, s3, s11
; GFX6-NEXT: s_and_b32 s5, s5, s16
; GFX6-NEXT: s_and_b32 s7, s7, s16
; GFX6-NEXT: s_and_b32 s5, s5, 0xffff
; GFX6-NEXT: s_and_b32 s7, s7, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshr_b32 s2, s2, s10
; GFX6-NEXT: s_and_b32 s4, s4, s16
; GFX6-NEXT: s_and_b32 s4, s4, 0xffff
; GFX6-NEXT: s_lshr_b32 s5, s5, s13
; GFX6-NEXT: s_and_b32 s6, s6, s16
; GFX6-NEXT: s_and_b32 s6, s6, 0xffff
; GFX6-NEXT: s_lshr_b32 s7, s7, s15
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s3, 16
@ -1153,64 +1143,62 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX8-LABEL: s_lshr_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s12, 0xffff
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s13, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_lshr_b32 s14, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_lshr_b32 s0, s0, s4
; GFX8-NEXT: s_lshr_b32 s4, s8, s13
; GFX8-NEXT: s_lshr_b32 s4, s8, s12
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_lshr_b32 s15, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_lshr_b32 s1, s1, s5
; GFX8-NEXT: s_lshr_b32 s5, s9, s14
; GFX8-NEXT: s_lshr_b32 s5, s9, s13
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_lshr_b32 s16, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_lshr_b32 s2, s2, s6
; GFX8-NEXT: s_lshr_b32 s6, s10, s15
; GFX8-NEXT: s_lshr_b32 s6, s10, s14
; GFX8-NEXT: s_or_b32 s0, s4, s0
; GFX8-NEXT: s_lshl_b32 s4, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s3, s7
; GFX8-NEXT: s_lshr_b32 s7, s11, s16
; GFX8-NEXT: s_lshr_b32 s7, s11, s15
; GFX8-NEXT: s_or_b32 s1, s4, s1
; GFX8-NEXT: s_lshl_b32 s4, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_lshl_b32 s4, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v8i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s9, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
; GFX9-NEXT: s_and_b32 s0, s0, s9
; GFX9-NEXT: s_lshr_b32 s10, s4, 16
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_lshr_b32 s9, s4, 16
; GFX9-NEXT: s_lshr_b32 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s8, s10
; GFX9-NEXT: s_lshr_b32 s4, s8, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_and_b32 s1, s1, s9
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
; GFX9-NEXT: s_lshr_b32 s1, s1, s5
; GFX9-NEXT: s_lshr_b32 s4, s4, s8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX9-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NEXT: s_and_b32 s2, s2, s9
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s6, 16
; GFX9-NEXT: s_lshr_b32 s2, s2, s6
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_and_b32 s3, s3, s9
; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-NEXT: s_lshr_b32 s5, s7, 16
; GFX9-NEXT: s_lshr_b32 s3, s3, s7
; GFX9-NEXT: s_lshr_b32 s4, s4, s5
@ -1219,26 +1207,25 @@ define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg
;
; GFX10-LABEL: s_lshr_v8i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s8, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, s8
; GFX10-NEXT: s_lshr_b32 s10, s4, 16
; GFX10-NEXT: s_lshr_b32 s8, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s4, 16
; GFX10-NEXT: s_lshr_b32 s0, s0, s4
; GFX10-NEXT: s_lshr_b32 s4, s9, s10
; GFX10-NEXT: s_lshr_b32 s9, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, s8
; GFX10-NEXT: s_lshr_b32 s10, s5, 16
; GFX10-NEXT: s_lshr_b32 s4, s8, s9
; GFX10-NEXT: s_lshr_b32 s8, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_lshr_b32 s9, s5, 16
; GFX10-NEXT: s_lshr_b32 s1, s1, s5
; GFX10-NEXT: s_lshr_b32 s5, s9, s10
; GFX10-NEXT: s_lshr_b32 s5, s8, s9
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_and_b32 s2, s2, s8
; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
; GFX10-NEXT: s_lshr_b32 s5, s6, 16
; GFX10-NEXT: s_lshr_b32 s2, s2, s6
; GFX10-NEXT: s_lshr_b32 s4, s4, s5
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_and_b32 s3, s3, s8
; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
; GFX10-NEXT: s_lshr_b32 s6, s7, 16
; GFX10-NEXT: s_lshr_b32 s3, s3, s7
; GFX10-NEXT: s_lshr_b32 s5, s5, s6

View File

@ -12,25 +12,22 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
;
; GFX8-LABEL: s_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
@ -78,29 +75,26 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: ; return to shader part epilog
%result = mul i16 %num, %den
ret i16 %result
@ -146,27 +140,24 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
;
; GFX8-LABEL: s_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s1, s1, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_sext_i32_i16 s0, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s2
; GFX9-NEXT: s_and_b32 s1, s1, s2
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: s_and_b32 s1, s1, 0xffff
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_sext_i32_i16 s0, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mov_b32 s2, 0xffff
; GFX10-NEXT: s_and_b32 s0, s0, s2
; GFX10-NEXT: s_and_b32 s1, s1, s2
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: s_mul_i32 s0, s0, s1
; GFX10-NEXT: s_sext_i32_i16 s0, s0
; GFX10-NEXT: ; return to shader part epilog

View File

@ -429,13 +429,12 @@ define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -458,13 +457,12 @@ define amdgpu_ps i32 @s_orn2_v2i16(<2 x i16> inreg %src0, <2 x i16> inreg %src1)
define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s1, s0
; GFX6-NEXT: ; return to shader part epilog
@ -487,13 +485,12 @@ define amdgpu_ps i32 @s_orn2_v2i16_commute(<2 x i16> inreg %src0, <2 x i16> inre
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1) {
; GFX6-LABEL: s_orn2_v2i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s1, s4, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
@ -522,19 +519,18 @@ define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_use(<2 x i16> inreg %src0, <2
define amdgpu_ps { i32, i32 } @s_orn2_v2i16_multi_foldable_use(<2 x i16> inreg %src0, <2 x i16> inreg %src1, <2 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v2i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s2, s2, s1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_lshl_b32 s2, s5, 16
; GFX6-NEXT: s_and_b32 s3, s4, s1
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s7, 16
; GFX6-NEXT: s_and_b32 s1, s6, s1
; GFX6-NEXT: s_or_b32 s1, s3, s1
; GFX6-NEXT: s_xor_b32 s1, s1, -1
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_or_b32 s1, s2, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_xor_b32 s2, s2, -1
; GFX6-NEXT: s_or_b32 s0, s0, s2
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_orn2_v2i16_multi_foldable_use:
@ -630,18 +626,17 @@ define amdgpu_ps i64 @s_orn2_v4i16(<4 x i16> inreg %src0, <4 x i16> inreg %src1)
; GFX6-LABEL: s_orn2_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -673,18 +668,17 @@ define amdgpu_ps i64 @s_orn2_v4i16_commute(<4 x i16> inreg %src0, <4 x i16> inre
; GFX6-LABEL: s_orn2_v4i16_commute:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -716,18 +710,17 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
; GFX6-LABEL: s_orn2_v4i16_multi_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_mov_b32 s3, 0xffff
; GFX6-NEXT: s_and_b32 s1, s2, s3
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s3
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s4, s6, s3
; GFX6-NEXT: s_or_b32 s2, s2, s4
; GFX6-NEXT: s_lshl_b32 s4, s9, 16
; GFX6-NEXT: s_and_b32 s3, s8, s3
; GFX6-NEXT: s_or_b32 s3, s4, s3
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_mov_b32 s4, -1
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
@ -766,24 +759,23 @@ define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_use(<4 x i16> inreg %src0, <4
define amdgpu_ps { i64, i64 } @s_orn2_v4i16_multi_foldable_use(<4 x i16> inreg %src0, <4 x i16> inreg %src1, <4 x i16> inreg %src2) {
; GFX6-LABEL: s_orn2_v4i16_multi_foldable_use:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s14, 0xffff
; GFX6-NEXT: s_lshl_b32 s0, s3, 16
; GFX6-NEXT: s_and_b32 s1, s2, s14
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_lshl_b32 s1, s5, 16
; GFX6-NEXT: s_and_b32 s2, s4, s14
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_lshl_b32 s2, s7, 16
; GFX6-NEXT: s_and_b32 s3, s6, s14
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_lshl_b32 s3, s9, 16
; GFX6-NEXT: s_and_b32 s4, s8, s14
; GFX6-NEXT: s_and_b32 s4, s8, 0xffff
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: s_lshl_b32 s4, s11, 16
; GFX6-NEXT: s_and_b32 s5, s10, s14
; GFX6-NEXT: s_and_b32 s5, s10, 0xffff
; GFX6-NEXT: s_or_b32 s4, s4, s5
; GFX6-NEXT: s_lshl_b32 s5, s13, 16
; GFX6-NEXT: s_and_b32 s6, s12, s14
; GFX6-NEXT: s_and_b32 s6, s12, 0xffff
; GFX6-NEXT: s_or_b32 s5, s5, s6
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, s6

View File

@ -401,13 +401,12 @@ define <4 x half> @v_roundeven_v4f16(<4 x half> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_rndne_f16_e32 v3, v0
; GFX10-NEXT: v_rndne_f16_e32 v2, v0
; GFX10-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_rndne_f16_e32 v4, v1
; GFX10-NEXT: v_rndne_f16_e32 v3, v1
; GFX10-NEXT: v_rndne_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v0, v3, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, v4, v2, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v2, v0
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v3, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%roundeven = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %x)
ret <4 x half> %roundeven
@ -610,8 +609,8 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_brev_b32 s6, 1
; GFX6-NEXT: s_mov_b32 s7, 0x43300000
; GFX6-NEXT: v_and_b32_e32 v5, s6, v1
; GFX6-NEXT: s_mov_b32 s7, 0x43300000
; GFX6-NEXT: v_mov_b32_e32 v4, 0
; GFX6-NEXT: v_or_b32_e32 v5, s7, v5
; GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5]

File diff suppressed because it is too large Load Diff

View File

@ -309,10 +309,10 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
@ -408,15 +408,15 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7]
; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1
; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5]
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
@ -450,10 +450,10 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
; CHECK-NEXT: v_mul_lo_u32 v3, v2, s6
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CHECK-NEXT: v_subrev_i32_e64 v3, s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CHECK-NEXT: v_cmp_le_u32_e64 s[4:5], s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s6, v0
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
@ -549,15 +549,15 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
; CGP-NEXT: v_subrev_i32_e64 v7, s[4:5], s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e64 v8, s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; CGP-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; CGP-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7]
; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[4:5]
; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s8, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc

View File

@ -1186,9 +1186,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
@ -1317,7 +1316,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc
@ -1890,9 +1889,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-LABEL: v_sdiv_v2i64_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
@ -2021,7 +2019,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v8
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v1, vcc

View File

@ -1157,121 +1157,120 @@ define amdgpu_kernel void @sdivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
; GFX10-LABEL: sdivrem_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_ashr_i32 s0, s12, 31
; GFX10-NEXT: s_ashr_i32 s2, s14, 31
; GFX10-NEXT: s_add_i32 s6, s12, s0
; GFX10-NEXT: s_add_i32 s12, s14, s2
; GFX10-NEXT: s_xor_b32 s14, s6, s0
; GFX10-NEXT: s_ashr_i32 s1, s13, 31
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX10-NEXT: s_ashr_i32 s2, s14, 31
; GFX10-NEXT: s_ashr_i32 s3, s15, 31
; GFX10-NEXT: s_add_i32 s6, s12, s0
; GFX10-NEXT: s_add_i32 s7, s13, s1
; GFX10-NEXT: s_add_i32 s12, s14, s2
; GFX10-NEXT: s_add_i32 s13, s15, s3
; GFX10-NEXT: s_xor_b32 s14, s6, s0
; GFX10-NEXT: s_xor_b32 s15, s7, s1
; GFX10-NEXT: s_xor_b32 s12, s12, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14
; GFX10-NEXT: s_xor_b32 s13, s13, s3
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_xor_b32 s13, s13, s3
; GFX10-NEXT: s_sub_i32 s6, 0, s14
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s13
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: s_sub_i32 s6, 0, s14
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX10-NEXT: s_sub_i32 s7, 0, s15
; GFX10-NEXT: s_sub_i32 s19, 0, s12
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX10-NEXT: s_ashr_i32 s16, s8, 31
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: s_ashr_i32 s18, s10, 31
; GFX10-NEXT: s_ashr_i32 s17, s9, 31
; GFX10-NEXT: s_ashr_i32 s18, s10, 31
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: s_xor_b32 s20, s16, s0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: s_xor_b32 s21, s17, s1
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_mul_lo_u32 v4, s6, v0
; GFX10-NEXT: s_sub_i32 s6, 0, s13
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX10-NEXT: v_mul_lo_u32 v5, s7, v1
; GFX10-NEXT: v_mul_lo_u32 v6, s19, v2
; GFX10-NEXT: s_ashr_i32 s19, s11, 31
; GFX10-NEXT: s_add_i32 s7, s9, s17
; GFX10-NEXT: v_mul_lo_u32 v7, s6, v3
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: s_ashr_i32 s19, s11, 31
; GFX10-NEXT: s_add_i32 s6, s8, s16
; GFX10-NEXT: s_add_i32 s7, s9, s17
; GFX10-NEXT: v_mul_hi_u32 v4, v0, v4
; GFX10-NEXT: s_add_i32 s8, s10, s18
; GFX10-NEXT: v_mul_hi_u32 v5, v1, v5
; GFX10-NEXT: v_mul_hi_u32 v6, v2, v6
; GFX10-NEXT: s_xor_b32 s10, s6, s16
; GFX10-NEXT: s_add_i32 s9, s11, s19
; GFX10-NEXT: v_mul_hi_u32 v7, v3, v7
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4
; GFX10-NEXT: s_add_i32 s9, s11, s19
; GFX10-NEXT: s_xor_b32 s10, s6, s16
; GFX10-NEXT: s_xor_b32 s11, s7, s17
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v4
; GFX10-NEXT: s_xor_b32 s8, s8, s18
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v5
; GFX10-NEXT: v_add_nc_u32_e32 v2, v2, v6
; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX10-NEXT: s_xor_b32 s9, s9, s19
; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v7
; GFX10-NEXT: s_xor_b32 s9, s9, s19
; GFX10-NEXT: v_mul_hi_u32 v0, s10, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
; GFX10-NEXT: v_mul_hi_u32 v2, s8, v2
; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3
; GFX10-NEXT: s_xor_b32 s22, s18, s2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_mul_hi_u32 v3, s9, v3
; GFX10-NEXT: v_mul_lo_u32 v4, v0, s14
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v1, s15
; GFX10-NEXT: v_mul_lo_u32 v6, v2, s12
; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2
; GFX10-NEXT: v_mul_lo_u32 v7, v3, s13
; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3
; GFX10-NEXT: v_sub_nc_u32_e32 v4, s10, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v5, s11, v5
; GFX10-NEXT: v_sub_nc_u32_e32 v6, s8, v6
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4
; GFX10-NEXT: v_sub_nc_u32_e32 v7, s9, v7
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0
; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2
; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4
; GFX10-NEXT: v_add_nc_u32_e32 v8, 1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2
; GFX10-NEXT: v_add_nc_u32_e32 v9, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v10, 1, v2
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v4
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v5
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo
; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s14, v4
; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v6
; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s2, s13, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s0
; GFX10-NEXT: v_subrev_nc_u32_e32 v9, s15, v5
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s1
; GFX10-NEXT: v_subrev_nc_u32_e32 v10, s12, v6
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_subrev_nc_u32_e32 v8, s13, v7
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s2
; GFX10-NEXT: v_subrev_nc_u32_e32 v11, s13, v7
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v9, s0
; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v10, s1
; GFX10-NEXT: s_xor_b32 s0, s19, s3
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v8, s2
; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v11, s2
; GFX10-NEXT: v_xor_b32_e32 v0, s20, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s21, v1
; GFX10-NEXT: v_xor_b32_e32 v2, s22, v2
@ -2817,7 +2816,6 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX8-LABEL: sdivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x10
; GFX8-NEXT: s_mov_b32 s10, 0x100010
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s0, s3
; GFX8-NEXT: s_ashr_i32 s8, s0, 31
@ -2825,41 +2823,41 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX8-NEXT: s_xor_b32 s9, s0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX8-NEXT: s_sub_i32 s6, 0, s9
; GFX8-NEXT: s_sext_i32_i16 s0, s2
; GFX8-NEXT: s_bfe_i32 s1, s3, s10
; GFX8-NEXT: s_bfe_i32 s1, s3, 0x100010
; GFX8-NEXT: s_ashr_i32 s10, s1, 31
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
; GFX8-NEXT: s_ashr_i32 s11, s1, 31
; GFX8-NEXT: s_add_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s10
; GFX8-NEXT: s_xor_b32 s11, s1, s10
; GFX8-NEXT: s_sext_i32_i16 s0, s2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: s_add_i32 s1, s1, s11
; GFX8-NEXT: s_xor_b32 s0, s0, s3
; GFX8-NEXT: s_xor_b32 s12, s1, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
; GFX8-NEXT: s_add_i32 s0, s0, s3
; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s12
; GFX8-NEXT: s_xor_b32 s0, s0, s3
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v2
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_mul_lo_u32 v2, v0, s9
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s9, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s9, v2
; GFX8-NEXT: s_sub_i32 s1, 0, s12
; GFX8-NEXT: s_sub_i32 s1, 0, s11
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1
; GFX8-NEXT: s_bfe_i32 s1, s2, s10
; GFX8-NEXT: s_bfe_i32 s1, s2, 0x100010
; GFX8-NEXT: s_ashr_i32 s2, s1, 31
; GFX8-NEXT: s_add_i32 s1, s1, s2
; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
@ -2870,19 +2868,19 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX8-NEXT: v_xor_b32_e32 v2, s3, v2
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mul_lo_u32 v3, v1, s12
; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s3, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s12, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s12, v3
; GFX8-NEXT: s_xor_b32 s0, s2, s11
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3
; GFX8-NEXT: s_xor_b32 s0, s2, s10
; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1
@ -2907,45 +2905,44 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX9-LABEL: sdivrem_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX9-NEXT: s_mov_b32 s10, 0x100010
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s0, s7
; GFX9-NEXT: s_ashr_i32 s8, s0, 31
; GFX9-NEXT: s_add_i32 s0, s0, s8
; GFX9-NEXT: s_xor_b32 s9, s0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX9-NEXT: s_bfe_i32 s1, s7, s10
; GFX9-NEXT: s_ashr_i32 s7, s1, 31
; GFX9-NEXT: s_add_i32 s1, s1, s7
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_bfe_i32 s5, s7, 0x100010
; GFX9-NEXT: s_ashr_i32 s7, s5, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_xor_b32 s11, s1, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11
; GFX9-NEXT: s_sub_i32 s1, 0, s9
; GFX9-NEXT: s_add_i32 s5, s5, s7
; GFX9-NEXT: s_xor_b32 s5, s5, s7
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: s_sub_i32 s10, 0, s9
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_sext_i32_i16 s0, s6
; GFX9-NEXT: s_ashr_i32 s12, s0, 31
; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0
; GFX9-NEXT: s_sext_i32_i16 s4, s6
; GFX9-NEXT: v_mul_lo_u32 v2, s10, v0
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: s_add_i32 s0, s0, s12
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: s_xor_b32 s13, s0, s12
; GFX9-NEXT: s_sub_i32 s0, 0, s11
; GFX9-NEXT: s_add_i32 s4, s4, s10
; GFX9-NEXT: s_xor_b32 s4, s4, s10
; GFX9-NEXT: s_sub_i32 s11, 0, s5
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_bfe_i32 s4, s6, s10
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX9-NEXT: v_mul_lo_u32 v2, s11, v1
; GFX9-NEXT: s_bfe_i32 s6, s6, 0x100010
; GFX9-NEXT: s_ashr_i32 s11, s6, 31
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s9
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX9-NEXT: s_ashr_i32 s5, s4, 31
; GFX9-NEXT: s_add_i32 s4, s4, s5
; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: s_add_i32 s6, s6, s11
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3
; GFX9-NEXT: s_xor_b32 s4, s4, s5
; GFX9-NEXT: s_xor_b32 s4, s6, s11
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3
@ -2956,28 +2953,28 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s9, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s5
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: s_xor_b32 s6, s12, s8
; GFX9-NEXT: s_xor_b32 s6, s10, s8
; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: s_xor_b32 s4, s5, s7
; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2
; GFX9-NEXT: s_xor_b32 s4, s11, s7
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0
; GFX9-NEXT: v_subrev_u32_e32 v2, s12, v2
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
; GFX9-NEXT: v_sub_u32_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_sub_u32_sdwa v3, v3, s11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1
; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v3
@ -2990,19 +2987,18 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX10-LABEL: sdivrem_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_mov_b32 s2, 0x100010
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_sext_i32_i16 s3, s1
; GFX10-NEXT: s_bfe_i32 s1, s1, s2
; GFX10-NEXT: s_ashr_i32 s8, s3, 31
; GFX10-NEXT: s_ashr_i32 s9, s1, 31
; GFX10-NEXT: s_add_i32 s3, s3, s8
; GFX10-NEXT: s_add_i32 s1, s1, s9
; GFX10-NEXT: s_xor_b32 s3, s3, s8
; GFX10-NEXT: s_xor_b32 s1, s1, s9
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX10-NEXT: s_sext_i32_i16 s2, s1
; GFX10-NEXT: s_bfe_i32 s1, s1, 0x100010
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
; GFX10-NEXT: s_ashr_i32 s8, s1, 31
; GFX10-NEXT: s_add_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s1, s1, s8
; GFX10-NEXT: s_xor_b32 s2, s2, s3
; GFX10-NEXT: s_xor_b32 s1, s1, s8
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1
; GFX10-NEXT: s_sub_i32 s6, 0, s3
; GFX10-NEXT: s_sub_i32 s6, 0, s2
; GFX10-NEXT: s_sub_i32 s7, 0, s1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
@ -3013,29 +3009,29 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s7, v1
; GFX10-NEXT: s_sext_i32_i16 s6, s0
; GFX10-NEXT: s_bfe_i32 s0, s0, s2
; GFX10-NEXT: s_ashr_i32 s2, s6, 31
; GFX10-NEXT: s_bfe_i32 s0, s0, 0x100010
; GFX10-NEXT: s_ashr_i32 s9, s6, 31
; GFX10-NEXT: s_ashr_i32 s10, s0, 31
; GFX10-NEXT: s_add_i32 s6, s6, s2
; GFX10-NEXT: s_add_i32 s6, s6, s9
; GFX10-NEXT: s_add_i32 s0, s0, s10
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: s_xor_b32 s6, s6, s2
; GFX10-NEXT: s_xor_b32 s6, s6, s9
; GFX10-NEXT: s_xor_b32 s0, s0, s10
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s3
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
@ -3043,28 +3039,27 @@ define amdgpu_kernel void @sdivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s3, v2
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s1, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s3, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
; GFX10-NEXT: s_xor_b32 s1, s2, s8
; GFX10-NEXT: s_xor_b32 s1, s9, s3
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, s10, s9
; GFX10-NEXT: s_xor_b32 s0, s10, s8
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
; GFX10-NEXT: v_xor_b32_e32 v2, s2, v2
; GFX10-NEXT: v_xor_b32_e32 v2, s9, v2
; GFX10-NEXT: v_xor_b32_e32 v3, s10, v3
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s2, v2
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s9, v2
; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_and_or_b32 v0, v0, v4, v1
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_and_or_b32 v2, v2, v4, v3
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v2, v3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: global_store_dword v1, v2, s[6:7]
@ -3366,9 +3361,8 @@ define amdgpu_kernel void @sdivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX10-NEXT: v_xor_b32_e32 v1, s7, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s4, v0
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s7, v1
; GFX10-NEXT: s_mov_b32 s4, 0x7ffffff
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_and_b32_e32 v1, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v1, s[2:3]

View File

@ -439,9 +439,8 @@ define <2 x i64> @v_shl_v2i64_zext_v2i32(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: s_brev_b32 s4, -4
; GFX10-NEXT: v_and_b32_e32 v2, s4, v0
; GFX10-NEXT: v_and_b32_e32 v4, s4, v1
; GFX10-NEXT: v_and_b32_e32 v2, 0x3fffffff, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0x3fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v5, v3
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[2:3]
; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[4:5]
@ -523,9 +522,8 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_brev_b32 s4, -8
; GFX10-NEXT: v_and_b32_e32 v0, s4, v0
; GFX10-NEXT: v_and_b32_e32 v2, s4, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0x1fffffff, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0x1fffffff, v1
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
@ -608,13 +606,12 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
define amdgpu_ps <2 x i32> @s_shl_v2i32_zext_v2i16(<2 x i16> inreg %x) {
; GFX7-LABEL: s_shl_v2i32_zext_v2i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s2, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_and_b32 s0, s0, 0x3fff3fff
; GFX7-NEXT: s_lshr_b32 s1, s0, 16
; GFX7-NEXT: s_and_b32 s0, s0, s2
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshl_b32 s0, s0, 2
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: ; return to shader part epilog

View File

@ -785,25 +785,23 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v2i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s4, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s3
; GFX6-NEXT: s_lshl_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s1, s1, s4
; GFX6-NEXT: s_and_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_lshr_b32 s4, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s1
; GFX8-NEXT: s_lshl_b32 s1, s2, s4
; GFX8-NEXT: s_lshl_b32 s1, s2, s3
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s3
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -869,10 +867,10 @@ define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: shl_v2i16_vs:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: s_and_b32 s0, s0, s2
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v0, s0, v0
; GFX6-NEXT: s_and_b32 s0, s1, s2
; GFX6-NEXT: s_and_b32 s0, s1, 0xffff
; GFX6-NEXT: s_mov_b32 s2, 0xffff
; GFX6-NEXT: v_lshlrev_b32_e32 v1, s0, v1
; GFX6-NEXT: v_and_b32_e32 v1, s2, v1
; GFX6-NEXT: v_and_b32_e32 v0, s2, v0
@ -970,39 +968,37 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s5
; GFX6-NEXT: s_lshl_b32 s0, s0, s4
; GFX6-NEXT: s_and_b32 s1, s1, s8
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, s6
; GFX6-NEXT: s_lshl_b32 s3, s3, s7
; GFX6-NEXT: s_and_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s8
; GFX6-NEXT: s_and_b32 s2, s3, s8
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s6, 0xffff
; GFX8-NEXT: s_lshr_b32 s4, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_lshr_b32 s7, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s6, s2, 16
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_lshr_b32 s8, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s3, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s2
; GFX8-NEXT: s_lshl_b32 s2, s4, s7
; GFX8-NEXT: s_lshl_b32 s2, s4, s6
; GFX8-NEXT: s_lshl_b32 s1, s1, s3
; GFX8-NEXT: s_lshl_b32 s3, s5, s8
; GFX8-NEXT: s_lshl_b32 s3, s5, s7
; GFX8-NEXT: s_lshl_b32 s2, s2, 16
; GFX8-NEXT: s_and_b32 s0, s0, s6
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s2, s0
; GFX8-NEXT: s_lshl_b32 s2, s3, 16
; GFX8-NEXT: s_and_b32 s1, s1, s6
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_or_b32 s1, s2, s1
; GFX8-NEXT: ; return to shader part epilog
;
@ -1144,67 +1140,65 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
; GFX6-LABEL: s_shl_v8i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_mov_b32 s16, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, s9
; GFX6-NEXT: s_lshl_b32 s0, s0, s8
; GFX6-NEXT: s_and_b32 s1, s1, s16
; GFX6-NEXT: s_and_b32 s1, s1, 0xffff
; GFX6-NEXT: s_lshl_b32 s2, s2, s10
; GFX6-NEXT: s_lshl_b32 s3, s3, s11
; GFX6-NEXT: s_and_b32 s0, s0, s16
; GFX6-NEXT: s_and_b32 s0, s0, 0xffff
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_lshl_b32 s5, s5, s13
; GFX6-NEXT: s_or_b32 s0, s0, s1
; GFX6-NEXT: s_and_b32 s1, s2, s16
; GFX6-NEXT: s_and_b32 s2, s3, s16
; GFX6-NEXT: s_and_b32 s1, s2, 0xffff
; GFX6-NEXT: s_and_b32 s2, s3, 0xffff
; GFX6-NEXT: s_lshl_b32 s4, s4, s12
; GFX6-NEXT: s_lshl_b32 s7, s7, s15
; GFX6-NEXT: s_lshl_b32 s2, s2, 16
; GFX6-NEXT: s_and_b32 s3, s5, s16
; GFX6-NEXT: s_and_b32 s3, s5, 0xffff
; GFX6-NEXT: s_lshl_b32 s6, s6, s14
; GFX6-NEXT: s_or_b32 s1, s1, s2
; GFX6-NEXT: s_and_b32 s2, s4, s16
; GFX6-NEXT: s_and_b32 s2, s4, 0xffff
; GFX6-NEXT: s_lshl_b32 s3, s3, 16
; GFX6-NEXT: s_and_b32 s4, s7, s16
; GFX6-NEXT: s_and_b32 s4, s7, 0xffff
; GFX6-NEXT: s_or_b32 s2, s2, s3
; GFX6-NEXT: s_and_b32 s3, s6, s16
; GFX6-NEXT: s_and_b32 s3, s6, 0xffff
; GFX6-NEXT: s_lshl_b32 s4, s4, 16
; GFX6-NEXT: s_or_b32 s3, s3, s4
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_shl_v8i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s12, 0xffff
; GFX8-NEXT: s_lshr_b32 s8, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_lshr_b32 s13, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s12, s4, 16
; GFX8-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_lshr_b32 s14, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshr_b32 s13, s5, 16
; GFX8-NEXT: s_lshl_b32 s0, s0, s4
; GFX8-NEXT: s_lshl_b32 s4, s8, s13
; GFX8-NEXT: s_lshl_b32 s4, s8, s12
; GFX8-NEXT: s_lshr_b32 s10, s2, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_lshr_b32 s15, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_lshr_b32 s14, s6, 16
; GFX8-NEXT: s_lshl_b32 s1, s1, s5
; GFX8-NEXT: s_lshl_b32 s5, s9, s14
; GFX8-NEXT: s_lshl_b32 s5, s9, s13
; GFX8-NEXT: s_lshl_b32 s4, s4, 16
; GFX8-NEXT: s_and_b32 s0, s0, s12
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s11, s3, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_lshr_b32 s16, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_lshr_b32 s15, s7, 16
; GFX8-NEXT: s_lshl_b32 s2, s2, s6
; GFX8-NEXT: s_lshl_b32 s6, s10, s15
; GFX8-NEXT: s_lshl_b32 s6, s10, s14
; GFX8-NEXT: s_or_b32 s0, s4, s0
; GFX8-NEXT: s_lshl_b32 s4, s5, 16
; GFX8-NEXT: s_and_b32 s1, s1, s12
; GFX8-NEXT: s_and_b32 s1, s1, 0xffff
; GFX8-NEXT: s_lshl_b32 s3, s3, s7
; GFX8-NEXT: s_lshl_b32 s7, s11, s16
; GFX8-NEXT: s_lshl_b32 s7, s11, s15
; GFX8-NEXT: s_or_b32 s1, s4, s1
; GFX8-NEXT: s_lshl_b32 s4, s6, 16
; GFX8-NEXT: s_and_b32 s2, s2, s12
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s2, s4, s2
; GFX8-NEXT: s_lshl_b32 s4, s7, 16
; GFX8-NEXT: s_and_b32 s3, s3, s12
; GFX8-NEXT: s_and_b32 s3, s3, 0xffff
; GFX8-NEXT: s_or_b32 s3, s4, s3
; GFX8-NEXT: ; return to shader part epilog
;

View File

@ -377,13 +377,13 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
@ -508,13 +508,13 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; CGP-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; CGP-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5

View File

@ -1164,9 +1164,8 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-LABEL: v_srem_v2i64_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_movk_i32 s10, 0x1000
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
@ -1294,7 +1293,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x1000, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
@ -1860,9 +1859,8 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-LABEL: v_srem_v2i64_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s10, 0x12d8fb
; GISEL-NEXT: s_mov_b32 s6, 0
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: s_mov_b32 s7, s6
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7]
@ -1990,7 +1988,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_subrev_i32_e32 v9, vcc, s8, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: s_add_u32 s4, s10, 0
; GISEL-NEXT: s_add_u32 s4, 0x12d8fb, 0
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: s_addc_u32 s5, 0, 0
; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc

File diff suppressed because it is too large Load Diff

View File

@ -113,44 +113,43 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s1, 0x80008
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s3, s4, s1
; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s2, s4, 16
; GFX7-NEXT: s_lshr_b32 s1, s4, 16
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: s_lshr_b32 s0, s4, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: s_bfe_u32 s2, s5, s1
; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: s_lshr_b32 s0, s5, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: s_lshr_b32 s2, s5, 24
; GFX7-NEXT: s_lshr_b32 s1, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: s_bfe_u32 s2, s6, s1
; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_lshr_b32 s0, s6, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:9
; GFX7-NEXT: s_lshr_b32 s2, s6, 24
; GFX7-NEXT: s_lshr_b32 s1, s6, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:10
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:11
; GFX7-NEXT: s_bfe_u32 s1, s7, s1
; GFX7-NEXT: s_bfe_u32 s1, s7, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s7
; GFX7-NEXT: s_lshr_b32 s0, s7, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:12

View File

@ -98,33 +98,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4
; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX7-NEXT: s_mov_b32 s1, 0x80008
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s3, s4, s1
; GFX7-NEXT: s_bfe_u32 s2, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: s_lshr_b32 s2, s4, 16
; GFX7-NEXT: s_lshr_b32 s1, s4, 16
; GFX7-NEXT: ds_write_b8 v1, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_write_b8 v1, v0 offset:1
; GFX7-NEXT: s_lshr_b32 s0, s4, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:3
; GFX7-NEXT: s_bfe_u32 s2, s5, s1
; GFX7-NEXT: s_bfe_u32 s1, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s5
; GFX7-NEXT: s_lshr_b32 s0, s5, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:4
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:5
; GFX7-NEXT: s_lshr_b32 s2, s5, 24
; GFX7-NEXT: s_lshr_b32 s1, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: ds_write_b8 v1, v0 offset:6
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: ds_write_b8 v1, v0 offset:7
; GFX7-NEXT: s_bfe_u32 s1, s6, s1
; GFX7-NEXT: s_bfe_u32 s1, s6, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: s_lshr_b32 s0, s6, 16
; GFX7-NEXT: ds_write_b8 v1, v0 offset:8

View File

@ -967,7 +967,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX7-LABEL: usubo_i16_sv:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s1, 0xffff
; GFX7-NEXT: s_and_b32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: v_and_b32_e32 v0, s1, v0
; GFX7-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_and_b32_e32 v1, s1, v0
@ -980,7 +980,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
; GFX8-LABEL: usubo_i16_sv:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s1, 0xffff
; GFX8-NEXT: s_and_b32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: v_and_b32_e32 v0, s1, v0
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_and_b32_e32 v1, s1, v0
@ -992,8 +992,7 @@ define amdgpu_ps i16 @usubo_i16_sv(i16 inreg %a, i16 %b) {
;
; GFX9-LABEL: usubo_i16_sv:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s1, 0xffff
; GFX9-NEXT: s_and_b32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_sub_u32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cmp_ne_u32_sdwa s[0:1], v0, v0 src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]

View File

@ -162,23 +162,21 @@ define <2 x i32> @v_trunc_v4i32_to_v4i16(<4 x i32> %src) {
define amdgpu_ps <2 x i32> @s_trunc_v4i32_to_v4i16(<4 x i32> inreg %src) {
; GFX7-LABEL: s_trunc_v4i32_to_v4i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s4
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s4
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_trunc_v4i32_to_v4i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s4, 0xffff
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: s_lshl_b32 s1, s3, 16
; GFX8-NEXT: s_and_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
%trunc = trunc <4 x i32> %src to <4 x i16>

View File

@ -242,12 +242,11 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
@ -305,17 +304,16 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-LABEL: s_uaddsat_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX9-NEXT: s_mov_b32 s2, 0x80008
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s3, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
@ -332,15 +330,14 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s3, s1, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s2, s2, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s2, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_movk_i32 s0, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
@ -466,35 +463,33 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4
; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_add_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_pk_add_u16 v1, v3, v2 clamp
; GFX10-NEXT: v_pk_add_u16 v1, v2, v3 clamp
; GFX10-NEXT: v_mov_b32_e32 v2, 8
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -590,27 +585,26 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s8, s1, 16
; GFX9-NEXT: s_lshr_b32 s9, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s4, 16
; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshr_b32 s7, s6, 16
; GFX9-NEXT: s_lshl_b32 s4, s6, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_add_u16 v0, s0, v0 clamp
@ -637,39 +631,37 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshr_b32 s6, s4, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_lshl_b32 s3, s4, s3
; GFX10-NEXT: s_lshl_b32 s4, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX10-NEXT: v_pk_add_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 clamp
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: s_mov_b32 s0, 24
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0

View File

@ -910,9 +910,8 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
; GFX10-LABEL: udivrem_v4i32:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
; GFX10-NEXT: v_mov_b32_e32 v4, 0x4f7ffffe
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v8, 0
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s12
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s13
@ -926,9 +925,9 @@ define amdgpu_kernel void @udivrem_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32
; GFX10-NEXT: s_sub_i32 s1, 0, s13
; GFX10-NEXT: s_sub_i32 s2, 0, s14
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v4
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX10-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v2, v2
@ -2269,24 +2268,24 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX8-LABEL: udivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX8-NEXT: s_mov_b32 s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s1, s2
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX8-NEXT: s_lshr_b32 s8, s1, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8
; GFX8-NEXT: s_sub_i32 s1, 0, s3
; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX8-NEXT: s_sub_i32 s1, 0, s2
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: s_lshr_b32 s9, s0, 16
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_mul_lo_u32 v2, s1, v0
; GFX8-NEXT: s_sub_i32 s1, 0, s8
; GFX8-NEXT: s_sub_i32 s1, 0, s3
; GFX8-NEXT: v_mul_lo_u32 v3, s1, v1
; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3
@ -2294,34 +2293,34 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1
; GFX8-NEXT: v_mul_lo_u32 v2, v0, s3
; GFX8-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s3, v2
; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX8-NEXT: v_mul_lo_u32 v3, v1, s8
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v2
; GFX8-NEXT: v_mul_lo_u32 v3, v1, s3
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s8, v3
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s8, v3
; GFX8-NEXT: v_and_b32_e32 v1, s2, v1
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3
; GFX8-NEXT: v_and_b32_e32 v1, s8, v1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v0, s2, v3
; GFX8-NEXT: v_and_b32_e32 v0, s8, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_mov_b32_e32 v0, s4
@ -2335,54 +2334,53 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX9-LABEL: udivrem_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s7, s1, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX9-NEXT: s_sub_i32 s1, 0, s7
; GFX9-NEXT: s_and_b32 s3, s1, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
; GFX9-NEXT: s_sub_i32 s1, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_sub_i32 s3, 0, s6
; GFX9-NEXT: s_sub_i32 s6, 0, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: s_and_b32 s9, s0, s2
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: s_lshr_b32 s8, s0, 16
; GFX9-NEXT: v_mul_lo_u32 v2, s1, v0
; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_lshr_b32 s1, s0, 16
; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1
; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
; GFX9-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_mul_hi_u32 v1, s8, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v1, s1, v1
; GFX9-NEXT: v_mul_lo_u32 v2, v0, s3
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2
; GFX9-NEXT: v_add_u32_e32 v5, 1, v1
; GFX9-NEXT: v_sub_u32_e32 v2, s9, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2
; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_add_u32_e32 v4, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s8, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX9-NEXT: v_sub_u32_e32 v3, s1, v3
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v2
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v2
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3
; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@ -2391,43 +2389,42 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX9-NEXT: v_and_or_b32 v1, v2, v4, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: global_store_dword v2, v1, s[2:3]
; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: global_store_dword v2, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_mov_b32 s3, 0xffff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshr_b32 s2, s1, 16
; GFX10-NEXT: s_and_b32 s1, s1, s3
; GFX10-NEXT: s_and_b32 s1, s1, 0xffff
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s1
; GFX10-NEXT: s_sub_i32 s6, 0, s2
; GFX10-NEXT: s_sub_i32 s3, 0, s2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s6, v0
; GFX10-NEXT: s_sub_i32 s6, 0, s1
; GFX10-NEXT: v_mul_lo_u32 v3, s6, v1
; GFX10-NEXT: s_lshr_b32 s6, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, s3
; GFX10-NEXT: v_mul_lo_u32 v2, s3, v0
; GFX10-NEXT: s_sub_i32 s3, 0, s1
; GFX10-NEXT: v_mul_lo_u32 v3, s3, v1
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
; GFX10-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX10-NEXT: v_mul_hi_u32 v0, s3, v0
; GFX10-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s2
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s1
; GFX10-NEXT: v_add_nc_u32_e32 v6, 1, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s6, v2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s3, v2
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s0, v3
; GFX10-NEXT: v_subrev_nc_u32_e32 v5, s2, v2
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s2, v2
@ -2445,14 +2442,13 @@ define amdgpu_kernel void @udivrem_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s1, v3
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_and_or_b32 v0, v1, v4, v0
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v3, v2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: global_store_dword v1, v2, s[6:7]
@ -2586,9 +2582,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX8-LABEL: udivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
; GFX8-NEXT: s_mov_b32 s8, 0x7ffffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s7, s7, s8
; GFX8-NEXT: s_and_b32 s7, s7, 0x7ffffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX8-NEXT: s_sub_i32 s0, 0, s7
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@ -2596,7 +2591,8 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX8-NEXT: s_and_b32 s4, s6, s8
; GFX8-NEXT: s_and_b32 s4, s6, 0x7ffffff
; GFX8-NEXT: s_mov_b32 s5, 0x7ffffff
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0
@ -2614,11 +2610,11 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3
; GFX8-NEXT: v_and_b32_e32 v2, s8, v2
; GFX8-NEXT: v_and_b32_e32 v2, s5, v2
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_and_b32_e32 v2, s8, v3
; GFX8-NEXT: v_and_b32_e32 v2, s5, v3
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@ -2626,49 +2622,48 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX9-LABEL: udivrem_i27:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX9-NEXT: s_mov_b32 s6, 0x7ffffff
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s7, s1, s6
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX9-NEXT: s_sub_i32 s1, 0, s7
; GFX9-NEXT: s_and_b32 s8, s0, s6
; GFX9-NEXT: s_and_b32 s6, s1, 0x7ffffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s1, 0, s6
; GFX9-NEXT: s_and_b32 s7, s0, 0x7ffffff
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s1, v0
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_mov_b32 s4, 0x7ffffff
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s6
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s8, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s7, v1
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v0, s6, v0
; GFX9-NEXT: v_and_b32_e32 v0, s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i27:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_mov_b32 s6, 0x7ffffff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s7, s1, s6
; GFX10-NEXT: s_and_b32 s0, s0, s6
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7
; GFX10-NEXT: s_sub_i32 s1, 0, s7
; GFX10-NEXT: s_and_b32 s6, s1, 0x7ffffff
; GFX10-NEXT: s_and_b32 s0, s0, 0x7ffffff
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX10-NEXT: s_sub_i32 s1, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
@ -2676,22 +2671,22 @@ define amdgpu_kernel void @udivrem_i27(i27 addrspace(1)* %out0, i27 addrspace(1)
; GFX10-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX10-NEXT: v_mul_lo_u32 v1, v0, s7
; GFX10-NEXT: v_mul_lo_u32 v1, v0, s6
; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v0
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s7, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s7, v1
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_and_b32_e32 v0, s6, v0
; GFX10-NEXT: v_and_b32_e32 v1, s6, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0x7ffffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0x7ffffff, v1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
; GFX10-NEXT: global_store_dword v2, v1, s[2:3]

View File

@ -272,13 +272,13 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
; GISEL-NEXT: v_subrev_i32_e32 v3, vcc, s4, v0
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v1, v2
; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2

View File

@ -1079,9 +1079,9 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v2
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
; CHECK-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
@ -1099,67 +1099,67 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: s_mov_b32 s8, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v4, 0
; GISEL-NEXT: v_cvt_f32_u32_e32 v6, s8
; GISEL-NEXT: s_sub_u32 s6, 0, s8
; GISEL-NEXT: v_madmk_f32 v5, v4, 0x4f800000, v6
; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0
; GISEL-NEXT: s_sub_u32 s6, 0, 0x12d8fb
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, s8
; GISEL-NEXT: s_subb_u32 s7, 0, 0
; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
; GISEL-NEXT: s_bfe_i32 s5, -1, 0x10000
; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v5
; GISEL-NEXT: v_mov_b32_e32 v5, s4
; GISEL-NEXT: v_mov_b32_e32 v4, s5
; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
; GISEL-NEXT: s_sub_u32 s9, 0, s8
; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
; GISEL-NEXT: v_madmk_f32 v9, v7, 0x4f800000, v8
; GISEL-NEXT: v_mov_b32_e32 v6, s4
; GISEL-NEXT: v_mov_b32_e32 v5, s5
; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v7
; GISEL-NEXT: s_sub_u32 s9, 0, 0x12d8fb
; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
; GISEL-NEXT: s_subb_u32 s10, 0, 0
; GISEL-NEXT: s_bfe_i32 s4, -1, 0x10000
; GISEL-NEXT: s_bfe_i32 s11, -1, 0x10000
; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v7
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
; GISEL-NEXT: v_mov_b32_e32 v10, s4
; GISEL-NEXT: v_trunc_f32_e32 v8, v8
; GISEL-NEXT: s_bfe_i32 s12, -1, 0x10000
; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v7
; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8
; GISEL-NEXT: v_trunc_f32_e32 v9, v9
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
; GISEL-NEXT: v_trunc_f32_e32 v10, v10
; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v9
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10
; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
; GISEL-NEXT: v_mul_lo_u32 v11, s6, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v9
; GISEL-NEXT: v_mul_lo_u32 v11, s6, v9
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v10
; GISEL-NEXT: v_mul_lo_u32 v13, s6, v7
; GISEL-NEXT: v_mul_lo_u32 v14, s7, v7
; GISEL-NEXT: v_mul_hi_u32 v15, s6, v7
; GISEL-NEXT: v_mul_lo_u32 v16, s9, v6
; GISEL-NEXT: v_mul_lo_u32 v17, s10, v6
; GISEL-NEXT: v_mul_hi_u32 v18, s9, v6
; GISEL-NEXT: v_mul_lo_u32 v16, s9, v8
; GISEL-NEXT: v_mul_lo_u32 v17, s10, v8
; GISEL-NEXT: v_mul_hi_u32 v18, s9, v8
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11
; GISEL-NEXT: v_mul_lo_u32 v14, v8, v13
; GISEL-NEXT: v_mul_lo_u32 v14, v9, v13
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v13
; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v16
; GISEL-NEXT: v_mul_lo_u32 v17, v10, v16
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GISEL-NEXT: v_mul_hi_u32 v15, v6, v16
; GISEL-NEXT: v_mul_hi_u32 v16, v9, v16
; GISEL-NEXT: v_mul_hi_u32 v15, v8, v16
; GISEL-NEXT: v_mul_hi_u32 v16, v10, v16
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
; GISEL-NEXT: v_mul_lo_u32 v18, v6, v12
; GISEL-NEXT: v_mul_lo_u32 v18, v8, v12
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15
; GISEL-NEXT: v_mul_lo_u32 v15, v7, v11
; GISEL-NEXT: v_mul_lo_u32 v17, v8, v11
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v11
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
; GISEL-NEXT: v_mul_hi_u32 v14, v7, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v15, s[4:5], v15, v19
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v12
; GISEL-NEXT: v_mul_lo_u32 v19, v10, v12
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v17, v13
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v14
@ -1167,8 +1167,8 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v17, v14
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v18, v17
; GISEL-NEXT: v_mul_hi_u32 v18, v6, v12
; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
; GISEL-NEXT: v_mul_hi_u32 v18, v8, v12
; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v19, v16
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v18
@ -1183,40 +1183,40 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v11, vcc
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GISEL-NEXT: v_mul_lo_u32 v11, s6, v7
; GISEL-NEXT: v_mul_lo_u32 v13, s7, v7
; GISEL-NEXT: v_mul_hi_u32 v14, s6, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v16
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v6
; GISEL-NEXT: v_mul_lo_u32 v15, s10, v6
; GISEL-NEXT: v_mul_hi_u32 v16, s9, v6
; GISEL-NEXT: v_mul_lo_u32 v17, s6, v8
; GISEL-NEXT: v_mul_lo_u32 v18, v8, v11
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v16
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v12, vcc
; GISEL-NEXT: v_mul_lo_u32 v12, s9, v8
; GISEL-NEXT: v_mul_lo_u32 v15, s10, v8
; GISEL-NEXT: v_mul_hi_u32 v16, s9, v8
; GISEL-NEXT: v_mul_lo_u32 v17, s6, v9
; GISEL-NEXT: v_mul_lo_u32 v18, v9, v11
; GISEL-NEXT: v_mul_hi_u32 v19, v7, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v17
; GISEL-NEXT: v_mul_lo_u32 v17, s9, v9
; GISEL-NEXT: v_mul_lo_u32 v17, s9, v10
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12
; GISEL-NEXT: v_mul_lo_u32 v17, v10, v12
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12
; GISEL-NEXT: v_mul_hi_u32 v12, v10, v12
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
; GISEL-NEXT: v_mul_lo_u32 v16, v6, v15
; GISEL-NEXT: v_mul_lo_u32 v16, v8, v15
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
; GISEL-NEXT: v_mul_lo_u32 v14, v7, v13
; GISEL-NEXT: v_mul_lo_u32 v16, v8, v13
; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v18, v14
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v19
; GISEL-NEXT: v_mul_hi_u32 v14, v7, v13
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v18, s[4:5], v18, v19
; GISEL-NEXT: v_mul_lo_u32 v19, v9, v15
; GISEL-NEXT: v_mul_lo_u32 v19, v10, v15
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v16, v11
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
@ -1224,125 +1224,126 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v16, v14
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
; GISEL-NEXT: v_mul_hi_u32 v17, v6, v15
; GISEL-NEXT: v_mul_hi_u32 v17, v8, v15
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v19, v12
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v17
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v17, vcc, v19, v17
; GISEL-NEXT: v_mov_b32_e32 v19, s11
; GISEL-NEXT: v_mul_hi_u32 v13, v8, v13
; GISEL-NEXT: v_mul_hi_u32 v15, v9, v15
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
; GISEL-NEXT: v_mov_b32_e32 v18, s12
; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
; GISEL-NEXT: v_mul_hi_u32 v15, v10, v15
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18
; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v16
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v8, v13, vcc
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
; GISEL-NEXT: v_mul_lo_u32 v11, v1, v7
; GISEL-NEXT: v_mul_hi_u32 v13, v0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc
; GISEL-NEXT: v_mul_lo_u32 v12, v3, v6
; GISEL-NEXT: v_mul_hi_u32 v14, v2, v6
; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
; GISEL-NEXT: v_mul_lo_u32 v15, v0, v8
; GISEL-NEXT: v_mul_lo_u32 v16, v1, v8
; GISEL-NEXT: v_mul_hi_u32 v17, v0, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
; GISEL-NEXT: v_mul_lo_u32 v18, v2, v9
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v18
; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9
; GISEL-NEXT: v_mul_hi_u32 v14, v2, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v3, v9
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v16, v7
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc
; GISEL-NEXT: v_mul_lo_u32 v12, v3, v8
; GISEL-NEXT: v_mul_hi_u32 v14, v2, v8
; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8
; GISEL-NEXT: v_mul_lo_u32 v15, v0, v9
; GISEL-NEXT: v_mul_lo_u32 v16, v1, v9
; GISEL-NEXT: v_mul_hi_u32 v17, v0, v9
; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_mul_lo_u32 v11, v2, v10
; GISEL-NEXT: v_mul_lo_u32 v13, v3, v10
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v14
; GISEL-NEXT: v_mul_hi_u32 v11, v2, v10
; GISEL-NEXT: v_mul_hi_u32 v10, v3, v10
; GISEL-NEXT: v_add_i32_e64 v7, s[6:7], v16, v7
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[6:7]
; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], v13, v8
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7]
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v17
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v15
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
; GISEL-NEXT: v_mul_lo_u32 v13, s8, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_mul_lo_u32 v14, s8, v7
; GISEL-NEXT: v_mul_lo_u32 v15, 0, v7
; GISEL-NEXT: v_mul_hi_u32 v7, s8, v7
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; GISEL-NEXT: v_mul_lo_u32 v14, s8, v6
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v6
; GISEL-NEXT: v_mul_hi_u32 v6, s8, v6
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
; GISEL-NEXT: v_mul_lo_u32 v8, s8, v8
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
; GISEL-NEXT: v_mul_lo_u32 v12, s8, v8
; GISEL-NEXT: v_mul_lo_u32 v16, 0, v8
; GISEL-NEXT: v_mul_hi_u32 v8, s8, v8
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_mul_lo_u32 v9, s8, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v15, v8
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v13
; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v1, v7, vcc
; GISEL-NEXT: v_mul_lo_u32 v10, s8, v10
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v16, v10
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v14
; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v7, vcc
; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v7
; GISEL-NEXT: v_cmp_le_u32_e64 s[4:5], s8, v0
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v14
; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], v3, v6, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v6
; GISEL-NEXT: v_cmp_le_u32_e64 s[6:7], s8, v2
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v8
; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[6:7]
; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v12
; GISEL-NEXT: v_subb_u32_e64 v10, s[6:7], v3, v8, s[4:5]
; GISEL-NEXT: v_sub_i32_e64 v3, s[6:7], v3, v8
; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[6:7]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v9
; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v8, vcc
; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
; GISEL-NEXT: v_subrev_i32_e32 v7, vcc, s8, v0
; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
; GISEL-NEXT: v_subrev_i32_e32 v11, vcc, s8, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v4
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_le_u32_e32 vcc, s8, v11
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v4
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GISEL-NEXT: v_subrev_i32_e32 v10, vcc, s8, v7
; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v8, v4
; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GISEL-NEXT: v_cndmask_b32_e32 v12, v19, v12, vcc
; GISEL-NEXT: v_subrev_i32_e32 v14, vcc, s8, v11
; GISEL-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v12, v4
; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v12
; GISEL-NEXT: v_cndmask_b32_e64 v7, v11, v14, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v5, v8, v11, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v13
; GISEL-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v14, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5]
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v6
; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[4:5]
; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[4:5]
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64_oddk_denom:

View File

@ -236,12 +236,11 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v4, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v4
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v3
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
@ -297,17 +296,16 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX9-LABEL: s_usubsat_v2i8:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s2, s0, 8
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s3, s1, 8
; GFX9-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX9-NEXT: s_mov_b32 s2, 0x80008
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: s_lshl_b32 s0, s0, s2
; GFX9-NEXT: s_lshl_b32 s3, s3, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, s2
; GFX9-NEXT: s_lshl_b32 s2, s3, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 16
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s2, s2, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
@ -324,15 +322,14 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s3, s1, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: s_mov_b32 s2, 0x80008
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, s2
; GFX10-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s2, s2, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s2
; GFX10-NEXT: s_lshl_b32 s2, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: s_movk_i32 s0, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
@ -454,35 +451,33 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-NEXT: s_mov_b32 s4, 8
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_lshrrev_b32_sdwa v3, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v2, s4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10-NEXT: v_and_or_b32 v0, v0, v2, v3
; GFX10-NEXT: v_and_or_b32 v1, v1, v2, v7
; GFX10-NEXT: s_movk_i32 s4, 0xff
; GFX10-NEXT: v_and_or_b32 v3, v6, v2, v4
; GFX10-NEXT: v_and_or_b32 v2, v8, v2, v5
; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
; GFX10-NEXT: v_and_or_b32 v1, 0xffff, v1, v6
; GFX10-NEXT: v_and_or_b32 v2, 0xffff, v5, v3
; GFX10-NEXT: v_and_or_b32 v3, 0xffff, v7, v4
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, v3, v2 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, v2, v3 clamp
; GFX10-NEXT: v_mov_b32_e32 v2, 8
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s4, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s4, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
@ -574,27 +569,26 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s4, s6
; GFX9-NEXT: s_mov_b32 s4, 0x80008
; GFX9-NEXT: s_lshr_b32 s6, s0, 16
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s7, s1, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, s4
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s8, s1, 16
; GFX9-NEXT: s_lshr_b32 s9, s1, 24
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s6
; GFX9-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX9-NEXT: s_lshr_b32 s4, s3, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshl_b32 s3, s3, s4
; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX9-NEXT: s_lshl_b32 s4, s4, 8
; GFX9-NEXT: s_lshr_b32 s6, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s6
; GFX9-NEXT: s_lshr_b32 s6, s4, 16
; GFX9-NEXT: s_lshl_b32 s4, s4, 0x80008
; GFX9-NEXT: s_lshl_b32 s6, s6, 8
; GFX9-NEXT: s_lshr_b32 s7, s1, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s6
; GFX9-NEXT: s_pack_ll_b32_b16 s6, s8, s9
; GFX9-NEXT: s_lshl_b32 s1, s1, s4
; GFX9-NEXT: s_lshl_b32 s7, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s7
; GFX9-NEXT: s_lshr_b32 s7, s6, 16
; GFX9-NEXT: s_lshl_b32 s4, s6, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 8
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s6
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp
@ -621,39 +615,37 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
; GFX10-NEXT: s_lshr_b32 s4, s0, 24
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4
; GFX10-NEXT: s_mov_b32 s3, 0x80008
; GFX10-NEXT: s_lshr_b32 s4, s0, 16
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 8
; GFX10-NEXT: s_lshr_b32 s6, s1, 16
; GFX10-NEXT: s_lshr_b32 s7, s1, 24
; GFX10-NEXT: s_lshl_b32 s0, s0, s3
; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008
; GFX10-NEXT: s_lshl_b32 s3, s3, 8
; GFX10-NEXT: s_lshr_b32 s4, s2, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s4, s6, s7
; GFX10-NEXT: s_lshr_b32 s8, s2, 16
; GFX10-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-NEXT: s_lshr_b32 s6, s4, 16
; GFX10-NEXT: s_lshl_b32 s2, s2, s3
; GFX10-NEXT: s_lshl_b32 s8, s8, 8
; GFX10-NEXT: s_lshl_b32 s1, s1, s3
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s4
; GFX10-NEXT: s_lshr_b32 s4, s1, 16
; GFX10-NEXT: s_lshr_b32 s5, s3, 16
; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008
; GFX10-NEXT: s_lshl_b32 s4, s4, 8
; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008
; GFX10-NEXT: s_lshl_b32 s5, s5, 8
; GFX10-NEXT: s_lshl_b32 s3, s4, s3
; GFX10-NEXT: s_lshl_b32 s4, s6, 8
; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s8
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4
; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5
; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp
; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp
; GFX10-NEXT: s_mov_b32 s0, 8
; GFX10-NEXT: s_movk_i32 s1, 0xff
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_b32_e32 v3, s1, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX10-NEXT: s_mov_b32 s0, 24
; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
; GFX10-NEXT: v_and_or_b32 v0, v0, s1, v2
; GFX10-NEXT: v_and_or_b32 v0, v0, 0xff, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1
; GFX10-NEXT: v_readfirstlane_b32 s0, v0

View File

@ -25,12 +25,11 @@ entry:
define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v2i16_one_use:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s4
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s4
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_xor_b32 s0, s0, s1
; GFX7-NEXT: s_xor_b32 s0, s0, -1
@ -42,10 +41,10 @@ define amdgpu_ps i32 @scalar_xnor_v2i16_one_use(<2 x i16> inreg %a, <2 x i16> in
; GFX8-NEXT: s_xor_b32 s0, s0, s1
; GFX8-NEXT: s_mov_b32 s3, s2
; GFX8-NEXT: s_lshr_b32 s1, s0, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s2
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: ; return to shader part epilog
;
@ -117,18 +116,17 @@ define amdgpu_ps i64 @scalar_xnor_i64_one_use(i64 inreg %a, i64 inreg %b) {
define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> inreg %b) {
; GFX7-LABEL: scalar_xnor_v4i16_one_use:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_and_b32 s0, s0, s8
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_or_b32 s0, s1, s0
; GFX7-NEXT: s_lshl_b32 s1, s3, 16
; GFX7-NEXT: s_and_b32 s2, s2, s8
; GFX7-NEXT: s_and_b32 s2, s2, 0xffff
; GFX7-NEXT: s_or_b32 s1, s1, s2
; GFX7-NEXT: s_lshl_b32 s2, s5, 16
; GFX7-NEXT: s_and_b32 s3, s4, s8
; GFX7-NEXT: s_and_b32 s3, s4, 0xffff
; GFX7-NEXT: s_or_b32 s2, s2, s3
; GFX7-NEXT: s_lshl_b32 s3, s7, 16
; GFX7-NEXT: s_and_b32 s4, s6, s8
; GFX7-NEXT: s_and_b32 s4, s6, 0xffff
; GFX7-NEXT: s_or_b32 s3, s3, s4
; GFX7-NEXT: s_mov_b32 s4, -1
; GFX7-NEXT: s_mov_b32 s5, s4
@ -142,16 +140,16 @@ define amdgpu_ps i64 @scalar_xnor_v4i16_one_use(<4 x i16> inreg %a, <4 x i16> in
; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_mov_b32 s5, s4
; GFX8-NEXT: s_lshr_b32 s3, s0, 16
; GFX8-NEXT: s_and_b32 s2, s0, s4
; GFX8-NEXT: s_and_b32 s2, s0, 0xffff
; GFX8-NEXT: s_lshr_b32 s7, s1, 16
; GFX8-NEXT: s_and_b32 s6, s1, s4
; GFX8-NEXT: s_and_b32 s6, s1, 0xffff
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX8-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5]
; GFX8-NEXT: s_lshl_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s0, s4
; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: s_lshl_b32 s1, s3, 16
; GFX8-NEXT: s_and_b32 s2, s2, s4
; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
; GFX8-NEXT: s_or_b32 s1, s1, s2
; GFX8-NEXT: ; return to shader part epilog
;

View File

@ -205,7 +205,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9PLUS: global_load_dword [[B:v[0-9]+]]
; GFX9PLUS: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
; GFX9PLUS-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], [[MASK]], [[ADD]]
; GFX10-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
; GFX9PLUS-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9PLUS: buffer_store_dwordx4

View File

@ -150,7 +150,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0
@ -232,7 +232,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_mul_hi_u32 v2, v2, s4
; GCN-NEXT: v_mul_lo_u32 v3, v2, v0
; GCN-NEXT: v_add_u32_e32 v4, vcc, 1, v2
; GCN-NEXT: v_sub_u32_e32 v3, vcc, s4, v3
; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0xf4240, v3
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v3, v0
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v0

File diff suppressed because it is too large Load Diff

View File

@ -63,9 +63,8 @@ define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
; Second use is a VGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_0:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x12d687
; SI: buffer_store_dword [[VK]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
@ -79,10 +78,9 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out
; Second use is another SGPR use of the constant.
; FUNC-LABEL: {{^}}s_and_multi_use_constant_i32_1:
; SI: s_mov_b32 [[K:s[0-9]+]], 0x12d687
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI: s_add_i32
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, 0x12d687
; SI: v_mov_b32_e32 [[VADD:v[0-9]+]], [[ADD]]
; SI: buffer_store_dword [[VADD]]
define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {

View File

@ -3067,7 +3067,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_bfrev_b32_e32 v2, 1
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@ -3118,7 +3118,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@ -3160,41 +3160,39 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: max_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s4
; GFX1064-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
@ -3214,7 +3212,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@ -3223,31 +3221,29 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: max_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_bfrev_b32_e32 v1, 1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_max_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_max_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_bfrev_b32_e32 v3, 1
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
@ -3266,7 +3262,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@ -3484,7 +3480,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX8-NEXT: s_mov_b64 exec, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: v_mov_b32_e32 v2, v1
; GFX8-NEXT: v_bfrev_b32_e32 v2, -2
; GFX8-NEXT: s_not_b64 exec, exec
; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX8-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@ -3535,7 +3531,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX9-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v2, v1
; GFX9-NEXT: v_bfrev_b32_e32 v2, -2
; GFX9-NEXT: s_not_b64 exec, exec
; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
@ -3577,41 +3573,39 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: min_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v3, v2
; GFX1064-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 31
; GFX1064-NEXT: v_mov_b32_e32 v3, s4
; GFX1064-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v2, 15
; GFX1064-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: v_mov_b32_e32 v2, v1
; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 31
; GFX1064-NEXT: v_mov_b32_e32 v2, s4
; GFX1064-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
; GFX1064-NEXT: v_readlane_b32 s4, v1, 15
; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s5, v2, 31
; GFX1064-NEXT: v_writelane_b32 v1, s4, 16
; GFX1064-NEXT: v_readlane_b32 s5, v1, 31
; GFX1064-NEXT: v_writelane_b32 v3, s4, 16
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX1064-NEXT: v_readlane_b32 s7, v2, 63
; GFX1064-NEXT: v_readlane_b32 s6, v2, 47
; GFX1064-NEXT: v_writelane_b32 v1, s5, 32
; GFX1064-NEXT: v_readlane_b32 s7, v1, 63
; GFX1064-NEXT: v_readlane_b32 s6, v1, 47
; GFX1064-NEXT: v_writelane_b32 v3, s5, 32
; GFX1064-NEXT: s_mov_b64 exec, s[2:3]
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX1064-NEXT: v_writelane_b32 v1, s6, 48
; GFX1064-NEXT: v_writelane_b32 v3, s6, 48
; GFX1064-NEXT: s_mov_b64 exec, s[4:5]
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_mov_b32 s2, -1
@ -3631,7 +3625,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: v_readfirstlane_b32 s3, v0
; GFX1064-NEXT: v_mov_b32_e32 v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v0, v3
; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@ -3640,31 +3634,29 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
;
; GFX1032-LABEL: min_i32_varying:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_bfrev_b32_e32 v1, -2
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v3, v2
; GFX1032-NEXT: v_permlanex16_b32 v3, v3, -1, -1
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_mov_b32_e32 v2, v1
; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_min_i32_dpp v2, v3, v2 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_readlane_b32 s3, v2, 15
; GFX1032-NEXT: v_readlane_b32 s4, v2, 31
; GFX1032-NEXT: v_mov_b32_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: v_min_i32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
; GFX1032-NEXT: v_bfrev_b32_e32 v3, -2
; GFX1032-NEXT: v_readlane_b32 s3, v1, 15
; GFX1032-NEXT: v_readlane_b32 s4, v1, 31
; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: s_or_saveexec_b32 s2, -1
; GFX1032-NEXT: v_writelane_b32 v1, s3, 16
; GFX1032-NEXT: v_writelane_b32 v3, s3, 16
; GFX1032-NEXT: s_mov_b32 exec_lo, s2
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_mov_b32 s2, -1
@ -3683,7 +3675,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
; GFX1032-NEXT: v_readfirstlane_b32 s3, v0
; GFX1032-NEXT: v_mov_b32_e32 v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v0, v3
; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)

View File

@ -27,7 +27,6 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v14, 0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
@ -42,11 +41,11 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2
@ -57,13 +56,13 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2
; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
@ -81,7 +80,7 @@ define i64 @sdiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v3, 0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v7, v10, v2
@ -175,7 +174,6 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GFX9-NEXT: v_rcp_f32_e32 v4, v4
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
@ -190,32 +188,32 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
@ -224,7 +222,7 @@ define i64 @udiv64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6
@ -318,7 +316,6 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc
; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
@ -333,11 +330,11 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v6, v2
@ -348,13 +345,13 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v5, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v5, 0
; GFX9-NEXT: v_mul_hi_u32 v14, v12, v2
; GFX9-NEXT: v_mul_hi_u32 v13, v12, v2
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v2, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v12, v2
@ -372,7 +369,7 @@ define i64 @srem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v2, v9, v0
@ -462,7 +459,6 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GFX9-NEXT: v_rcp_f32_e32 v4, v4
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
@ -477,32 +473,32 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
@ -511,7 +507,7 @@ define i64 @urem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v3, v4
@ -728,7 +724,6 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc
; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-NEXT: v_mov_b32_e32 v14, 0
; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
@ -743,11 +738,11 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v13, v3
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v4, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v5, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v15, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v14, v3, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v13, vcc, v6, v2
@ -758,13 +753,13 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_add3_u32 v5, v3, v4, v5
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v5, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v5, 0
; GFX9-NEXT: v_mul_hi_u32 v15, v13, v2
; GFX9-NEXT: v_mul_hi_u32 v14, v13, v2
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v2, 0
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v15, v5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v14, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v13, v2
@ -782,7 +777,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v14, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v2
@ -896,7 +891,6 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc
; GFX9-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
; GFX9-NEXT: v_rcp_f32_e32 v4, v4
; GFX9-NEXT: v_mov_b32_e32 v13, 0
; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
@ -911,32 +905,32 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v5
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v4, 0
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v6, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v6, vcc
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v9, v4
; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v14
; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, v8, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v10, v13
; GFX9-NEXT: v_mul_lo_u32 v7, v11, v12
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v12, 0
; GFX9-NEXT: v_add3_u32 v7, v5, v6, v7
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v7, 0
; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v11, v12, v4
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v4, 0
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v4, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v11, v7
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v10, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v12, v4
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v14, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v5, vcc
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v7, 0
; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v4
@ -945,7 +939,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v7, 0
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v13, vcc
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_lo_u32 v8, v3, v6

View File

@ -5,7 +5,7 @@
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: s_mul_i32
; CHECK: s_sub_i32
; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: s_and_b32 [[S1:s[0-9]+]], {{s[0-9]+}}, 0xffff
; CHECK: s_add_i32 [[S2:s[0-9]+]], {{s[0-9]+}}, [[S1]]
; CHECK: s_or_b32 {{s[0-9]+}}, [[S2]], 0xc0

View File

@ -226,7 +226,6 @@ define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
; GCN-LABEL: {{^}}load_sampler
; GCN: v_readfirstlane_b32
; GCN-NEXT: v_readfirstlane_b32
; SI: s_nop
; GCN: s_load_dwordx8
; GCN-NEXT: s_load_dwordx4
@ -260,7 +259,6 @@ main_body:
; GCN-LABEL: {{^}}load_sampler_nouniform
; GCN: v_readfirstlane_b32
; GCN-NEXT: v_readfirstlane_b32
; SI: s_nop
; GCN: s_load_dwordx8
; GCN-NEXT: s_load_dwordx4

View File

@ -1500,15 +1500,13 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%val = load i16, i16 addrspace(1)* %valptr
@ -1604,19 +1602,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out,
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 25, v1
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -1492,14 +1492,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX10-GISEL-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-GISEL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, s2, vcc_lo
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%val = load i16, i16 addrspace(1)* %valptr
@ -1595,18 +1593,17 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out,
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX10-GISEL-NEXT: s_movk_i32 s2, 0x7f
; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0
; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x80, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0
; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -1205,7 +1205,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v5, 9
; VI-NEXT: s_movk_i32 s8, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@ -1232,8 +1231,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_add_u16_e32 v0, s8, v0
; VI-NEXT: v_mov_b32_e32 v2, 0x900
; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@ -1250,7 +1249,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX10-NEXT: s_movk_i32 s0, 0x900
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@ -1261,8 +1259,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; GFX10-NEXT: v_add_nc_u16 v1, v1, s0
; GFX10-NEXT: v_add_nc_u16 v5, v2, s0
; GFX10-NEXT: v_add_nc_u16 v1, v1, 0x900
; GFX10-NEXT: v_add_nc_u16 v5, v2, 0x900
; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0

View File

@ -71,21 +71,20 @@ define <4 x i16> @extract_4xi16(<8 x i16> addrspace(1) * %p0, <8 x i16> addrspac
; SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; SI-NEXT: s_mov_b32 s4, 0xffff
; SI-NEXT: v_mov_b32_e32 v3, 0x8000
; SI-NEXT: v_mov_b32_e32 v4, 0xffff0000
; SI-NEXT: v_bfrev_b32_e32 v5, 1
; SI-NEXT: v_mov_b32_e32 v6, 0xffff8000
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: v_mov_b32_e32 v3, 0xffff
; SI-NEXT: v_mov_b32_e32 v4, 0x8000
; SI-NEXT: v_mov_b32_e32 v5, 0xffff0000
; SI-NEXT: v_bfrev_b32_e32 v6, 1
; SI-NEXT: v_mov_b32_e32 v7, 0xffff8000
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; SI-NEXT: v_cmp_lt_i32_e32 vcc, -1, v2
; SI-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc
; SI-NEXT: v_cndmask_b32_e32 v2, -1, v7, vcc
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; SI-NEXT: v_and_b32_e32 v2, s4, v2
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2

View File

@ -42,9 +42,8 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
; CI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
; GFX89: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x8
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], 0x7fff7fff
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], 0x7fff7fff
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)

View File

@ -30,8 +30,8 @@ define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
}
; FUNC-LABEL: {{^}}fabs_v2f64:
; SI: s_and_b32
; SI: s_and_b32
; SI: s_bitset0_b32
; SI: s_bitset0_b32
; SI: s_endpgm
define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
@ -40,10 +40,10 @@ define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
}
; FUNC-LABEL: {{^}}fabs_v4f64:
; SI: s_and_b32
; SI: s_and_b32
; SI: s_and_b32
; SI: s_and_b32
; SI: s_bitset0_b32
; SI: s_bitset0_b32
; SI: s_bitset0_b32
; SI: s_bitset0_b32
; SI: s_endpgm
define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)

View File

@ -45,8 +45,8 @@ define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@ -59,10 +59,10 @@ define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_and_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
; GCN: s_bitset0_b32
define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
store <4 x float> %fabs, <4 x float> addrspace(1)* %out

View File

@ -107,10 +107,9 @@ define <2 x half> @v_exp_v2f16(<2 x half> %arg0) {
; VI-LABEL: v_exp_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0x3dc5
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v1, 0x3dc5
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, s4, v0
; VI-NEXT: v_mul_f16_e32 v0, 0x3dc5, v0
; VI-NEXT: v_exp_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_exp_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
@ -161,7 +160,7 @@ define <4 x half> @v_exp_v4f16(<4 x half> %arg0) {
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0x3dc5
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x3dc5
; VI-NEXT: v_mul_f16_e32 v2, s4, v1
; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v4, s4, v0

View File

@ -532,12 +532,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 4
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -585,12 +584,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4
; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
@ -639,12 +637,11 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, s32
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -681,12 +678,11 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s32
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
@ -1387,14 +1383,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x104
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1447,15 +1442,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
@ -1472,14 +1466,13 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104
; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
@ -1524,15 +1517,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -1574,15 +1567,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
@ -2277,14 +2270,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -2338,15 +2330,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
@ -2363,14 +2354,13 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
@ -2415,15 +2405,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -2467,15 +2457,15 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)

View File

@ -970,7 +970,7 @@ define amdgpu_kernel void @two_non_inline_constant(float addrspace(1)* %out, flo
; FIXME: Simple stores do not work as a multiple use because they are bitcasted to integer constants.
; GCN-LABEL: {{^}}one_non_inline_constant:
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x41800000
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 1.0, [[K1]]
define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
@ -990,9 +990,8 @@ define amdgpu_kernel void @one_non_inline_constant(float addrspace(1)* %out, flo
}
; GCN-LABEL: {{^}}two_non_inline_constant_multi_use:
; GCN-DAG: s_mov_b32 [[K1:s[0-9]+]], 0x41800000
; GCN-DAG: s_mov_b32 [[K0:s[0-9]+]], 0x41000000
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], [[K1]]
; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x41800000
; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], 0.5,
; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], [[K0]], [[VK1]]
define amdgpu_kernel void @two_non_inline_constant_multi_use(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {

View File

@ -97,9 +97,8 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %ou
}
; GCN-LABEL: {{^}}fneg_fabs_v4f16:
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; GCN: {{flat|global}}_store_dwordx2
define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)

View File

@ -69,10 +69,9 @@ define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], d
}
; GCN-LABEL: {{^}}fneg_fabs_v2f64:
; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}}
; GCN-NOT: 0x80000000
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
%fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
@ -81,12 +80,11 @@ define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x
}
; GCN-LABEL: {{^}}fneg_fabs_v4f64:
; GCN: s_brev_b32 [[IMMREG:s[0-9]+]], 1{{$}}
; GCN-NOT: 0x80000000
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
; GCN: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
%fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs

View File

@ -84,9 +84,8 @@ define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; R600: -PV
; FIXME: In this case two uses of the constant should be folded
; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
%fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
@ -95,11 +94,10 @@ define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x
}
; FUNC-LABEL: {{^}}fneg_fabs_v4f32:
; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[SIGNBITK]]
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
; SI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
%fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs

View File

@ -18,9 +18,8 @@ define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) {
; R600: -PV
; R600: -PV
; GCN: s_brev_b32 [[SIGNBIT:s[0-9]+]], 1
; GCN: s_xor_b32
; GCN: s_xor_b32
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
%fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
store <2 x float> %fneg, <2 x float> addrspace(1)* %out
@ -33,10 +32,10 @@ define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out
; R600: -PV
; R600: -PV
; GCN: s_xor_b32
; GCN: s_xor_b32
; GCN: s_xor_b32
; GCN: s_xor_b32
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
; GCN: s_xor_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x80000000
define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
%fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
store <4 x float> %fneg, <4 x float> addrspace(1)* %out

View File

@ -36,9 +36,10 @@ body: |
; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 12345
; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN: [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GCN: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF]], 0, implicit $exec
; GCN: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[DEF1]], 0, implicit $exec
; GCN: S_ENDPGM 0, implicit [[V_ADD_CO_U32_e64_1]], implicit [[V_ADD_CO_U32_e64_2]]
; GCN: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF]], implicit-def $vcc, implicit $exec
; GCN: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY killed $vcc
; GCN: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], [[DEF1]], implicit-def $vcc, implicit $exec
; GCN: S_ENDPGM 0, implicit [[COPY]], implicit [[V_ADD_CO_U32_e32_1]]
%0:sreg_32_xm0 = S_MOV_B32 12345
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF

View File

@ -1344,16 +1344,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; SI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
; SI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
; SI-NEXT: v_rcp_f32_e32 v6, v5
; SI-NEXT: s_mov_b32 s6, 3
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; SI-NEXT: v_fma_f32 v6, v7, v6, v6
; SI-NEXT: v_mul_f32_e32 v7, v4, v6
; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
; SI-NEXT: v_fma_f32 v7, v8, v6, v7
; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
; SI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
; SI-NEXT: v_trunc_f32_e32 v4, v4
@ -1364,14 +1362,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; SI-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
; SI-NEXT: v_div_scale_f32 v4, s[4:5], v3, v3, v1
; SI-NEXT: v_rcp_f32_e32 v5, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; SI-NEXT: v_fma_f32 v5, v6, v5, v5
; SI-NEXT: v_mul_f32_e32 v6, v2, v5
; SI-NEXT: v_fma_f32 v7, -v4, v6, v2
; SI-NEXT: v_fma_f32 v6, v7, v5, v6
; SI-NEXT: v_fma_f32 v2, -v4, v6, v2
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
; SI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
; SI-NEXT: v_trunc_f32_e32 v2, v2
@ -1398,8 +1396,6 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; CI-NEXT: s_mov_b32 s11, s3
; CI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; CI-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:16
; CI-NEXT: s_mov_b32 s6, 3
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@ -1411,14 +1407,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; CI-NEXT: v_div_scale_f32 v5, s[4:5], v2, v2, v0
; CI-NEXT: v_div_scale_f32 v4, vcc, v0, v2, v0
; CI-NEXT: v_rcp_f32_e32 v6, v5
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; CI-NEXT: v_fma_f32 v6, v7, v6, v6
; CI-NEXT: v_mul_f32_e32 v7, v4, v6
; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
; CI-NEXT: v_fma_f32 v7, v8, v6, v7
; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, v0
; CI-NEXT: v_trunc_f32_e32 v4, v4
@ -1429,14 +1425,14 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_rcp_f32_e32 v5, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; CI-NEXT: v_fma_f32 v5, v6, v5, v5
; CI-NEXT: v_mul_f32_e32 v6, v2, v5
; CI-NEXT: v_fma_f32 v7, -v4, v6, v2
; CI-NEXT: v_fma_f32 v6, v7, v5, v6
; CI-NEXT: v_fma_f32 v2, -v4, v6, v2
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v2, v2, v5, v6
; CI-NEXT: v_div_fixup_f32 v2, v2, v3, v1
; CI-NEXT: v_trunc_f32_e32 v2, v2
@ -1595,16 +1591,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; SI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
; SI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
; SI-NEXT: v_rcp_f32_e32 v10, v9
; SI-NEXT: s_mov_b32 s6, 3
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; SI-NEXT: v_fma_f32 v10, v11, v10, v10
; SI-NEXT: v_mul_f32_e32 v11, v8, v10
; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
; SI-NEXT: v_fma_f32 v11, v12, v10, v11
; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
; SI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
; SI-NEXT: v_trunc_f32_e32 v8, v8
@ -1615,14 +1609,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; SI-NEXT: v_div_scale_f32 v5, vcc, v4, v7, v4
; SI-NEXT: v_div_scale_f32 v8, s[4:5], v7, v7, v4
; SI-NEXT: v_rcp_f32_e32 v9, v8
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
; SI-NEXT: v_fma_f32 v9, v10, v9, v9
; SI-NEXT: v_mul_f32_e32 v10, v5, v9
; SI-NEXT: v_fma_f32 v11, -v8, v10, v5
; SI-NEXT: v_fma_f32 v10, v11, v9, v10
; SI-NEXT: v_fma_f32 v5, -v8, v10, v5
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
; SI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
; SI-NEXT: v_trunc_f32_e32 v5, v5
@ -1632,14 +1626,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; SI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
; SI-NEXT: v_div_scale_f32 v5, s[4:5], v0, v0, v3
; SI-NEXT: v_rcp_f32_e32 v7, v5
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
; SI-NEXT: v_fma_f32 v7, v8, v7, v7
; SI-NEXT: v_mul_f32_e32 v8, v4, v7
; SI-NEXT: v_fma_f32 v9, -v5, v8, v4
; SI-NEXT: v_fma_f32 v8, v9, v7, v8
; SI-NEXT: v_fma_f32 v4, -v5, v8, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
; SI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
; SI-NEXT: v_trunc_f32_e32 v4, v4
@ -1649,14 +1643,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; SI-NEXT: v_div_scale_f32 v3, vcc, v2, v6, v2
; SI-NEXT: v_div_scale_f32 v4, s[4:5], v6, v6, v2
; SI-NEXT: v_rcp_f32_e32 v5, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; SI-NEXT: v_fma_f32 v5, v7, v5, v5
; SI-NEXT: v_mul_f32_e32 v7, v3, v5
; SI-NEXT: v_fma_f32 v8, -v4, v7, v3
; SI-NEXT: v_fma_f32 v7, v8, v5, v7
; SI-NEXT: v_fma_f32 v3, -v4, v7, v3
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
; SI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
; SI-NEXT: v_trunc_f32_e32 v3, v3
@ -1682,8 +1676,6 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; CI-NEXT: s_mov_b32 s7, s3
; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT: s_mov_b32 s11, s3
; CI-NEXT: s_mov_b32 s6, 3
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@ -1702,14 +1694,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; CI-NEXT: v_div_scale_f32 v9, s[4:5], v1, v1, v5
; CI-NEXT: v_div_scale_f32 v8, vcc, v5, v1, v5
; CI-NEXT: v_rcp_f32_e32 v10, v9
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; CI-NEXT: v_fma_f32 v10, v11, v10, v10
; CI-NEXT: v_mul_f32_e32 v11, v8, v10
; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
; CI-NEXT: v_fma_f32 v11, v12, v10, v11
; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
; CI-NEXT: v_div_fixup_f32 v8, v8, v1, v5
; CI-NEXT: v_trunc_f32_e32 v8, v8
@ -1720,14 +1712,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_rcp_f32_e32 v9, v8
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
; CI-NEXT: v_fma_f32 v9, v10, v9, v9
; CI-NEXT: v_mul_f32_e32 v10, v5, v9
; CI-NEXT: v_fma_f32 v11, -v8, v10, v5
; CI-NEXT: v_fma_f32 v10, v11, v9, v10
; CI-NEXT: v_fma_f32 v5, -v8, v10, v5
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v5, v5, v9, v10
; CI-NEXT: v_div_fixup_f32 v5, v5, v7, v4
; CI-NEXT: v_trunc_f32_e32 v5, v5
@ -1737,14 +1729,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; CI-NEXT: v_or_b32_e32 v1, v4, v1
; CI-NEXT: v_div_scale_f32 v4, vcc, v3, v0, v3
; CI-NEXT: v_rcp_f32_e32 v7, v5
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v8, -v5, v7, 1.0
; CI-NEXT: v_fma_f32 v7, v8, v7, v7
; CI-NEXT: v_mul_f32_e32 v8, v4, v7
; CI-NEXT: v_fma_f32 v9, -v5, v8, v4
; CI-NEXT: v_fma_f32 v8, v9, v7, v8
; CI-NEXT: v_fma_f32 v4, -v5, v8, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v4, v4, v7, v8
; CI-NEXT: v_div_fixup_f32 v4, v4, v0, v3
; CI-NEXT: v_trunc_f32_e32 v4, v4
@ -1754,14 +1746,14 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_rcp_f32_e32 v5, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; CI-NEXT: v_fma_f32 v5, v7, v5, v5
; CI-NEXT: v_mul_f32_e32 v7, v3, v5
; CI-NEXT: v_fma_f32 v8, -v4, v7, v3
; CI-NEXT: v_fma_f32 v7, v8, v5, v7
; CI-NEXT: v_fma_f32 v3, -v4, v7, v3
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v7
; CI-NEXT: v_div_fixup_f32 v3, v3, v6, v2
; CI-NEXT: v_trunc_f32_e32 v3, v3
@ -1965,16 +1957,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; SI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
; SI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
; SI-NEXT: v_rcp_f32_e32 v6, v5
; SI-NEXT: s_mov_b32 s6, 3
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; SI-NEXT: v_fma_f32 v6, v7, v6, v6
; SI-NEXT: v_mul_f32_e32 v7, v4, v6
; SI-NEXT: v_fma_f32 v8, -v5, v7, v4
; SI-NEXT: v_fma_f32 v7, v8, v6, v7
; SI-NEXT: v_fma_f32 v4, -v5, v7, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
; SI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
; SI-NEXT: v_trunc_f32_e32 v4, v4
@ -1982,14 +1972,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; SI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
; SI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; SI-NEXT: v_rcp_f32_e32 v5, v4
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; SI-NEXT: v_fma_f32 v5, v6, v5, v5
; SI-NEXT: v_mul_f32_e32 v6, v3, v5
; SI-NEXT: v_fma_f32 v7, -v4, v6, v3
; SI-NEXT: v_fma_f32 v6, v7, v5, v6
; SI-NEXT: v_fma_f32 v3, -v4, v6, v3
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; SI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; SI-NEXT: v_trunc_f32_e32 v3, v3
@ -2014,20 +2004,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; CI-NEXT: s_mov_b32 s11, s3
; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:32
; CI-NEXT: s_mov_b32 s6, 3
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
; CI-NEXT: v_div_scale_f32 v4, vcc, v1, v3, v1
; CI-NEXT: v_rcp_f32_e32 v6, v5
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; CI-NEXT: v_fma_f32 v6, v7, v6, v6
; CI-NEXT: v_mul_f32_e32 v7, v4, v6
; CI-NEXT: v_fma_f32 v8, -v5, v7, v4
; CI-NEXT: v_fma_f32 v7, v8, v6, v7
; CI-NEXT: v_fma_f32 v4, -v5, v7, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v4, v4, v6, v7
; CI-NEXT: v_div_fixup_f32 v4, v4, v3, v1
; CI-NEXT: v_trunc_f32_e32 v4, v4
@ -2035,14 +2023,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; CI-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
; CI-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
; CI-NEXT: v_rcp_f32_e32 v5, v4
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v6, -v4, v5, 1.0
; CI-NEXT: v_fma_f32 v5, v6, v5, v5
; CI-NEXT: v_mul_f32_e32 v6, v3, v5
; CI-NEXT: v_fma_f32 v7, -v4, v6, v3
; CI-NEXT: v_fma_f32 v6, v7, v5, v6
; CI-NEXT: v_fma_f32 v3, -v4, v6, v3
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; CI-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; CI-NEXT: v_trunc_f32_e32 v3, v3
@ -2054,8 +2042,6 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s2, 3
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_add_u32 s0, s0, 32
@ -2071,14 +2057,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v3
; VI-NEXT: v_div_scale_f32 v6, vcc, v3, v5, v3
; VI-NEXT: v_rcp_f32_e32 v8, v7
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
; VI-NEXT: v_fma_f32 v8, v9, v8, v8
; VI-NEXT: v_mul_f32_e32 v9, v6, v8
; VI-NEXT: v_fma_f32 v10, -v7, v9, v6
; VI-NEXT: v_fma_f32 v9, v10, v8, v9
; VI-NEXT: v_fma_f32 v6, -v7, v9, v6
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v3
; VI-NEXT: v_trunc_f32_e32 v6, v6
@ -2086,14 +2072,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v2
; VI-NEXT: v_div_scale_f32 v5, vcc, v2, v4, v2
; VI-NEXT: v_rcp_f32_e32 v7, v6
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
; VI-NEXT: v_fma_f32 v7, v8, v7, v7
; VI-NEXT: v_mul_f32_e32 v8, v5, v7
; VI-NEXT: v_fma_f32 v9, -v6, v8, v5
; VI-NEXT: v_fma_f32 v8, v9, v7, v8
; VI-NEXT: v_fma_f32 v5, -v6, v8, v5
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v2
; VI-NEXT: v_trunc_f32_e32 v5, v5
@ -2109,20 +2095,18 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:32
; GFX9-NEXT: s_mov_b32 s2, 3
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v3, v3, v1
; GFX9-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
; GFX9-NEXT: v_rcp_f32_e32 v7, v6
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v8, -v6, v7, 1.0
; GFX9-NEXT: v_fma_f32 v7, v8, v7, v7
; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; GFX9-NEXT: v_div_fixup_f32 v5, v5, v3, v1
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
@ -2130,14 +2114,14 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX9-NEXT: v_div_scale_f32 v5, s[0:1], v2, v2, v0
; GFX9-NEXT: v_div_scale_f32 v3, vcc, v0, v2, v0
; GFX9-NEXT: v_rcp_f32_e32 v6, v5
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v7, -v5, v6, 1.0
; GFX9-NEXT: v_fma_f32 v6, v7, v6, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6
; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3
; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v3, v3, v6, v7
; GFX9-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
@ -2219,16 +2203,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; SI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
; SI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
; SI-NEXT: v_rcp_f32_e32 v10, v9
; SI-NEXT: s_mov_b32 s6, 3
; SI-NEXT: s_mov_b32 s7, 0
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; SI-NEXT: v_fma_f32 v10, v11, v10, v10
; SI-NEXT: v_mul_f32_e32 v11, v8, v10
; SI-NEXT: v_fma_f32 v12, -v9, v11, v8
; SI-NEXT: v_fma_f32 v11, v12, v10, v11
; SI-NEXT: v_fma_f32 v8, -v9, v11, v8
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
; SI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
; SI-NEXT: v_trunc_f32_e32 v8, v8
@ -2236,14 +2218,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; SI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
; SI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
; SI-NEXT: v_rcp_f32_e32 v9, v8
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
; SI-NEXT: v_fma_f32 v9, v10, v9, v9
; SI-NEXT: v_mul_f32_e32 v10, v7, v9
; SI-NEXT: v_fma_f32 v11, -v8, v10, v7
; SI-NEXT: v_fma_f32 v10, v11, v9, v10
; SI-NEXT: v_fma_f32 v7, -v8, v10, v7
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
; SI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; SI-NEXT: v_trunc_f32_e32 v7, v7
@ -2251,14 +2233,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; SI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
; SI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
; SI-NEXT: v_rcp_f32_e32 v8, v7
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
; SI-NEXT: v_fma_f32 v8, v9, v8, v8
; SI-NEXT: v_mul_f32_e32 v9, v6, v8
; SI-NEXT: v_fma_f32 v10, -v7, v9, v6
; SI-NEXT: v_fma_f32 v9, v10, v8, v9
; SI-NEXT: v_fma_f32 v6, -v7, v9, v6
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
; SI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; SI-NEXT: v_trunc_f32_e32 v6, v6
@ -2266,14 +2248,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; SI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
; SI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
; SI-NEXT: v_rcp_f32_e32 v7, v6
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; SI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
; SI-NEXT: v_fma_f32 v7, v8, v7, v7
; SI-NEXT: v_mul_f32_e32 v8, v5, v7
; SI-NEXT: v_fma_f32 v9, -v6, v8, v5
; SI-NEXT: v_fma_f32 v8, v9, v7, v8
; SI-NEXT: v_fma_f32 v5, -v6, v8, v5
; SI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; SI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; SI-NEXT: v_trunc_f32_e32 v5, v5
@ -2298,20 +2280,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; CI-NEXT: s_mov_b32 s11, s3
; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64
; CI-NEXT: s_mov_b32 s6, 3
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_div_scale_f32 v9, s[4:5], v7, v7, v3
; CI-NEXT: v_div_scale_f32 v8, vcc, v3, v7, v3
; CI-NEXT: v_rcp_f32_e32 v10, v9
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; CI-NEXT: v_fma_f32 v10, v11, v10, v10
; CI-NEXT: v_mul_f32_e32 v11, v8, v10
; CI-NEXT: v_fma_f32 v12, -v9, v11, v8
; CI-NEXT: v_fma_f32 v11, v12, v10, v11
; CI-NEXT: v_fma_f32 v8, -v9, v11, v8
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v8, v8, v10, v11
; CI-NEXT: v_div_fixup_f32 v8, v8, v7, v3
; CI-NEXT: v_trunc_f32_e32 v8, v8
@ -2319,14 +2299,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; CI-NEXT: v_div_scale_f32 v8, s[4:5], v6, v6, v2
; CI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
; CI-NEXT: v_rcp_f32_e32 v9, v8
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v10, -v8, v9, 1.0
; CI-NEXT: v_fma_f32 v9, v10, v9, v9
; CI-NEXT: v_mul_f32_e32 v10, v7, v9
; CI-NEXT: v_fma_f32 v11, -v8, v10, v7
; CI-NEXT: v_fma_f32 v10, v11, v9, v10
; CI-NEXT: v_fma_f32 v7, -v8, v10, v7
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v7, v7, v9, v10
; CI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; CI-NEXT: v_trunc_f32_e32 v7, v7
@ -2334,14 +2314,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; CI-NEXT: v_div_scale_f32 v7, s[4:5], v5, v5, v1
; CI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
; CI-NEXT: v_rcp_f32_e32 v8, v7
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v9, -v7, v8, 1.0
; CI-NEXT: v_fma_f32 v8, v9, v8, v8
; CI-NEXT: v_mul_f32_e32 v9, v6, v8
; CI-NEXT: v_fma_f32 v10, -v7, v9, v6
; CI-NEXT: v_fma_f32 v9, v10, v8, v9
; CI-NEXT: v_fma_f32 v6, -v7, v9, v6
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v6, v6, v8, v9
; CI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; CI-NEXT: v_trunc_f32_e32 v6, v6
@ -2349,14 +2329,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; CI-NEXT: v_div_scale_f32 v6, s[4:5], v4, v4, v0
; CI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
; CI-NEXT: v_rcp_f32_e32 v7, v6
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s6
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; CI-NEXT: v_fma_f32 v8, -v6, v7, 1.0
; CI-NEXT: v_fma_f32 v7, v8, v7, v7
; CI-NEXT: v_mul_f32_e32 v8, v5, v7
; CI-NEXT: v_fma_f32 v9, -v6, v8, v5
; CI-NEXT: v_fma_f32 v8, v9, v7, v8
; CI-NEXT: v_fma_f32 v5, -v6, v8, v5
; CI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s7
; CI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; CI-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; CI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; CI-NEXT: v_trunc_f32_e32 v5, v5
@ -2368,8 +2348,6 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s2, 3
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_add_u32 s0, s0, 64
@ -2385,14 +2363,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; VI-NEXT: v_div_scale_f32 v11, s[0:1], v7, v7, v3
; VI-NEXT: v_div_scale_f32 v10, vcc, v3, v7, v3
; VI-NEXT: v_rcp_f32_e32 v12, v11
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v13, -v11, v12, 1.0
; VI-NEXT: v_fma_f32 v12, v13, v12, v12
; VI-NEXT: v_mul_f32_e32 v13, v10, v12
; VI-NEXT: v_fma_f32 v14, -v11, v13, v10
; VI-NEXT: v_fma_f32 v13, v14, v12, v13
; VI-NEXT: v_fma_f32 v10, -v11, v13, v10
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v10, v10, v12, v13
; VI-NEXT: v_div_fixup_f32 v10, v10, v7, v3
; VI-NEXT: v_trunc_f32_e32 v10, v10
@ -2400,14 +2378,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; VI-NEXT: v_div_scale_f32 v10, s[0:1], v6, v6, v2
; VI-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
; VI-NEXT: v_rcp_f32_e32 v11, v10
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v12, -v10, v11, 1.0
; VI-NEXT: v_fma_f32 v11, v12, v11, v11
; VI-NEXT: v_mul_f32_e32 v12, v7, v11
; VI-NEXT: v_fma_f32 v13, -v10, v12, v7
; VI-NEXT: v_fma_f32 v12, v13, v11, v12
; VI-NEXT: v_fma_f32 v7, -v10, v12, v7
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v7, v7, v11, v12
; VI-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; VI-NEXT: v_trunc_f32_e32 v7, v7
@ -2415,14 +2393,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; VI-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
; VI-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
; VI-NEXT: v_rcp_f32_e32 v10, v7
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v11, -v7, v10, 1.0
; VI-NEXT: v_fma_f32 v10, v11, v10, v10
; VI-NEXT: v_mul_f32_e32 v11, v6, v10
; VI-NEXT: v_fma_f32 v12, -v7, v11, v6
; VI-NEXT: v_fma_f32 v11, v12, v10, v11
; VI-NEXT: v_fma_f32 v6, -v7, v11, v6
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v6, v6, v10, v11
; VI-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; VI-NEXT: v_trunc_f32_e32 v6, v6
@ -2430,14 +2408,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; VI-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
; VI-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
; VI-NEXT: v_rcp_f32_e32 v7, v6
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v10, -v6, v7, 1.0
; VI-NEXT: v_fma_f32 v7, v10, v7, v7
; VI-NEXT: v_mul_f32_e32 v10, v5, v7
; VI-NEXT: v_fma_f32 v11, -v6, v10, v5
; VI-NEXT: v_fma_f32 v10, v11, v7, v10
; VI-NEXT: v_fma_f32 v5, -v6, v10, v5
; VI-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v5, v5, v7, v10
; VI-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; VI-NEXT: v_trunc_f32_e32 v5, v5
@ -2453,20 +2431,18 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7]
; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:64
; GFX9-NEXT: s_mov_b32 s2, 3
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_div_scale_f32 v10, s[0:1], v7, v7, v3
; GFX9-NEXT: v_div_scale_f32 v9, vcc, v3, v7, v3
; GFX9-NEXT: v_rcp_f32_e32 v11, v10
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v12, -v10, v11, 1.0
; GFX9-NEXT: v_fma_f32 v11, v12, v11, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v9, v11
; GFX9-NEXT: v_fma_f32 v13, -v10, v12, v9
; GFX9-NEXT: v_fma_f32 v12, v13, v11, v12
; GFX9-NEXT: v_fma_f32 v9, -v10, v12, v9
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v9, v9, v11, v12
; GFX9-NEXT: v_div_fixup_f32 v9, v9, v7, v3
; GFX9-NEXT: v_trunc_f32_e32 v9, v9
@ -2474,14 +2450,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX9-NEXT: v_div_scale_f32 v9, s[0:1], v6, v6, v2
; GFX9-NEXT: v_div_scale_f32 v7, vcc, v2, v6, v2
; GFX9-NEXT: v_rcp_f32_e32 v10, v9
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v11, -v9, v10, 1.0
; GFX9-NEXT: v_fma_f32 v10, v11, v10, v10
; GFX9-NEXT: v_mul_f32_e32 v11, v7, v10
; GFX9-NEXT: v_fma_f32 v12, -v9, v11, v7
; GFX9-NEXT: v_fma_f32 v11, v12, v10, v11
; GFX9-NEXT: v_fma_f32 v7, -v9, v11, v7
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v7, v7, v10, v11
; GFX9-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; GFX9-NEXT: v_trunc_f32_e32 v7, v7
@ -2489,14 +2465,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX9-NEXT: v_div_scale_f32 v7, s[0:1], v5, v5, v1
; GFX9-NEXT: v_div_scale_f32 v6, vcc, v1, v5, v1
; GFX9-NEXT: v_rcp_f32_e32 v9, v7
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v10, -v7, v9, 1.0
; GFX9-NEXT: v_fma_f32 v9, v10, v9, v9
; GFX9-NEXT: v_mul_f32_e32 v10, v6, v9
; GFX9-NEXT: v_fma_f32 v11, -v7, v10, v6
; GFX9-NEXT: v_fma_f32 v10, v11, v9, v10
; GFX9-NEXT: v_fma_f32 v6, -v7, v10, v6
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v6, v6, v9, v10
; GFX9-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; GFX9-NEXT: v_trunc_f32_e32 v6, v6
@ -2504,14 +2480,14 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX9-NEXT: v_div_scale_f32 v6, s[0:1], v4, v4, v0
; GFX9-NEXT: v_div_scale_f32 v5, vcc, v0, v4, v0
; GFX9-NEXT: v_rcp_f32_e32 v7, v6
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s2
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX9-NEXT: v_fma_f32 v9, -v6, v7, 1.0
; GFX9-NEXT: v_fma_f32 v7, v9, v7, v7
; GFX9-NEXT: v_mul_f32_e32 v9, v5, v7
; GFX9-NEXT: v_fma_f32 v10, -v6, v9, v5
; GFX9-NEXT: v_fma_f32 v9, v10, v7, v9
; GFX9-NEXT: v_fma_f32 v5, -v6, v9, v5
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s3
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX9-NEXT: v_div_fmas_f32 v5, v5, v7, v9
; GFX9-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
@ -2636,16 +2612,15 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_bfe_u32 v10, v9, 20, 11
; SI-NEXT: s_movk_i32 s8, 0xfc01
; SI-NEXT: v_add_i32_e32 v12, vcc, s8, v10
; SI-NEXT: v_add_i32_e32 v12, vcc, 0xfffffc01, v10
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_lshr_b64 v[10:11], s[2:3], v12
; SI-NEXT: v_not_b32_e32 v10, v10
; SI-NEXT: v_and_b32_e32 v10, v8, v10
; SI-NEXT: v_not_b32_e32 v11, v11
; SI-NEXT: v_and_b32_e32 v11, v9, v11
; SI-NEXT: s_brev_b32 s9, 1
; SI-NEXT: v_and_b32_e32 v13, s9, v9
; SI-NEXT: s_brev_b32 s8, 1
; SI-NEXT: v_and_b32_e32 v13, s8, v9
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v12
; SI-NEXT: v_cndmask_b32_e32 v11, v11, v13, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v12
@ -2669,13 +2644,13 @@ define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x doub
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT: v_bfe_u32 v8, v7, 20, 11
; SI-NEXT: v_add_i32_e32 v10, vcc, s8, v8
; SI-NEXT: v_add_i32_e32 v10, vcc, 0xfffffc01, v8
; SI-NEXT: v_lshr_b64 v[8:9], s[2:3], v10
; SI-NEXT: v_not_b32_e32 v8, v8
; SI-NEXT: v_and_b32_e32 v8, v6, v8
; SI-NEXT: v_not_b32_e32 v9, v9
; SI-NEXT: v_and_b32_e32 v9, v7, v9
; SI-NEXT: v_and_b32_e32 v11, s9, v7
; SI-NEXT: v_and_b32_e32 v11, s8, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v10
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], 51, v10

View File

@ -716,10 +716,9 @@ define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_xor_b32_e32 v3, -1, v2
; GFX10-NEXT: s_mov_b32 s4, 0xf000f
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
; GFX10-NEXT: v_and_b32_e32 v2, s4, v2
; GFX10-NEXT: v_and_b32_e32 v3, s4, v3
; GFX10-NEXT: v_and_b32_e32 v2, 0xf000f, v2
; GFX10-NEXT: v_and_b32_e32 v3, 0xf000f, v3
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v2, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v3, v0
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
@ -929,37 +928,36 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v0
; GFX10-NEXT: v_lshlrev_b16 v6, 1, v6
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v7
; GFX10-NEXT: v_lshrrev_b16 v7, v7, v8
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v8, -1, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GFX10-NEXT: v_lshlrev_b16 v1, 1, v1
; GFX10-NEXT: v_lshlrev_b16 v6, v9, v6
; GFX10-NEXT: v_xor_b32_e32 v9, -1, v5
; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_lshlrev_b16 v12, 1, v12
; GFX10-NEXT: v_xor_b32_e32 v13, -1, v11
; GFX10-NEXT: v_lshlrev_b16 v0, v8, v0
; GFX10-NEXT: v_xor_b32_e32 v11, -1, v5
; GFX10-NEXT: v_lshlrev_b16 v0, 1, v0
; GFX10-NEXT: v_xor_b32_e32 v12, -1, v4
; GFX10-NEXT: v_lshrrev_b16 v6, v7, v6
; GFX10-NEXT: v_lshlrev_b16 v8, 1, v8
; GFX10-NEXT: v_xor_b32_e32 v7, -1, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GFX10-NEXT: v_lshlrev_b16 v10, 1, v10
; GFX10-NEXT: v_xor_b32_e32 v14, -1, v9
; GFX10-NEXT: v_lshlrev_b16 v1, v11, v1
; GFX10-NEXT: v_lshlrev_b16 v0, v12, v0
; GFX10-NEXT: v_lshrrev_b16 v2, v4, v2
; GFX10-NEXT: v_lshlrev_b16 v1, v9, v1
; GFX10-NEXT: v_lshrrev_b16 v3, v5, v3
; GFX10-NEXT: v_lshrrev_b16 v4, v11, v10
; GFX10-NEXT: v_lshlrev_b16 v5, v13, v12
; GFX10-NEXT: v_lshlrev_b16 v4, v7, v8
; GFX10-NEXT: v_lshrrev_b16 v5, v9, v13
; GFX10-NEXT: v_lshlrev_b16 v7, v14, v10
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v3, v6, v7
; GFX10-NEXT: v_or_b32_e32 v4, v5, v4
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_and_b32_e32 v1, v2, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX10-NEXT: v_or_b32_e32 v2, v4, v6
; GFX10-NEXT: v_or_b32_e32 v3, v7, v5
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
%ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
ret <4 x i16> %ret
@ -1241,14 +1239,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_mov_b32 s4, 0xffffff
; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4
; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT: v_and_b32_e32 v6, s4, v4
; GFX10-NEXT: v_and_b32_e32 v7, s4, v5
; GFX10-NEXT: s_mov_b32 s4, 0xaaaaaaab
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT: v_mul_hi_u32 v6, v6, s4
; GFX10-NEXT: v_mul_hi_u32 v7, v7, s4
; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaaab, v6
; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaaab, v7
; GFX10-NEXT: v_lshrrev_b32_e32 v6, 4, v6
; GFX10-NEXT: v_lshrrev_b32_e32 v7, 4, v7
; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6

View File

@ -406,18 +406,17 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-LABEL: udiv16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_movk_i32 s5, 0x400
; GFX9-NEXT: s_movk_i32 s4, 0x400
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s2, s4, s2
; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT: .LBB4_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v0, s4, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -429,7 +428,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_mad_f32 v0, -v0, v2, v8
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v0|, v2
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: global_store_short v[5:6], v0, off
@ -442,16 +441,15 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s1, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s0, s1, s4
; GFX10-NEXT: s_and_b32 s0, 0xffff, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s0
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT: .LBB4_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_and_b32_e32 v0, s1, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
@ -491,17 +489,16 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s6, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_movk_i32 s8, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s7, s6, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s7
; GFX9-NEXT: s_movk_i32 s7, 0x400
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s6, 0xffff, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX9-NEXT: .LBB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v0, s6, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
@ -511,9 +508,9 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9
; GFX9-NEXT: v_mad_f32 v8, -v9, v2, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v2
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s7, v4
; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
; GFX9-NEXT: v_mul_lo_u32 v8, v8, s7
; GFX9-NEXT: v_mul_lo_u32 v8, v8, s6
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v8
@ -527,16 +524,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s1, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s4, s1, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4
; GFX10-NEXT: s_and_b32 s1, 0xffff, s4
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v3, v2
; GFX10-NEXT: .LBB5_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_and_b32_e32 v0, s1, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_add_nc_u16 v4, v4, 1
; GFX10-NEXT: v_cvt_f32_u32_e32 v7, v0
; GFX10-NEXT: v_lshlrev_b64 v[5:6], 1, v[0:1]
@ -549,7 +545,7 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v7|, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v8, vcc_lo
; GFX10-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, s4
; GFX10-NEXT: v_mul_lo_u32 v7, v7, s1
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v7
; GFX10-NEXT: global_store_short v[5:6], v0, off
; GFX10-NEXT: s_cbranch_vccz .LBB5_1

View File

@ -2104,16 +2104,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xffff, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)

View File

@ -213,6 +213,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@ -226,7 +227,6 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v6, s0, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
@ -315,7 +315,7 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_movk_i32 s0, 0xff
; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@ -323,17 +323,17 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v1
; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2
; GFX10-DL-NEXT: v_and_b32_e32 v7, s0, v2
; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX10-DL-NEXT: v_and_b32_e32 v6, s0, v6
; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3
@ -1208,16 +1208,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s3, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, s3, v1
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v2
; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v0, v3
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
@ -1511,6 +1510,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@ -1524,7 +1524,6 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; GFX8-NEXT: v_and_b32_e32 v8, s0, v8
@ -1613,7 +1612,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_movk_i32 s0, 0xff
; GFX10-DL-NEXT: v_mov_b32_e32 v7, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@ -1625,16 +1624,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2
; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX10-DL-NEXT: v_bfe_i32 v7, v2, 0, 8
; GFX10-DL-NEXT: v_and_b32_e32 v4, s0, v4
; GFX10-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX10-DL-NEXT: v_bfe_i32 v8, v2, 0, 8
; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_mad_u16 v3, v6, v7, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v6, v8, v3
; GFX10-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3
; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
@ -1802,18 +1801,18 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_sdwa v0, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_and_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_sdwa v3, s3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_and_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v3, v0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
@ -1901,6 +1900,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@ -1914,7 +1914,6 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: v_mov_b32_e32 v5, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v3
@ -2015,7 +2014,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@ -2023,27 +2022,27 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16 v5, 8, v1
; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b16 v6, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b16 v7, 8, v2
; GFX10-DL-NEXT: v_and_b32_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v6, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v10
; GFX10-DL-NEXT: v_and_b32_e32 v4, v4, v9
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v4
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v9
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v1, 16, v6
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4
; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3

View File

@ -2519,7 +2519,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
@ -2533,79 +2532,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v12, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v12
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v13
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v13, 12, v13
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v14
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v13, v4, v15
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v8, v4, v8
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v14, 12, v17
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v13
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v2
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v12, 12, v16
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v13, 12, v16
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v15
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v5, v4, v14
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v10, v4, v10
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v18, 12, v18
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v2, 12, v2
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v6
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v6, 12, v18
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, v4, v2
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, v4, v1
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v6, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v5
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v17
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-DL-XNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v2, v5, 16, v2
; GFX10-DL-XNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-XNACK-NEXT: v_pk_mul_lo_u16 v1, v1, v2
@ -2622,7 +2621,6 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
@ -2635,79 +2633,79 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 4, v1
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v12, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 4, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v12, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 8, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v14, 8, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v12
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v13
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v6, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v13, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v12, v12, 16, v13
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 20, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v4, 12, v4
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v13, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v7, 12, v7
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v13, v4, v15
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v8, v4, v8
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v5, v12
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v11, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v18, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v14
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v11, v11, 16, v12
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 20, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v15, 20, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v16, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v14, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v13
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v5
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v13
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v12, 0xffff, v14
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v4, v11
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v10, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v17, 28, v0
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v15, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v9, 12, v9
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v12, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v13, 12, v16
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v5, 16, v12
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v6, 16, v7
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v8, 12, v8
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v15
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v5, v4, v14
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v10, v4, v10
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v6, v7, v6
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v18, 12, v18
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v4, 0xffff, v13
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v5, v6, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b16 v17, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v1, 12, v1
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v0, 12, v0
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v5, v12, 16, v5
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v7, v9, 16, v10
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v6
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v11, 12, v11
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v6, 12, v18
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, v4, v0
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, v4, v1
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v7, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v11, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v4, v11, 16, v4
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v6, v8, 16, v9
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v5
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v10, 12, v10
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v17
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-DL-NOXNACK-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v4, v6, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_lshl_or_b32 v1, v10, 16, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_pk_mul_lo_u16 v0, v1, v0

View File

@ -2355,7 +2355,6 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
@ -2369,49 +2368,49 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1
; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9
; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1
; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2
@ -3112,7 +3111,6 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
@ -3126,49 +3124,49 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v1
; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v2
; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 8, 4
; GFX10-DL-NEXT: v_and_b32_e32 v7, v4, v7
; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v9, v4, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v10, 16, v6
; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v13, v4, v13
; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v8, v8, 16, v9
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6
; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v10, v10, 16, v13
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, v4, v12
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2
; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v1, 8, 4
; GFX10-DL-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v9, 16, v5
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4
; GFX10-DL-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v7, v7, 16, v8
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 16, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v9, v9, 16, v12
; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v11, 0xffff, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3
; GFX10-DL-NEXT: v_bfe_u32 v5, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v6, v4, v6
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v8, v8, v10
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v6, v11, 16, v12
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3
; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 20, 4
; GFX10-DL-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v9
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 28, v1
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8
; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 24, 4
; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 24, 4
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v10, 16, v11
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2
; GFX10-DL-NEXT: v_and_b32_e32 v8, v4, v9
; GFX10-DL-NEXT: v_and_b32_e32 v1, v4, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v6, v5
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v10
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v7, 16, v1
; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xffff, v8
; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v2, 16, v7
; GFX10-DL-NEXT: v_lshl_or_b32 v1, v6, 16, v1
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2

View File

@ -341,12 +341,11 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08]
; GFX9: buffer_store_dword [[REG]]
; VI-DAG: s_movk_i32 [[K:s[0-9]+]], 0x6400 ; encoding
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0x6400, v{{[0-9]+}}
; gfx8 does not support sreg or imm in sdwa - this will be move then
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; VI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x6400
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword

View File

@ -481,9 +481,8 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s6, s6, 3
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
; GCN-NEXT: s_mov_b32 s6, 0x1010101
; GCN-NEXT: s_and_b32 s7, s5, s6
; GCN-NEXT: s_and_b32 s6, s4, s6
; GCN-NEXT: s_and_b32 s7, s5, 0x1010101
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GCN-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s2

View File

@ -1072,9 +1072,8 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
; SI-NEXT: s_lshl_b32 s8, s6, 4
; SI-NEXT: s_mov_b64 s[6:7], 0xffff
; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_mov_b32 s8, 0x50005
; SI-NEXT: s_and_b32 s9, s7, s8
; SI-NEXT: s_and_b32 s8, s6, s8
; SI-NEXT: s_and_b32 s9, s7, 0x50005
; SI-NEXT: s_and_b32 s8, s6, 0x50005
; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s5
@ -1248,9 +1247,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; SI-NEXT: s_lshl_b32 s8, s8, 3
; SI-NEXT: s_mov_b64 s[2:3], 0xffff
; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_mov_b32 s8, 0x5050505
; SI-NEXT: s_and_b32 s9, s3, s8
; SI-NEXT: s_and_b32 s8, s2, s8
; SI-NEXT: s_and_b32 s9, s3, 0x5050505
; SI-NEXT: s_and_b32 s8, s2, 0x5050505
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
@ -1272,9 +1270,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; VI-NEXT: s_lshl_b32 s8, s8, 3
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
; VI-NEXT: s_mov_b32 s8, 0x5050505
; VI-NEXT: s_and_b32 s9, s3, s8
; VI-NEXT: s_and_b32 s8, s2, s8
; VI-NEXT: s_and_b32 s9, s3, 0x5050505
; VI-NEXT: s_and_b32 s8, s2, 0x5050505
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]

View File

@ -1606,7 +1606,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_or_b32 s0, s4, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@ -1633,7 +1633,7 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s4, s4, s2
; CI-NEXT: s_and_b32 s4, s4, 0xffff
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_or_b32 s0, s4, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@ -1691,7 +1691,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_lshl_b32 s5, s4, 16
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; VI-NEXT: s_or_b32 s2, s4, s5
@ -1716,7 +1716,7 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b64 s[2:3], 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_and_b32 s6, s4, s2
; CI-NEXT: s_and_b32 s6, s4, 0xffff
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2

View File

@ -37,11 +37,10 @@ main_body:
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -566,10 +566,9 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@ -602,12 +601,11 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v12, v8
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v6
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_mov_b32_e32 v10, v5
; GFX10-NEXT: v_and_b32_e32 v5, v2, v6
; GFX10-NEXT: v_and_b32_e32 v3, v2, v3
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v11, v7, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v9, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v7, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[7:12], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
@ -653,10 +651,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
;
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
@ -705,10 +702,9 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
@ -759,10 +755,9 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10-LABEL: sample_c_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
@ -807,10 +802,9 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
;
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v6, v4
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
@ -857,10 +851,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v7, v5
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
@ -909,10 +902,9 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
@ -963,10 +955,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10-LABEL: sample_c_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
@ -1160,15 +1151,14 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v13, v8
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v9, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v6
; GFX10-NEXT: v_and_b32_e32 v4, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[8:13], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -1197,15 +1187,14 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v13, v8
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v9, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v6
; GFX10-NEXT: v_and_b32_e32 v4, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v8, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v12, v7, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v11, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v10, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[8:13], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog

View File

@ -62,11 +62,10 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
;
; GFX10GISEL-LABEL: sample_d_3d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12
; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12
; GFX10GISEL-NEXT: image_sample_d v[0:3], v[0:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -152,11 +151,10 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
;
; GFX10GISEL-LABEL: sample_d_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5
; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12
; GFX10GISEL-NEXT: image_sample_d_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -203,11 +201,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10GISEL-LABEL: sample_c_d_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6
; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6
; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12
; GFX10GISEL-NEXT: image_sample_c_d_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -333,11 +330,10 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
;
; GFX10GISEL-LABEL: sample_cd_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v4, v4, v7, v5
; GFX10GISEL-NEXT: v_and_or_b32 v5, v6, v7, s12
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v4, v5
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v6, s12
; GFX10GISEL-NEXT: image_sample_cd_cl v[0:3], v[0:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -384,11 +380,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10GISEL-LABEL: sample_c_cd_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v8, v6
; GFX10GISEL-NEXT: v_and_or_b32 v6, v7, v8, s12
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, v6
; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v7, s12
; GFX10GISEL-NEXT: image_sample_c_cd_cl v[0:3], v[0:6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -415,11 +410,10 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
;
; GFX10GISEL-LABEL: sample_c_d_o_2darray_V1:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12
; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12
; GFX10GISEL-NEXT: image_sample_c_d_o v0, v[0:7], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -446,11 +440,10 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_c_d_o_2darray_V2:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v6, v6, v9, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, v8, v9, s12
; GFX10GISEL-NEXT: v_and_or_b32 v6, 0xffff, v6, v7
; GFX10GISEL-NEXT: v_and_or_b32 v7, 0xffff, v8, s12
; GFX10GISEL-NEXT: image_sample_c_d_o v[0:1], v[0:7], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -490,10 +483,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_1d(<8 x i32> inreg %rsrc, <4 x
;
; GFX10GISEL-LABEL: sample_g16_noa16_d_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -505,9 +497,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_g16_noa16_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -516,11 +507,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_2d(<8 x i32> inreg %rsrc, <4 x
;
; GFX10GISEL-LABEL: sample_g16_noa16_d_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -534,10 +524,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_and_b32_e32 v9, v2, v9
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -547,14 +536,13 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_3d(<8 x i32> inreg %rsrc, <4 x
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v2
; GFX10GISEL-NEXT: v_mov_b32_e32 v10, v3
; GFX10GISEL-NEXT: v_mov_b32_e32 v11, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v3, v9, v11, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, v0, v11, v1
; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v11, v4
; GFX10GISEL-NEXT: v_and_or_b32 v5, v5, v11, s12
; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v9, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v4
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v5, s12
; GFX10GISEL-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -572,10 +560,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_1d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -587,9 +574,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_g16_noa16_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -598,11 +584,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_2d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
; GFX10GISEL-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -620,10 +605,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_1d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -635,9 +619,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_g16_noa16_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -646,11 +629,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_d_cl_2d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_d_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10GISEL-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -668,10 +650,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_1d(<8 x i32> inreg %rsrc,
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_d_cl_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -685,11 +666,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -699,11 +679,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_d_cl_2d(<8 x i32> inreg %rsrc,
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4
; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
; GFX10GISEL-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -721,10 +700,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_1d(<8 x i32> inreg %rsrc, <4 x
;
; GFX10GISEL-LABEL: sample_g16_noa16_cd_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v3, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v3, s12
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -736,9 +714,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_g16_noa16_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -747,11 +724,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_2d(<8 x i32> inreg %rsrc, <4 x
;
; GFX10GISEL-LABEL: sample_g16_noa16_cd_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v6, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v6, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10GISEL-NEXT: image_sample_cd_g16 v[0:3], [v0, v1, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -769,10 +745,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_1d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -784,9 +759,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_g16_noa16_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -795,11 +769,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_2d(<8 x i32> inreg %rsrc, <4
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10GISEL-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, v2
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v3, v4
; GFX10GISEL-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -817,10 +790,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_1d(<8 x i32> inreg %rsrc, <
;
; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v4, s12
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -832,9 +804,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_g16_noa16_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -843,11 +814,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_cd_cl_2d(<8 x i32> inreg %rsrc, <
;
; GFX10GISEL-LABEL: sample_g16_noa16_cd_cl_2d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, v2, v7, v3
; GFX10GISEL-NEXT: v_and_or_b32 v0, 0xffff, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v2, v3
; GFX10GISEL-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -865,10 +835,9 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_1d(<8 x i32> inreg %rsrc,
;
; GFX10GISEL-LABEL: sample_g16_noa16_c_cd_cl_1d:
; GFX10GISEL: ; %bb.0: ; %main_body
; GFX10GISEL-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX10GISEL-NEXT: s_lshl_b32 s12, s0, 16
; GFX10GISEL-NEXT: v_and_or_b32 v1, v1, v5, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, v2, v5, s12
; GFX10GISEL-NEXT: v_and_or_b32 v1, 0xffff, v1, s12
; GFX10GISEL-NEXT: v_and_or_b32 v2, 0xffff, v2, s12
; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[0:4], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -882,11 +851,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc,
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -896,11 +864,10 @@ define amdgpu_ps <4 x float> @sample_g16_noa16_c_cd_cl_2d(<8 x i32> inreg %rsrc,
; GFX10GISEL-NEXT: v_mov_b32_e32 v8, v2
; GFX10GISEL-NEXT: v_mov_b32_e32 v9, v3
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v8
; GFX10GISEL-NEXT: v_and_or_b32 v4, v9, v0, v4
; GFX10GISEL-NEXT: v_and_or_b32 v3, v1, v0, v3
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v8
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v9, v4
; GFX10GISEL-NEXT: v_and_or_b32 v3, 0xffff, v1, v0
; GFX10GISEL-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -913,14 +880,13 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -932,11 +898,10 @@ define amdgpu_ps float @sample_g16_noa16_c_d_o_2darray_V1(<8 x i32> inreg %rsrc,
; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog
@ -949,14 +914,13 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
; GFX10-LABEL: sample_g16_noa16_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -968,11 +932,10 @@ define amdgpu_ps <2 x float> @sample_g16_noa16_c_d_o_2darray_V2(<8 x i32> inreg
; GFX10GISEL-NEXT: v_mov_b32_e32 v11, v4
; GFX10GISEL-NEXT: v_mov_b32_e32 v2, v0
; GFX10GISEL-NEXT: v_mov_b32_e32 v3, v1
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v9
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX10GISEL-NEXT: v_and_or_b32 v4, v10, v0, v1
; GFX10GISEL-NEXT: v_and_or_b32 v5, v11, v0, v5
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v9
; GFX10GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; GFX10GISEL-NEXT: v_and_or_b32 v4, 0xffff, v10, v0
; GFX10GISEL-NEXT: v_and_or_b32 v5, 0xffff, v11, v1
; GFX10GISEL-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10GISEL-NEXT: ; return to shader part epilog

View File

@ -15,9 +15,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x88,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v3 ; encoding: [0x03,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v2 ; encoding: [0x02,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff ; encoding: [0xff,0x02,0x04,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v9, v2, v9 ; encoding: [0x02,0x13,0x12,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0 ; encoding: [0x02,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x25,0x04]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9 ; encoding: [0xff,0x12,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0 ; encoding: [0x02,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D ; encoding: [0x11,0x0f,0x88,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
@ -60,9 +58,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf0,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@ -87,9 +84,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
@ -116,11 +112,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
@ -143,9 +138,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff ; encoding: [0xff,0x02,0x0c,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2 ; encoding: [0x06,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0 ; encoding: [0x06,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa0,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x00]
@ -170,9 +164,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3 ; encoding: [0x07,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1 ; encoding: [0x07,0x03,0x02,0x36]
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; encoding: [0xff,0x06,0x06,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 ; encoding: [0x03,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 ; encoding: [0x01,0x00,0x6f,0xd7,0x02,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa8,0xf1,0x00,0x00,0x40,0x00,0x01,0x03,0x05,0x06]
@ -197,9 +190,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; encoding: [0xff,0x04,0x04,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; encoding: [0xff,0x00,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2 ; encoding: [0x02,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3 ; encoding: [0x00,0x07,0x06,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x0d,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x01,0x04]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3 ; encoding: [0xff,0x06,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; encoding: [0xff,0x02,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x04,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1 ; encoding: [0x03,0x00,0x6f,0xd7,0x08,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x09,0x0f,0xac,0xf1,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
@ -243,14 +234,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x04,0xe8,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2 ; encoding: [0x02,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff ; encoding: [0xff,0x02,0x00,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v10, v3 ; encoding: [0x03,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; encoding: [0x01,0x03,0x06,0x7e]
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4 ; encoding: [0x00,0x09,0x02,0x36]
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9 ; encoding: [0x00,0x13,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x05,0x04]
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x01,0x04]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; encoding: [0xff,0x08,0x00,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9 ; encoding: [0xff,0x12,0x02,0x36,0xff,0xff,0x00,0x00]
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0 ; encoding: [0x05,0x00,0x6f,0xd7,0x05,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1 ; encoding: [0x04,0x00,0x6f,0xd7,0x0a,0x21,0x05,0x04]
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY ; encoding: [0x29,0x06,0xe8,0xf0,0x02,0x00,0x40,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog

View File

@ -15,9 +15,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -33,10 +32,9 @@ define amdgpu_ps <4 x float> @sample_d_3d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-NEXT: v_and_b32_e32 v9, v2, v9
; GFX10-NEXT: v_and_b32_e32 v0, v2, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v9
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v2, v1, 16, v0
; GFX10-NEXT: image_sample_d_g16 v[0:3], v[2:8], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_3D
; GFX10-NEXT: s_waitcnt vmcnt(0)
@ -60,9 +58,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -87,9 +84,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -116,11 +112,10 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_d_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -143,9 +138,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v6, v2
; GFX10-NEXT: v_and_b32_e32 v0, v6, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v0, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -170,9 +164,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t) {
; GFX10-LABEL: sample_c_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v7, v3
; GFX10-NEXT: v_and_b32_e32 v1, v7, v1
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -197,9 +190,8 @@ main_body:
define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dtdh, half %dsdv, half %dtdv, float %s, float %t, float %clamp) {
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
@ -226,11 +218,10 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v8, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_and_b32_e32 v3, v0, v3
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v4, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v8, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl_g16 v[0:3], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -243,14 +234,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-LABEL: sample_c_d_o_2darray_V1:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v0, v[2:8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
@ -263,14 +253,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10-LABEL: sample_c_d_o_2darray_V2:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, v2
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX10-NEXT: v_mov_b32_e32 v10, v3
; GFX10-NEXT: v_mov_b32_e32 v3, v1
; GFX10-NEXT: v_and_b32_e32 v1, v0, v4
; GFX10-NEXT: v_and_b32_e32 v0, v0, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v9
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v10, 16, v1
; GFX10-NEXT: image_sample_c_d_o_g16 v[0:1], v[2:8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog

View File

@ -31,10 +31,9 @@ main_body:
; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@ -56,11 +55,10 @@ main_body:
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -35,10 +35,9 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}},
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@ -60,11 +59,10 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}},
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -31,10 +31,9 @@ main_body:
; GCN-LABEL: {{^}}buffer_store_format_d16_xyz:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@ -56,11 +55,10 @@ main_body:
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -35,10 +35,9 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[MASKED1]]
@ -59,11 +58,10 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -31,10 +31,9 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyz:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_and_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
@ -53,11 +52,10 @@ main_body:
; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dwordx2 s[[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], 0xffff{{$}}
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]

View File

@ -33,8 +33,8 @@ entry:
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3f317218
; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x398c
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x398c
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
@ -49,7 +49,8 @@ entry:
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; VI: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], 0x398c, v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
; SI-NOT: v_and_b32_e32
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

View File

@ -33,8 +33,8 @@ entry:
; VI: flat_load_dword v[[A_F16_0:[0-9]+]]
; GFX9: global_load_dword v[[A_F16_0:[0-9]+]]
; SI: s_mov_b32 [[A_F32_2:s[0-9]+]], 0x3e9a209a
; VIGFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], [[A_F32_2]]
; GFX9: s_movk_i32 [[A_F32_2:s[0-9]+]], 0x34d1
; VI: v_mov_b32_e32 [[A_F32_2_V:v[0-9]+]], 0x34d1
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
@ -48,8 +48,9 @@ entry:
; VIGFX9: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
; VI: v_mul_f16_sdwa v[[R_F16_2:[0-9]+]], v[[R_F16_1]], [[A_F32_2_V]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], 0x34d1, v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_3:[0-9]+]], [[A_F32_2]], v[[R_F16_2]]
; VIGFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; GFX9: v_mul_f16_e32 v[[R_F32_2:[0-9]+]], [[A_F32_2]], v[[R_F16_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
; SI-NOT: v_and_b32_e32
; SI: v_or_b32_e32 v[[R_F32_5:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]

View File

@ -147,21 +147,19 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_movk_i32 s7, 0xfc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT: s_add_i32 s14, s0, s7
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s14
; SI-NEXT: s_brev_b32 s15, 1
; SI-NEXT: s_add_i32 s7, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT: s_andn2_b64 s[12:13], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, s15
; SI-NEXT: s_cmp_lt_i32 s14, 0
; SI-NEXT: s_and_b32 s0, s11, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s14, 51
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s11
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
@ -172,23 +170,23 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: s_add_i32 s7, s0, s7
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: s_add_i32 s10, s0, 0xfffffc01
; SI-NEXT: s_brev_b32 s7, -2
; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s7
; SI-NEXT: v_bfi_b32 v4, s10, v6, v4
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s10
; SI-NEXT: v_bfi_b32 v4, s7, v6, v4
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, s15
; SI-NEXT: s_and_b32 s0, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_add_f64 v[2:3], v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
@ -200,7 +198,7 @@ define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x dou
; SI-NEXT: v_add_f64 v[4:5], s[8:9], -v[0:1]
; SI-NEXT: v_mov_b32_e32 v7, s9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: v_bfi_b32 v6, s10, v6, v7
; SI-NEXT: v_bfi_b32 v6, s7, v6, v7
; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: v_add_f64 v[0:1], v[0:1], v[4:5]
@ -245,22 +243,20 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_movk_i32 s18, 0xfc01
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s0, s7, 0xb0014
; SI-NEXT: s_add_i32 s19, s0, s18
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s19
; SI-NEXT: s_brev_b32 s20, 1
; SI-NEXT: s_add_i32 s18, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s18
; SI-NEXT: s_andn2_b64 s[16:17], s[6:7], s[0:1]
; SI-NEXT: s_and_b32 s0, s7, s20
; SI-NEXT: s_cmp_lt_i32 s19, 0
; SI-NEXT: s_and_b32 s0, s7, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s18, 0
; SI-NEXT: v_mov_b32_e32 v0, s17
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s19, 51
; SI-NEXT: s_cmp_gt_i32 s18, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
@ -271,7 +267,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_bfe_u32 s0, s5, 0xb0014
; SI-NEXT: s_add_i32 s17, s0, s18
; SI-NEXT: s_add_i32 s17, s0, 0xfffffc01
; SI-NEXT: s_brev_b32 s16, -2
; SI-NEXT: v_mov_b32_e32 v12, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s7
@ -279,7 +275,7 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s17
; SI-NEXT: v_bfi_b32 v4, s16, v12, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[0:1]
; SI-NEXT: s_and_b32 s0, s5, s20
; SI-NEXT: s_and_b32 s0, s5, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s17, 0
@ -298,12 +294,12 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1]
; SI-NEXT: s_bfe_u32 s0, s11, 0xb0014
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_add_i32 s6, s0, s18
; SI-NEXT: s_add_i32 s6, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s6
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[0:1]
; SI-NEXT: s_and_b32 s0, s11, s20
; SI-NEXT: s_and_b32 s0, s11, 0x80000000
; SI-NEXT: v_bfi_b32 v6, s16, v12, v6
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v9, 0, v6, vcc
@ -321,13 +317,13 @@ define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x dou
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s0, s9, 0xb0014
; SI-NEXT: s_add_i32 s4, s0, s18
; SI-NEXT: s_add_i32 s4, s0, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v10, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s4
; SI-NEXT: v_bfi_b32 v10, s16, v12, v10
; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[0:1]
; SI-NEXT: s_and_b32 s0, s9, s20
; SI-NEXT: s_and_b32 s0, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v10, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_cmp_lt_i32 s4, 0
@ -412,21 +408,20 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x19
; SI-NEXT: s_mov_b32 s22, -1
; SI-NEXT: s_movk_i32 s28, 0xfc01
; SI-NEXT: s_mov_b32 s21, 0xfffff
; SI-NEXT: s_mov_b32 s20, s22
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_u32 s2, s7, 0xb0014
; SI-NEXT: s_add_i32 s23, s2, s28
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s23
; SI-NEXT: s_brev_b32 s29, 1
; SI-NEXT: s_add_i32 s26, s2, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s26
; SI-NEXT: s_and_b32 s23, s7, 0x80000000
; SI-NEXT: s_andn2_b64 s[24:25], s[6:7], s[2:3]
; SI-NEXT: s_and_b32 s2, s7, s29
; SI-NEXT: s_cmp_lt_i32 s23, 0
; SI-NEXT: s_cmp_lt_i32 s26, 0
; SI-NEXT: v_mov_b32_e32 v0, s25
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v1, s23
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: s_cmp_gt_i32 s23, 51
; SI-NEXT: s_cmp_gt_i32 s26, 51
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
@ -437,15 +432,14 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[2:3]
; SI-NEXT: v_add_f64 v[2:3], s[6:7], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014
; SI-NEXT: s_add_i32 s24, s2, s28
; SI-NEXT: s_add_i32 s24, s2, 0xfffffc01
; SI-NEXT: s_brev_b32 s23, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[2:3]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24
; SI-NEXT: v_bfi_b32 v4, s23, v8, v4
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[2:3]
; SI-NEXT: s_and_b32 s2, s5, s29
; SI-NEXT: s_and_b32 s2, s5, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_cmp_lt_i32 s24, 0
@ -464,13 +458,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3]
; SI-NEXT: v_add_f64 v[4:5], s[4:5], -v[0:1]
; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014
; SI-NEXT: s_add_i32 s6, s2, s28
; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v6, s5
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v6, s23, v8, v6
; SI-NEXT: s_andn2_b64 s[4:5], s[10:11], s[2:3]
; SI-NEXT: s_and_b32 s2, s11, s29
; SI-NEXT: s_and_b32 s2, s11, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc
; SI-NEXT: v_mov_b32_e32 v4, 0
; SI-NEXT: s_cmp_lt_i32 s6, 0
@ -489,13 +483,13 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[2:3]
; SI-NEXT: v_add_f64 v[6:7], s[10:11], -v[4:5]
; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014
; SI-NEXT: s_add_i32 s6, s2, s28
; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01
; SI-NEXT: v_mov_b32_e32 v9, s11
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6
; SI-NEXT: v_bfi_b32 v9, s23, v8, v9
; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[2:3]
; SI-NEXT: s_and_b32 s2, s9, s29
; SI-NEXT: s_and_b32 s2, s9, 0x80000000
; SI-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
; SI-NEXT: v_mov_b32_e32 v6, 0
; SI-NEXT: s_cmp_lt_i32 s6, 0
@ -514,12 +508,12 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[2:3]
; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014
; SI-NEXT: v_add_f64 v[9:10], s[8:9], -v[4:5]
; SI-NEXT: s_add_i32 s4, s2, s28
; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4
; SI-NEXT: v_mov_b32_e32 v11, s9
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[9:10]|, 0.5
; SI-NEXT: s_andn2_b64 s[24:25], s[14:15], s[2:3]
; SI-NEXT: s_and_b32 s2, s15, s29
; SI-NEXT: s_and_b32 s2, s15, 0x80000000
; SI-NEXT: v_bfi_b32 v11, s23, v8, v11
; SI-NEXT: s_cmp_lt_i32 s4, 0
; SI-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc
@ -530,10 +524,10 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: v_mov_b32_e32 v10, s2
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014
; SI-NEXT: s_add_i32 s6, s4, s28
; SI-NEXT: s_add_i32 s6, s4, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[4:5], s[20:21], s6
; SI-NEXT: s_andn2_b64 s[26:27], s[12:13], s[4:5]
; SI-NEXT: s_and_b32 s4, s13, s29
; SI-NEXT: s_and_b32 s4, s13, 0x80000000
; SI-NEXT: v_mov_b32_e32 v9, s25
; SI-NEXT: s_cmp_lt_i32 s6, 0
; SI-NEXT: v_cndmask_b32_e32 v15, v9, v10, vcc
@ -542,30 +536,30 @@ define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x dou
; SI-NEXT: s_cmp_gt_i32 s6, 51
; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014
; SI-NEXT: s_add_i32 s25, s8, s28
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s25
; SI-NEXT: s_andn2_b64 s[10:11], s[18:19], s[8:9]
; SI-NEXT: s_and_b32 s8, s19, s29
; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[28:29], s[18:19], s[8:9]
; SI-NEXT: s_and_b32 s8, s19, 0x80000000
; SI-NEXT: v_mov_b32_e32 v9, s27
; SI-NEXT: s_cmp_lt_i32 s25, 0
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_cndmask_b32_e64 v17, v9, v10, s[4:5]
; SI-NEXT: v_mov_b32_e32 v9, s11
; SI-NEXT: v_mov_b32_e32 v9, s29
; SI-NEXT: v_mov_b32_e32 v10, s8
; SI-NEXT: s_cselect_b64 s[8:9], -1, 0
; SI-NEXT: s_cmp_gt_i32 s25, 51
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
; SI-NEXT: v_mov_b32_e32 v10, s19
; SI-NEXT: v_mov_b32_e32 v11, s10
; SI-NEXT: s_cselect_b64 s[10:11], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[10:11]
; SI-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v9, s28
; SI-NEXT: v_cndmask_b32_e64 v9, v9, 0, s[8:9]
; SI-NEXT: v_mov_b32_e32 v11, s18
; SI-NEXT: s_bfe_u32 s8, s17, 0xb0014
; SI-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[10:11]
; SI-NEXT: s_add_i32 s10, s8, s28
; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[8:9], s[20:21], s10
; SI-NEXT: s_andn2_b64 s[20:21], s[16:17], s[8:9]
; SI-NEXT: s_and_b32 s8, s17, s29
; SI-NEXT: s_and_b32 s8, s17, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: v_mov_b32_e32 v11, s21
; SI-NEXT: v_mov_b32_e32 v12, s8

Some files were not shown because too many files have changed in this diff Show More