[AMDGPU] Remove dubious logic in bidirectional list scheduler

Summary:
pickNodeBidirectional tried to compare the best top candidate and the
best bottom candidate by examining TopCand.Reason and BotCand.Reason.
This is unsound because, after calling pickNodeFromQueue, Cand.Reason
does not reflect the most important reason why Cand was chosen. Rather,
it reflects the most recent reason why it beat some other potential
candidate, which could have been some low-priority tie-breaker
reason.

I have seen this cause problems where TopCand is a good candidate, but
because TopCand.Reason is ORDER (which is very low priority) it is
repeatedly ignored in favour of a mediocre BotCand. This is not how
bidirectional scheduling is supposed to work.

To fix this I changed the code to always compare TopCand and BotCand
directly, like the generic implementation of pickNodeBidirectional does.
This removes some uncommented AMDGPU-specific logic; if this logic turns
out to be important then perhaps it could be moved into an override of
tryCandidate instead.

Graphics shader benchmarking on gfx10 shows a lot more positive than
negative effects from this change.

Reviewers: arsenm, tstellar, rampitec, kzhuravl, vpykhtin, dstuttard, tpr, atrick, MatzeB

Subscribers: jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68338
This commit is contained in:
Jay Foad 2019-10-07 15:33:59 +01:00
parent 0a2d415bd0
commit 43830790d7
56 changed files with 4425 additions and 4416 deletions

View File

@ -233,33 +233,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
SchedCandidate Cand;
if (TopCand.Reason == BotCand.Reason) {
Cand = BotCand;
GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
TopCand.Reason = NoCand;
GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
if (TopCand.Reason != NoCand) {
Cand.setBest(TopCand);
} else {
TopCand.Reason = TopReason;
}
} else {
if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
Cand = TopCand;
} else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
Cand = BotCand;
} else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
Cand = TopCand;
} else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
Cand = BotCand;
} else {
if (BotCand.Reason > TopCand.Reason) {
Cand = TopCand;
} else {
Cand = BotCand;
}
}
SchedCandidate Cand = BotCand;
TopCand.Reason = NoCand;
GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
if (TopCand.Reason != NoCand) {
Cand.setBest(TopCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););

View File

@ -374,10 +374,10 @@ define i16 @v_bswap_i16(i16 %src) {
; GFX7-LABEL: v_bswap_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bswap_i16:
@ -440,10 +440,10 @@ define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
; GFX7-LABEL: v_bswap_i16_zext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_u32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@ -469,10 +469,10 @@ define i32 @v_bswap_i16_sext_to_i32(i16 %src) {
; GFX7-LABEL: v_bswap_i16_sext_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 8, v1
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 8, v0
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -126,21 +126,21 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX8-LABEL: v_pow_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_log_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_log_f16_e32 v2, v0
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_exp_f16_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16:
@ -154,11 +154,11 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_exp_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX9-NEXT: v_exp_f16_e32 v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
@ -173,40 +173,40 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v3
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v2
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v0, v3
; GFX6-NEXT: v_exp_f32_e32 v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_v2f16_fneg_lhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_log_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_log_f16_e32 v2, v0
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_exp_f16_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
@ -259,22 +259,22 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX8-LABEL: v_pow_v2f16_fneg_rhs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_log_f16_e32 v0, v0
; GFX8-NEXT: v_log_f16_e32 v2, v0
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_exp_f16_e32 v1, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
@ -336,22 +336,22 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s4, 0x80008000
; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX8-NEXT: v_log_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_log_f16_e32 v0, v0
; GFX8-NEXT: v_log_f16_e32 v2, v0
; GFX8-NEXT: v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v0, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_exp_f16_e32 v1, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v2, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_exp_f16_e32 v0, v0
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_exp_f16_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:

View File

@ -233,34 +233,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v16, s7
; MOVREL-NEXT: v_mov_b32_e32 v8, v0
; MOVREL-NEXT: v_mov_b32_e32 v14, s5
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v13, s4
; MOVREL-NEXT: v_mov_b32_e32 v15, s6
; MOVREL-NEXT: v_mov_b32_e32 v12, s3
; MOVREL-NEXT: v_mov_b32_e32 v11, s2
; MOVREL-NEXT: v_mov_b32_e32 v10, s1
; MOVREL-NEXT: v_mov_b32_e32 v9, s0
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s1, v8
; MOVREL-NEXT: v_mov_b32_e32 v0, v9
; MOVREL-NEXT: v_mov_b32_e32 v1, v10
; MOVREL-NEXT: v_mov_b32_e32 v2, v11
; MOVREL-NEXT: v_mov_b32_e32 v3, v12
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v8
; MOVREL-NEXT: v_readfirstlane_b32 s1, v0
; MOVREL-NEXT: v_mov_b32_e32 v1, v9
; MOVREL-NEXT: v_mov_b32_e32 v2, v10
; MOVREL-NEXT: v_mov_b32_e32 v3, v11
; MOVREL-NEXT: v_mov_b32_e32 v4, v12
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v0
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_mov_b32_e32 v4, v13
; MOVREL-NEXT: v_mov_b32_e32 v5, v14
; MOVREL-NEXT: v_mov_b32_e32 v6, v15
; MOVREL-NEXT: v_mov_b32_e32 v7, v16
; MOVREL-NEXT: v_movreld_b32_e32 v0, s10
; MOVREL-NEXT: v_mov_b32_e32 v5, v13
; MOVREL-NEXT: v_mov_b32_e32 v6, v14
; MOVREL-NEXT: v_mov_b32_e32 v7, v15
; MOVREL-NEXT: v_mov_b32_e32 v8, v16
; MOVREL-NEXT: v_movreld_b32_e32 v1, s10
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB3_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
; MOVREL-NEXT: v_mov_b32_e32 v0, v1
; MOVREL-NEXT: v_mov_b32_e32 v1, v2
; MOVREL-NEXT: v_mov_b32_e32 v2, v3
; MOVREL-NEXT: v_mov_b32_e32 v3, v4
; MOVREL-NEXT: v_mov_b32_e32 v4, v5
; MOVREL-NEXT: v_mov_b32_e32 v5, v6
; MOVREL-NEXT: v_mov_b32_e32 v6, v7
; MOVREL-NEXT: v_mov_b32_e32 v7, v8
; MOVREL-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <8 x float> %vec, float %val, i32 %idx
@ -393,35 +400,41 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %v
; MOVREL-NEXT: s_mov_b32 s4, s6
; MOVREL-NEXT: s_mov_b32 s6, s8
; MOVREL-NEXT: v_mov_b32_e32 v17, s7
; MOVREL-NEXT: v_mov_b32_e32 v8, v0
; MOVREL-NEXT: v_mov_b32_e32 v9, v1
; MOVREL-NEXT: v_mov_b32_e32 v15, s5
; MOVREL-NEXT: v_mov_b32_e32 v16, s6
; MOVREL-NEXT: v_mov_b32_e32 v14, s4
; MOVREL-NEXT: v_mov_b32_e32 v13, s3
; MOVREL-NEXT: v_mov_b32_e32 v14, s4
; MOVREL-NEXT: v_mov_b32_e32 v16, s6
; MOVREL-NEXT: v_mov_b32_e32 v12, s2
; MOVREL-NEXT: v_mov_b32_e32 v11, s1
; MOVREL-NEXT: v_mov_b32_e32 v10, s0
; MOVREL-NEXT: s_mov_b32 s0, exec_lo
; MOVREL-NEXT: ; implicit-def: $vcc_hi
; MOVREL-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1
; MOVREL-NEXT: v_readfirstlane_b32 s1, v9
; MOVREL-NEXT: v_mov_b32_e32 v0, v10
; MOVREL-NEXT: v_mov_b32_e32 v1, v11
; MOVREL-NEXT: v_mov_b32_e32 v2, v12
; MOVREL-NEXT: v_mov_b32_e32 v3, v13
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v9
; MOVREL-NEXT: v_readfirstlane_b32 s1, v1
; MOVREL-NEXT: v_mov_b32_e32 v2, v10
; MOVREL-NEXT: v_mov_b32_e32 v3, v11
; MOVREL-NEXT: v_mov_b32_e32 v4, v12
; MOVREL-NEXT: v_mov_b32_e32 v5, v13
; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, s1, v1
; MOVREL-NEXT: s_mov_b32 m0, s1
; MOVREL-NEXT: v_mov_b32_e32 v4, v14
; MOVREL-NEXT: v_mov_b32_e32 v5, v15
; MOVREL-NEXT: v_mov_b32_e32 v6, v16
; MOVREL-NEXT: v_mov_b32_e32 v7, v17
; MOVREL-NEXT: v_movreld_b32_e32 v0, v8
; MOVREL-NEXT: v_mov_b32_e32 v6, v14
; MOVREL-NEXT: v_mov_b32_e32 v7, v15
; MOVREL-NEXT: v_mov_b32_e32 v8, v16
; MOVREL-NEXT: v_mov_b32_e32 v9, v17
; MOVREL-NEXT: v_movreld_b32_e32 v2, v0
; MOVREL-NEXT: s_and_saveexec_b32 vcc_lo, vcc_lo
; MOVREL-NEXT: s_xor_b32 exec_lo, exec_lo, vcc_lo
; MOVREL-NEXT: s_cbranch_execnz BB6_1
; MOVREL-NEXT: ; %bb.2:
; MOVREL-NEXT: s_mov_b32 exec_lo, s0
; MOVREL-NEXT: v_mov_b32_e32 v0, v2
; MOVREL-NEXT: v_mov_b32_e32 v1, v3
; MOVREL-NEXT: v_mov_b32_e32 v2, v4
; MOVREL-NEXT: v_mov_b32_e32 v3, v5
; MOVREL-NEXT: v_mov_b32_e32 v4, v6
; MOVREL-NEXT: v_mov_b32_e32 v5, v7
; MOVREL-NEXT: v_mov_b32_e32 v6, v8
; MOVREL-NEXT: v_mov_b32_e32 v7, v9
; MOVREL-NEXT: ; return to shader part epilog
entry:
%insert = insertelement <8 x float> %vec, float %val, i32 %idx

View File

@ -10,9 +10,9 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: s_add_u32 s2, 4, 4
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: ds_read_b32 v2, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0

View File

@ -371,21 +371,21 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64:
@ -393,21 +393,21 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64:
; GFX9: ; %bb.0:
@ -448,7 +448,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
@ -456,7 +455,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64:
@ -464,7 +464,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@ -472,7 +471,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspa
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64:
; GFX9: ; %bb.0:
@ -678,21 +678,21 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
@ -700,21 +700,21 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32*
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_atomic_dec v4, v[2:3], v4 glc
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX9: ; %bb.0:
@ -755,7 +755,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
@ -763,7 +762,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
@ -771,7 +771,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@ -779,7 +778,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX9: ; %bb.0:
@ -997,20 +997,20 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
@ -1020,20 +1020,20 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64*
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -1077,15 +1077,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
@ -1094,15 +1094,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
@ -1566,20 +1566,20 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
@ -1589,20 +1589,20 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -1646,15 +1646,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
@ -1663,15 +1663,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64:

View File

@ -373,21 +373,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64:
@ -395,21 +395,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64:
@ -417,21 +417,21 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_atomic_inc v4, v[2:3], v4, off glc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dword v[0:1], v4, off
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@ -448,7 +448,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
@ -456,7 +455,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@ -464,7 +464,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@ -472,7 +471,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64:
@ -480,7 +480,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
@ -488,7 +487,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_atomic_inc v0, v[0:1], v4, off glc
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
@ -939,20 +939,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
@ -962,20 +962,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -985,20 +985,20 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[4:5], v[2:3], off glc
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[2:3], v[4:5], off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
; GFX9-NEXT: s_endpgm
@ -1017,15 +1017,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
@ -1034,15 +1034,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
;
@ -1051,15 +1051,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspa
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
@ -1147,21 +1147,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v2, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
@ -1169,21 +1169,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
@ -1191,21 +1191,21 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32*
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: flat_atomic_inc v4, v[2:3], v4 glc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dword v[0:1], v4
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32* %ptr, i32 %id
@ -1222,7 +1222,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
@ -1230,7 +1229,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; CI-NEXT: s_endpgm
;
; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@ -1238,7 +1238,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@ -1246,7 +1245,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
@ -1254,7 +1254,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
@ -1262,7 +1261,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%gep.tid = getelementptr i32, i32* %ptr, i32 %id
@ -1420,20 +1420,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_add_i32_e32 v4, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; CI-NEXT: v_mov_b32_e32 v4, 42
; CI-NEXT: v_mov_b32_e32 v5, 0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; CI-NEXT: s_endpgm
@ -1443,20 +1443,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v3, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: v_mov_b32_e32 v4, 42
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@ -1466,20 +1466,20 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64*
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 40, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v4, 42
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[2:3], v[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
@ -1498,15 +1498,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; CI-NEXT: s_endpgm
;
@ -1515,15 +1515,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; VI-NEXT: s_endpgm
;
@ -1532,15 +1532,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GFX9-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -20,13 +20,13 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x2c ; encoding: [0x00,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xfa]
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; encoding: [0x02,0x02,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; encoding: [0x01,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x04,0x7e,0x02,0x01,0x08,0x11]
; GFX10-NEXT: global_store_dword v[0:1], v2, off ; encoding: [0x00,0x80,0x70,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]

View File

@ -136,19 +136,19 @@ define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, v1
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
@ -168,19 +168,19 @@ define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_i32_i16 v0, s5, v0, 8
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_i32_i16 v0, s4, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:

View File

@ -136,19 +136,19 @@ define i32 @v_udot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_udot2_inline_literal_a_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, v1
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, v1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b:
@ -168,19 +168,19 @@ define i32 @v_udot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_udot2_inline_literal_a_b_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s4
; GFX906-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
; GFX906-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT: v_mov_b32_e32 v0, s5
; GFX906-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_udot2_inline_literal_a_b_c:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s4
; GFX908-NEXT: v_dot2_u32_u16 v0, s5, v0, 8
; GFX908-NEXT: s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT: s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT: v_mov_b32_e32 v0, s5
; GFX908-NEXT: v_dot2_u32_u16 v0, s4, v0, 8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_udot2_inline_literal_a_b_c:

View File

@ -8,26 +8,26 @@ define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2)
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: s_nop 1
; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: dpp_test:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s3
; GFX10-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v0, s3
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_endpgm
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false)

View File

@ -891,24 +891,24 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX6-NEXT: s_add_u32 s0, s2, 0x3ffc
; GFX6-NEXT: s_add_u32 s4, s2, 0x3ffc
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_addc_u32 s1, s3, 0
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_addc_u32 s5, s3, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT: s_add_u32 s0, s2, 0x3ffc
; GFX7-NEXT: s_add_u32 s4, s2, 0x3ffc
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_addc_u32 s1, s3, 0
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_addc_u32 s5, s3, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: ; return to shader part epilog
%gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095
@ -1067,32 +1067,34 @@ define amdgpu_ps float @mubuf_atomicrmw_sgpr_ptr_vgpr_offset(i32 addrspace(1)* i
; GFX6-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX6-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX6-NEXT: s_mov_b32 s0, s2
; GFX6-NEXT: s_mov_b32 s1, s3
; GFX6-NEXT: v_mov_b32_e32 v0, 2
; GFX6-NEXT: v_mov_b32_e32 v2, 2
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX6-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_mov_b32_e32 v0, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: mubuf_atomicrmw_sgpr_ptr_vgpr_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX7-NEXT: v_lshl_b64 v[1:2], v[0:1], 2
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; GFX7-NEXT: s_mov_b32 s0, s2
; GFX7-NEXT: s_mov_b32 s1, s3
; GFX7-NEXT: v_mov_b32_e32 v0, 2
; GFX7-NEXT: v_mov_b32_e32 v2, 2
; GFX7-NEXT: s_mov_b32 s2, 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc
; GFX7-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: ; return to shader part epilog
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset
%result = atomicrmw add i32 addrspace(1)* %gep, i32 2 seq_cst

View File

@ -245,12 +245,12 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
;
; GFX10-LABEL: add3_uniform_vgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_nc_u32_e32 v1, v2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
%a1 = fadd float %a, 1.0
%b2 = fadd float %b, 2.0

View File

@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=IR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s
@ -290,11 +290,11 @@ define i32 @select_sdiv_rhs_opaque_const0_i32(i1 %cond) {
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0x392fa
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: s_mov_b32 s5, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_mul_hi_i32 v0, v0, s5
; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@ -320,11 +320,11 @@ define i32 @select_sdiv_rhs_opaque_const1_i32(i1 %cond) {
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0xa410
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: s_mov_b32 s5, 0x30c30c31
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT: v_mul_hi_i32 v0, v0, s5
; GCN-NEXT: s_mov_b32 s4, 0x30c30c31
; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 3, v0
; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
@ -404,7 +404,6 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
; IR-LABEL: @select_mul_rhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
;
%select = select i1 %cond, i32 5, i32 8
%op = mul i32 %select, 1000
ret i32 %op
@ -415,7 +414,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
; IR-NEXT: store i16 [[OP]], i16 addrspace(1)* undef
; IR-NEXT: ret void
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
@ -427,7 +425,6 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
;
%select = select i1 %cond, i16 5, i16 8
%op = add i16 %select, 123
store i16 %op, i16 addrspace(1)* undef
@ -445,7 +442,6 @@ define i16 @select_add_trunc_select(i1 %cond) {
; IR-LABEL: @select_add_trunc_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
; IR-NEXT: ret i16 [[OP]]
;
%select = select i1 %cond, i32 5, i32 8
%trunc = trunc i32 %select to i16
%op = add i16 %trunc, 42
@ -473,7 +469,6 @@ define i32 @select_add_zext_select(i1 %cond) {
; IR-LABEL: @select_add_zext_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
; IR-NEXT: ret i32 [[OP]]
; GCN-LABEL: select_add_zext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

View File

@ -105,18 +105,18 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB0_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1
@ -125,7 +125,7 @@ define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB0_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
@ -275,17 +275,17 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB1_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s1, s0, s1
@ -298,7 +298,7 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 %additive
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB1_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
@ -1662,18 +1662,18 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB8_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5
; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1
@ -1682,7 +1682,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) {
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB8_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
@ -1834,17 +1834,17 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
;
; GFX1064-LABEL: sub_i32_uniform:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, 0
; GFX1064-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, 0
; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0
; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz BB9_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: s_mul_i32 s1, s0, s1
@ -1857,7 +1857,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: BB9_2:
; GFX1064-NEXT: v_nop
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0
; GFX1064-NEXT: v_readfirstlane_b32 s0, v1

View File

@ -339,41 +339,41 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s0, 0xff00ff
; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s3, 0x33333333
; SI-NEXT: s_mov_b32 s6, 0xcccccccc
; SI-NEXT: s_mov_b32 s8, 0x55555555
; SI-NEXT: s_mov_b32 s9, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, 0xff00ff
; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s10, 0x33333333
; SI-NEXT: s_mov_b32 s11, 0xcccccccc
; SI-NEXT: s_mov_b32 s0, 0x55555555
; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_bfi_b32 v2, s0, v0, v2
; SI-NEXT: v_bfi_b32 v4, s0, v1, v3
; SI-NEXT: v_and_b32_e32 v1, s1, v2
; SI-NEXT: v_and_b32_e32 v0, s1, v4
; SI-NEXT: v_and_b32_e32 v3, s2, v2
; SI-NEXT: v_and_b32_e32 v2, s2, v4
; SI-NEXT: v_bfi_b32 v2, s6, v0, v2
; SI-NEXT: v_bfi_b32 v4, s6, v1, v3
; SI-NEXT: v_and_b32_e32 v1, s8, v2
; SI-NEXT: v_and_b32_e32 v0, s8, v4
; SI-NEXT: v_and_b32_e32 v3, s9, v2
; SI-NEXT: v_and_b32_e32 v2, s9, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s3, v3
; SI-NEXT: v_and_b32_e32 v0, s3, v2
; SI-NEXT: v_and_b32_e32 v3, s6, v3
; SI-NEXT: v_and_b32_e32 v2, s6, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s8, v3
; SI-NEXT: v_and_b32_e32 v0, s8, v2
; SI-NEXT: v_and_b32_e32 v3, s9, v3
; SI-NEXT: v_and_b32_e32 v2, s9, v2
; SI-NEXT: v_and_b32_e32 v1, s10, v3
; SI-NEXT: v_and_b32_e32 v0, s10, v2
; SI-NEXT: v_and_b32_e32 v3, s11, v3
; SI-NEXT: v_and_b32_e32 v2, s11, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s0, v3
; SI-NEXT: v_and_b32_e32 v0, s0, v2
; SI-NEXT: v_and_b32_e32 v3, s1, v3
; SI-NEXT: v_and_b32_e32 v2, s1, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
@ -386,45 +386,45 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT: s_mov_b32 s2, 0x10203
; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s6, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s6, 0x10203
; FLAT-NEXT: s_mov_b32 s2, 0x33333333
; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT: s_mov_b32 s0, 0x33333333
; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc
; FLAT-NEXT: s_mov_b32 s8, 0x55555555
; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLAT-NEXT: v_perm_b32 v2, 0, v0, s2
; FLAT-NEXT: v_perm_b32 v4, 0, v1, s2
; FLAT-NEXT: v_and_b32_e32 v1, s3, v2
; FLAT-NEXT: v_and_b32_e32 v0, s3, v4
; FLAT-NEXT: v_and_b32_e32 v3, s6, v2
; FLAT-NEXT: v_and_b32_e32 v2, s6, v4
; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6
; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6
; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_mov_b32 s0, 0x55555555
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_or_b32_e32 v1, v3, v1
; FLAT-NEXT: v_or_b32_e32 v0, v2, v0
; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@ -600,13 +600,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s0, 0xff00ff
; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s3, 0x33333333
; SI-NEXT: s_mov_b32 s8, 0xcccccccc
; SI-NEXT: s_mov_b32 s9, 0x55555555
; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s8, 0xff00ff
; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s11, 0x33333333
; SI-NEXT: s_mov_b32 s12, 0xcccccccc
; SI-NEXT: s_mov_b32 s13, 0x55555555
; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
@ -617,18 +617,18 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT: v_bfi_b32 v2, s0, v2, v4
; SI-NEXT: v_bfi_b32 v4, s0, v3, v5
; SI-NEXT: v_bfi_b32 v6, s0, v0, v6
; SI-NEXT: v_bfi_b32 v8, s0, v1, v7
; SI-NEXT: v_and_b32_e32 v1, s1, v2
; SI-NEXT: v_and_b32_e32 v0, s1, v4
; SI-NEXT: v_and_b32_e32 v3, s2, v2
; SI-NEXT: v_and_b32_e32 v2, s2, v4
; SI-NEXT: v_and_b32_e32 v5, s1, v6
; SI-NEXT: v_and_b32_e32 v4, s1, v8
; SI-NEXT: v_and_b32_e32 v7, s2, v6
; SI-NEXT: v_and_b32_e32 v6, s2, v8
; SI-NEXT: v_bfi_b32 v2, s8, v2, v4
; SI-NEXT: v_bfi_b32 v4, s8, v3, v5
; SI-NEXT: v_bfi_b32 v6, s8, v0, v6
; SI-NEXT: v_bfi_b32 v8, s8, v1, v7
; SI-NEXT: v_and_b32_e32 v1, s9, v2
; SI-NEXT: v_and_b32_e32 v0, s9, v4
; SI-NEXT: v_and_b32_e32 v3, s10, v2
; SI-NEXT: v_and_b32_e32 v2, s10, v4
; SI-NEXT: v_and_b32_e32 v5, s9, v6
; SI-NEXT: v_and_b32_e32 v4, s9, v8
; SI-NEXT: v_and_b32_e32 v7, s10, v6
; SI-NEXT: v_and_b32_e32 v6, s10, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
@ -637,14 +637,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
; SI-NEXT: v_and_b32_e32 v1, s3, v3
; SI-NEXT: v_and_b32_e32 v0, s3, v2
; SI-NEXT: v_and_b32_e32 v5, s3, v7
; SI-NEXT: v_and_b32_e32 v4, s3, v6
; SI-NEXT: v_and_b32_e32 v3, s8, v3
; SI-NEXT: v_and_b32_e32 v2, s8, v2
; SI-NEXT: v_and_b32_e32 v7, s8, v7
; SI-NEXT: v_and_b32_e32 v6, s8, v6
; SI-NEXT: v_and_b32_e32 v1, s11, v3
; SI-NEXT: v_and_b32_e32 v0, s11, v2
; SI-NEXT: v_and_b32_e32 v5, s11, v7
; SI-NEXT: v_and_b32_e32 v4, s11, v6
; SI-NEXT: v_and_b32_e32 v3, s12, v3
; SI-NEXT: v_and_b32_e32 v2, s12, v2
; SI-NEXT: v_and_b32_e32 v7, s12, v7
; SI-NEXT: v_and_b32_e32 v6, s12, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
@ -653,14 +653,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
; SI-NEXT: v_and_b32_e32 v1, s9, v3
; SI-NEXT: v_and_b32_e32 v0, s9, v2
; SI-NEXT: v_and_b32_e32 v5, s9, v7
; SI-NEXT: v_and_b32_e32 v4, s9, v6
; SI-NEXT: v_and_b32_e32 v3, s10, v3
; SI-NEXT: v_and_b32_e32 v2, s10, v2
; SI-NEXT: v_and_b32_e32 v7, s10, v7
; SI-NEXT: v_and_b32_e32 v6, s10, v6
; SI-NEXT: v_and_b32_e32 v1, s13, v3
; SI-NEXT: v_and_b32_e32 v0, s13, v2
; SI-NEXT: v_and_b32_e32 v5, s13, v7
; SI-NEXT: v_and_b32_e32 v4, s13, v6
; SI-NEXT: v_and_b32_e32 v3, s14, v3
; SI-NEXT: v_and_b32_e32 v2, s14, v2
; SI-NEXT: v_and_b32_e32 v7, s14, v7
; SI-NEXT: v_and_b32_e32 v6, s14, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
@ -677,33 +677,33 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT: s_mov_b32 s2, 0x10203
; FLAT-NEXT: s_mov_b32 s3, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s8, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s10, 0x10203
; FLAT-NEXT: s_mov_b32 s2, 0x33333333
; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT: s_mov_b32 s0, 0x33333333
; FLAT-NEXT: s_mov_b32 s1, 0xcccccccc
; FLAT-NEXT: s_mov_b32 s9, 0x55555555
; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s8, 0x55555555
; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLAT-NEXT: v_perm_b32 v6, 0, v0, s2
; FLAT-NEXT: v_perm_b32 v4, 0, v3, s2
; FLAT-NEXT: v_perm_b32 v2, 0, v2, s2
; FLAT-NEXT: v_perm_b32 v8, 0, v1, s2
; FLAT-NEXT: v_and_b32_e32 v1, s3, v2
; FLAT-NEXT: v_and_b32_e32 v0, s3, v4
; FLAT-NEXT: v_and_b32_e32 v3, s8, v2
; FLAT-NEXT: v_and_b32_e32 v2, s8, v4
; FLAT-NEXT: v_and_b32_e32 v5, s3, v6
; FLAT-NEXT: v_and_b32_e32 v4, s3, v8
; FLAT-NEXT: v_and_b32_e32 v7, s8, v6
; FLAT-NEXT: v_and_b32_e32 v6, s8, v8
; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10
; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10
; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10
; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10
; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
; FLAT-NEXT: v_and_b32_e32 v5, s0, v6
; FLAT-NEXT: v_and_b32_e32 v4, s0, v8
; FLAT-NEXT: v_and_b32_e32 v7, s1, v6
; FLAT-NEXT: v_and_b32_e32 v6, s1, v8
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
@ -712,14 +712,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
; FLAT-NEXT: v_and_b32_e32 v1, s0, v3
; FLAT-NEXT: v_and_b32_e32 v0, s0, v2
; FLAT-NEXT: v_and_b32_e32 v5, s0, v7
; FLAT-NEXT: v_and_b32_e32 v4, s0, v6
; FLAT-NEXT: v_and_b32_e32 v3, s1, v3
; FLAT-NEXT: v_and_b32_e32 v2, s1, v2
; FLAT-NEXT: v_and_b32_e32 v7, s1, v7
; FLAT-NEXT: v_and_b32_e32 v6, s1, v6
; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
; FLAT-NEXT: v_and_b32_e32 v5, s2, v7
; FLAT-NEXT: v_and_b32_e32 v4, s2, v6
; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
; FLAT-NEXT: v_and_b32_e32 v7, s3, v7
; FLAT-NEXT: v_and_b32_e32 v6, s3, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
@ -728,14 +728,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
; FLAT-NEXT: v_and_b32_e32 v1, s9, v3
; FLAT-NEXT: v_and_b32_e32 v0, s9, v2
; FLAT-NEXT: v_and_b32_e32 v5, s9, v7
; FLAT-NEXT: v_and_b32_e32 v4, s9, v6
; FLAT-NEXT: v_and_b32_e32 v3, s10, v3
; FLAT-NEXT: v_and_b32_e32 v2, s10, v2
; FLAT-NEXT: v_and_b32_e32 v7, s10, v7
; FLAT-NEXT: v_and_b32_e32 v6, s10, v6
; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
; FLAT-NEXT: v_and_b32_e32 v5, s8, v7
; FLAT-NEXT: v_and_b32_e32 v4, s8, v6
; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
; FLAT-NEXT: v_and_b32_e32 v7, s9, v7
; FLAT-NEXT: v_and_b32_e32 v6, s9, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]

View File

@ -661,11 +661,7 @@ define i32 @sdiv32(i32 %a, i32 %b) {
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_xor_b32_e32 v2, v4, v2
; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f800000, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
@ -674,10 +670,14 @@ define i32 @sdiv32(i32 %a, i32 %b) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v3
; GFX9-NEXT: v_add_u32_e32 v5, v3, v4
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v5
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v5
; GFX9-NEXT: v_add_u32_e32 v6, v3, v4
; GFX9-NEXT: v_sub_u32_e32 v3, v3, v4
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v3, v3, v0
; GFX9-NEXT: v_xor_b32_e32 v2, v5, v2
; GFX9-NEXT: v_mul_lo_u32 v4, v3, v1
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
@ -734,33 +734,33 @@ define i32 @srem32(i32 %a, i32 %b) {
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_mul_lo_u32 v4, v2, v1
; GFX9-NEXT: v_mul_hi_u32 v5, v2, v1
; GFX9-NEXT: v_sub_u32_e32 v6, 0, v4
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_mul_hi_u32 v4, v4, v2
; GFX9-NEXT: v_add_u32_e32 v5, v2, v4
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v4
; GFX9-NEXT: v_mul_lo_u32 v3, v2, v1
; GFX9-NEXT: v_mul_hi_u32 v4, v2, v1
; GFX9-NEXT: v_sub_u32_e32 v5, 0, v3
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v3, v3, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v4
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT: v_add_u32_e32 v5, v2, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v2, v2, v0
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v1
; GFX9-NEXT: v_sub_u32_e32 v4, v0, v2
; GFX9-NEXT: v_sub_u32_e32 v3, v0, v2
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v2
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v4, v1
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, v3, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v3, v1
; GFX9-NEXT: s_and_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_add_u32_e32 v5, v4, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v5, v3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v3
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v4
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
%d = srem i32 %a, %b
ret i32 %d

View File

@ -8,37 +8,37 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -72,16 +72,16 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x
; VI-LABEL: test_copy_v4i8_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
@ -102,25 +102,25 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, 0
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: s_mov_b64 s[16:17], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
@ -160,61 +160,60 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s15, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s15
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_mov_b32 s18, s14
; SI-NEXT: s_mov_b32 s19, s15
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s2
; SI-NEXT: s_mov_b32 s17, s3
; SI-NEXT: s_mov_b32 s2, s14
; SI-NEXT: s_mov_b32 s3, s15
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s6, s14
; SI-NEXT: s_mov_b32 s7, s15
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_mov_b32 s0, s10
; SI-NEXT: s_mov_b32 s1, s11
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
; VI-NEXT: s_mov_b32 s0, s10
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_add_u32_e32 v0, vcc, s12, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s12, s6
; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s1, s11
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -230,76 +229,75 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; SI-LABEL: test_copy_v4i8_extra_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s0, 0xff00
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_movk_i32 s1, 0xff
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s12, 0xff00
; SI-NEXT: s_movk_i32 s13, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: v_and_b32_e32 v2, s0, v0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_add_i32_e32 v0, vcc, 9, v0
; SI-NEXT: v_and_b32_e32 v0, s1, v0
; SI-NEXT: v_and_b32_e32 v3, s0, v1
; SI-NEXT: v_and_b32_e32 v4, s12, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s1, v1
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_and_b32_e32 v2, s12, v0
; SI-NEXT: v_and_b32_e32 v3, s13, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v1, s13, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s8, 0xff00
; VI-NEXT: s_movk_i32 s10, 0x900
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_movk_i32 s8, 0xff00
; VI-NEXT: s_movk_i32 s9, 0xff
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_movk_i32 s9, 0xff
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_movk_i32 s10, 0x900
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v3, s8, v1
; VI-NEXT: v_and_b32_e32 v4, s8, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s9, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_and_b32_e32 v2, s8, v0
; VI-NEXT: v_add_u16_e32 v0, 9, v0
; VI-NEXT: v_and_b32_e32 v0, s9, v0
; VI-NEXT: v_or_b32_e32 v1, v3, v1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: v_and_b32_e32 v3, s9, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s10, v1
; VI-NEXT: v_add_u16_e32 v0, s10, v0
; VI-NEXT: v_add_u16_e32 v2, s10, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -316,33 +314,33 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s18, 0
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[12:13], s[10:11]
; SI-NEXT: s_mov_b64 s[16:17], s[10:11]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s16, 0xff00
; SI-NEXT: s_movk_i32 s17, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s8, 0xff00
; SI-NEXT: s_movk_i32 s9, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_and_b32_e32 v4, s16, v1
; SI-NEXT: v_and_b32_e32 v4, s8, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT: v_and_b32_e32 v2, s16, v0
; SI-NEXT: v_and_b32_e32 v3, s17, v3
; SI-NEXT: v_and_b32_e32 v2, s8, v0
; SI-NEXT: v_and_b32_e32 v3, s9, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v1, s17, v1
; SI-NEXT: v_and_b32_e32 v1, s9, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
@ -350,7 +348,7 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -358,39 +356,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s12, 0xff00
; VI-NEXT: s_movk_i32 s13, 0xff
; VI-NEXT: s_movk_i32 s14, 0x900
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s8, s2
; VI-NEXT: s_mov_b32 s9, s3
; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_mov_b32 s11, s7
; VI-NEXT: s_mov_b32 s2, s6
; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_movk_i32 s4, 0xff00
; VI-NEXT: s_mov_b32 s9, s5
; VI-NEXT: s_movk_i32 s5, 0xff
; VI-NEXT: s_movk_i32 s6, 0x900
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, s12, v1
; VI-NEXT: v_and_b32_e32 v4, s4, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s13, v1
; VI-NEXT: v_and_b32_e32 v1, s5, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_and_b32_e32 v2, s12, v0
; VI-NEXT: v_and_b32_e32 v3, s13, v3
; VI-NEXT: v_and_b32_e32 v2, s4, v0
; VI-NEXT: v_and_b32_e32 v3, s5, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s14, v1
; VI-NEXT: v_add_u16_e32 v2, s14, v2
; VI-NEXT: v_add_u16_e32 v1, s6, v1
; VI-NEXT: v_add_u16_e32 v2, s6, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -405,41 +405,41 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x

View File

@ -65,21 +65,22 @@ define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val)
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32:
@ -131,16 +132,16 @@ define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrsp
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v2, v1
; SI-NEXT: v_ffbh_u32_e32 v3, v0
@ -148,7 +149,8 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v3, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v2i32:
@ -206,16 +208,16 @@ define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v2
@ -229,7 +231,8 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
; SI-NEXT: v_cndmask_b32_e32 v1, 32, v6, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v7, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_v4i32:
@ -299,9 +302,9 @@ define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -501,7 +504,6 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
@ -509,7 +511,7 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v4, v2
; SI-NEXT: v_ffbh_u32_e32 v5, v3
@ -520,7 +522,8 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_cndmask_b32_e32 v2, 64, v3, vcc
; SI-NEXT: v_mov_b32_e32 v3, v1
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64:
@ -588,7 +591,6 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
; SI-LABEL: v_ctlz_i64_trunc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
@ -596,8 +598,8 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_mov_b64 s[10:11], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v3
; SI-NEXT: v_ffbh_u32_e32 v5, v4
@ -607,7 +609,8 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; SI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; SI-NEXT: buffer_store_dword v0, v[1:2], s[8:11], 0 addr64
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i64_trunc:
@ -615,26 +618,26 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, v5, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2]
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v4, v0
; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v4
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_ffbh_u32_e32 v5, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64_trunc:
@ -676,19 +679,20 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
@ -742,19 +746,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
@ -809,23 +814,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
@ -885,23 +891,24 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v1, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0
; SI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
@ -961,18 +968,19 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
@ -1030,9 +1038,9 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
@ -1102,19 +1110,20 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ffbh_u32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7f, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_ctlz_i7_sel_eq_neg1:

View File

@ -261,19 +261,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_movk_i32 s12, 0xff
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
@ -284,7 +284,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s12, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
@ -297,44 +297,44 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_mov_b32_e32 v4, 9
; VI-NEXT: s_movk_i32 s8, 0x900
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: v_mov_b32_e32 v6, s8
; VI-NEXT: v_mov_b32_e32 v4, 9
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_movk_i32 s0, 0x900
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v5
; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: v_and_b32_e32 v8, 0xffffff00, v5
; VI-NEXT: v_add_u16_e32 v9, 9, v5
; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
; VI-NEXT: v_add_u16_e32 v8, 9, v5
; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7
; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_add_u16_e32 v0, s8, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u16_e32 v0, s0, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -358,31 +358,30 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v9, v3
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v4
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT: v_lshlrev_b32_e32 v8, 8, v5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v0
; SI-NEXT: v_or_b32_e32 v0, v8, v6
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
; SI-NEXT: v_or_b32_e32 v0, v8, v7
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v6, v0, v2
; SI-NEXT: v_or_b32_e32 v6, v0, v6
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dword v9, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
@ -402,33 +401,35 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v10, v[2:3]
; VI-NEXT: flat_load_ubyte v11, v[4:5]
; VI-NEXT: v_add_u32_e32 v2, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 5, v0
; VI-NEXT: v_add_u32_e32 v6, vcc, 6, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, 1, v0
; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v3, v[4:5]
; VI-NEXT: flat_load_ubyte v4, v[6:7]
; VI-NEXT: flat_load_ubyte v5, v[8:9]
; VI-NEXT: v_add_u32_e32 v10, vcc, 5, v0
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v12, vcc, 1, v0
; VI-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v6, v[6:7]
; VI-NEXT: flat_load_ubyte v7, v[8:9]
; VI-NEXT: flat_load_ubyte v8, v[10:11]
; VI-NEXT: flat_load_ubyte v9, v[12:13]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: flat_load_ubyte v1, v[2:3]
; VI-NEXT: flat_load_ubyte v2, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6)
; VI-NEXT: v_lshlrev_b32_e32 v7, 8, v10
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v6
; VI-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; VI-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v2
; VI-NEXT: v_or_b32_sdwa v2, v7, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v4
; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_ubyte2_e32 v5, v8
; VI-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; VI-NEXT: v_cvt_f32_ubyte2_e32 v1, v9
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
@ -880,42 +881,42 @@ define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out
define amdgpu_kernel void @cvt_ubyte0_or_multiuse(i32 addrspace(1)* %in, float addrspace(1)* %out) {
; SI-LABEL: cvt_ubyte0_or_multiuse:
; SI: ; %bb.0: ; %bb
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s2
; VI-NEXT: s_mov_b32 s5, s3
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
bb:
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -9,11 +9,10 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
;
@ -21,11 +20,10 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
@ -91,21 +89,21 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]

View File

@ -7,23 +7,23 @@
define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_uge_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_uge_f64:
@ -59,23 +59,23 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_oge_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_oge_f64:
@ -111,23 +111,23 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_ugt_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_ugt_f64:
@ -163,23 +163,23 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmax_legacy_ogt_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmax_legacy_ogt_f64:

View File

@ -5,23 +5,23 @@
define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_uge_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_uge_f64:
@ -57,23 +57,23 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ugt_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ugt_f64:
@ -109,23 +109,23 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ule_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ule_f64:
@ -161,23 +161,23 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ult_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ult_f64:
@ -213,23 +213,23 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_oge_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_oge_f64:
@ -265,23 +265,23 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ogt_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ogt_f64:
@ -317,23 +317,23 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_ole_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_ole_f64:
@ -369,23 +369,23 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, d
define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
; SI-LABEL: test_fmin_legacy_olt_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_fmin_legacy_olt_f64:

View File

@ -110,18 +110,18 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX6-LABEL: v_pow_v2f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1
; GFX6-NEXT: v_log_f32_e32 v1, v1
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v1
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@ -137,30 +137,30 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y)
ret <2 x half> %pow
@ -172,21 +172,21 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_log_f32_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_log_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_log_f32_e32 v4, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v2
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v2, v3
; GFX6-NEXT: v_exp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v3, v4
; GFX6-NEXT: v_mul_legacy_f32_e32 v1, v1, v4
; GFX6-NEXT: v_exp_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@ -201,30 +201,30 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y)
@ -266,30 +266,30 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%y.fneg = fneg <2 x half> %y
%pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg)
@ -336,30 +336,30 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
; GFX8-NEXT: v_log_f32_e32 v0, v0
; GFX8-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX8-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX8-NEXT: v_exp_f32_e32 v1, v2
; GFX8-NEXT: v_exp_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_exp_f32_e32 v2, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_pow_v2f16_fneg_lhs_rhs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e64 v2, -v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v3, -v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v2, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v0, -v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v3, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX9-NEXT: v_log_f32_e32 v2, v2
; GFX9-NEXT: v_log_f32_e32 v0, v0
; GFX9-NEXT: v_mul_legacy_f32_e32 v2, v3, v2
; GFX9-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_exp_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_exp_f32_e32 v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%x.fneg = fneg <2 x half> %x
%y.fneg = fneg <2 x half> %y

View File

@ -24,25 +24,25 @@ define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB0_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
; GFX9-NEXT: v_mul_lo_u32 v1, s3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2
; GFX9-NEXT: v_add_u32_e32 v7, 1, v3
; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
; GFX9-NEXT: v_add_u32_e32 v5, s6, v1
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
; GFX9-NEXT: v_mul_lo_u32 v5, v3, s2
; GFX9-NEXT: v_add_u32_e32 v6, 1, v3
; GFX9-NEXT: v_add_u32_e32 v7, -1, v3
; GFX9-NEXT: v_add_u32_e32 v4, s6, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v5
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v4
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT: s_add_u32 s6, s6, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_addc_u32 s7, s7, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: s_add_u32 s4, s4, 4
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB0_1
@ -88,29 +88,29 @@ define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB1_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v2, v0, s6
; GFX9-NEXT: v_add_u32_e32 v3, v2, v1
; GFX9-NEXT: v_mul_lo_u32 v4, s3, v3
; GFX9-NEXT: v_mul_lo_u32 v6, v3, s2
; GFX9-NEXT: v_sub_u32_e32 v5, 1, v3
; GFX9-NEXT: v_not_b32_e32 v3, v3
; GFX9-NEXT: v_mul_lo_u32 v5, s2, v5
; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3
; GFX9-NEXT: v_add_u32_e32 v4, s6, v4
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v6
; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v3, s6, v3
; GFX9-NEXT: v_add_u32_e32 v5, s6, v5
; GFX9-NEXT: s_add_u32 s6, s6, 1
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7
; GFX9-NEXT: v_mul_hi_u32 v4, v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: s_addc_u32 s7, s7, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3
; GFX9-NEXT: v_mul_lo_u32 v4, v3, s2
; GFX9-NEXT: v_not_b32_e32 v6, v3
; GFX9-NEXT: v_sub_u32_e32 v3, 1, v3
; GFX9-NEXT: v_mul_lo_u32 v3, s2, v3
; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6
; GFX9-NEXT: v_add_u32_e32 v5, s6, v5
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s6, v4
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5
; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v4, s6, v6
; GFX9-NEXT: v_add_u32_e32 v3, s6, v3
; GFX9-NEXT: s_add_u32 s6, s6, 1
; GFX9-NEXT: s_addc_u32 s7, s7, 0
; GFX9-NEXT: s_add_u32 s4, s4, 4
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT: s_addc_u32 s5, s5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1]
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
; GFX9-NEXT: global_store_dword v[1:2], v3, off
; GFX9-NEXT: s_cbranch_scc0 BB1_1
@ -162,15 +162,15 @@ define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mul_lo_u32 v4, v3, s3
; GFX9-NEXT: v_add_u32_e32 v6, 1, v3
; GFX9-NEXT: v_add_u32_e32 v7, -1, v3
; GFX9-NEXT: v_sub_u32_e32 v5, s6, v4
; GFX9-NEXT: v_add_u32_e32 v5, 1, v3
; GFX9-NEXT: v_add_u32_e32 v6, -1, v3
; GFX9-NEXT: v_sub_u32_e32 v7, s6, v4
; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s6, v4
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v5
; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7
; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1]
; GFX9-NEXT: s_add_i32 s6, s6, 1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GFX9-NEXT: s_add_u32 s4, s4, 4
; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX9-NEXT: s_addc_u32 s5, s5, 0
@ -222,10 +222,10 @@ define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %a
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: BB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mul_hi_u32 v1, v0, s3
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2
; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
; GFX9-NEXT: v_sub_u32_e32 v4, s3, v3
; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v3
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4
@ -275,19 +275,19 @@ define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: BB4_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1
; GFX9-NEXT: v_mul_f32_e32 v2, v8, v1
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7
; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1]
; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v7, s[0:1]
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB4_1
@ -326,15 +326,15 @@ define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-NEXT: BB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v8, s5
; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1
; GFX9-NEXT: v_trunc_f32_e32 v8, v8
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8
; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1
; GFX9-NEXT: v_trunc_f32_e32 v7, v7
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7
; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]

View File

@ -102,18 +102,18 @@ define amdgpu_kernel void @udot2(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
@ -246,19 +246,19 @@ define amdgpu_kernel void @udot2_MulMul(<2 x i16> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: s_and_b32 s6, s2, s4
; GFX10-DL-NEXT: s_and_b32 s4, s3, s4
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s2, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0
; GFX10-DL-NEXT: v_mul_u32_u24_e64 v0, s4, s6
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
@ -377,18 +377,18 @@ define amdgpu_kernel void @idot2(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s4, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot2_i32_i16 v2, s1, s0, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
@ -648,18 +648,18 @@ define amdgpu_kernel void @udot2_alt_AddOperands(<2 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<2 x i16> addrspace(1)* %src2,
@ -854,14 +854,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX8-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: s_and_b32 s4, s4, 0xffff
; GFX8-NEXT: v_mad_u32_u24 v0, s2, s2, v0
; GFX8-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s0, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_and_b32 s1, s4, 0xffff
; GFX8-NEXT: v_mad_u32_u24 v2, s0, s0, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -873,14 +873,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-NODL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NODL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NODL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NODL-NEXT: s_and_b32 s1, s4, 0xffff
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@ -892,14 +892,14 @@ define amdgpu_kernel void @notudot2_SameVec(<2 x i16> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-DL-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, s2, v0
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, s4, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3
; GFX9-DL-NEXT: s_and_b32 s1, s4, 0xffff
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, s0, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, s1, v2
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -1039,18 +1039,18 @@ define amdgpu_kernel void @udot2_v4i16(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
@ -1171,18 +1171,18 @@ define amdgpu_kernel void @udot2_v4i16_Hi(<4 x i16> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x4
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x4
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x4
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s4, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot2_u32_u16 v2, s1, s0, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i16> addrspace(1)* %src2,
@ -1312,18 +1312,18 @@ define amdgpu_kernel void @notudot2_v4i16_Even(<4 x i16> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s3, s3, s8
; GFX10-DL-NEXT: s_and_b32 s5, s5, s8
; GFX10-DL-NEXT: s_and_b32 s3, s3, s7
; GFX10-DL-NEXT: s_and_b32 s5, s5, s7
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_and_b32 s2, s2, s8
; GFX10-DL-NEXT: s_and_b32 s4, s4, s8
; GFX10-DL-NEXT: s_and_b32 s2, s2, s7
; GFX10-DL-NEXT: s_and_b32 s4, s4, s7
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
@ -1457,15 +1457,15 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(<4 x i16> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX10-DL-NEXT: s_mov_b32 s7, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s3, s3, s8
; GFX10-DL-NEXT: s_and_b32 s5, s5, s8
; GFX10-DL-NEXT: s_and_b32 s3, s3, s7
; GFX10-DL-NEXT: s_and_b32 s5, s5, s7
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
@ -1602,18 +1602,18 @@ define amdgpu_kernel void @notudot2_DiffIndex(<2 x i16> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
; GFX10-DL-NEXT: s_and_b32 s7, s4, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_and_b32 s2, s3, s2
; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 16
; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-DL-NEXT: s_and_b32 s7, s3, s5
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_and_b32 s2, s2, s5
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
@ -1751,20 +1751,20 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(<2 x i16> addrspace(1)* %src1
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: s_lshr_b32 s6, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s7, s3, 16
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_and_b32 s2, s2, s5
; GFX10-DL-NEXT: s_and_b32 s3, s3, s5
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s3, s2, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
@ -2049,21 +2049,21 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(<2 x i16> addrspace(1)* %src1
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_and_b32 s6, s2, s5
; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 16
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s3, s2, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
@ -2349,21 +2349,21 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(<2 x i16> addrspace(1)* %src1
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s5, s2, 16
; GFX10-DL-NEXT: s_lshr_b32 s6, s3, 16
; GFX10-DL-NEXT: s_lshr_b32 s7, s4, 16
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_and_b32 s3, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s7, s6, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
; GFX10-DL-NEXT: s_and_b32 s2, s2, s4
; GFX10-DL-NEXT: s_and_b32 s3, s3, s4
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s3, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
@ -2621,15 +2621,15 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot2_u32_u16 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -2705,22 +2705,22 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
; GFX8-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v2, s2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v1, s2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@ -2732,23 +2732,23 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v2, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v0, v1, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
@ -2760,23 +2760,23 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s7
; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v2, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX9-DL-NEXT: v_mad_i32_i24 v0, v0, v1, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off

View File

@ -108,29 +108,29 @@ define amdgpu_kernel void @idot4_acc32(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s3, s4, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot4_i32_i8 v2, s0, s1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@ -226,21 +226,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i8 s1, s2
; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s3, s2
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s3, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
@ -257,21 +257,21 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2
; GFX9-NODL-NEXT: s_bfe_i32 s3, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v5, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
@ -282,15 +282,15 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -451,15 +451,15 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -928,30 +928,30 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i8 s1, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s1
; GFX8-NEXT: s_bfe_i32 s3, s2, 0x80010
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 8, s0
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 8, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 8, s1
; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80010
; GFX8-NEXT: s_ashr_i32 s4, s1, 24
; GFX8-NEXT: s_sext_i32_i8 s1, s1
; GFX8-NEXT: s_ashr_i32 s2, s0, 24
; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, v5, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s3, v6, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -1037,16 +1037,17 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x80000
@ -1054,25 +1055,25 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 8, s0
; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 8, s1
; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v2
; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v2
; GFX10-DL-NEXT: v_and_b32_e32 v6, s3, v3
; GFX10-DL-NEXT: v_and_b32_e32 v7, s4, v3
; GFX10-DL-NEXT: s_bfe_i32 s0, s2, 0x80000
; GFX10-DL-NEXT: s_bfe_i32 s1, s5, 0x80000
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 8, s2
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 8, s2
; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v3
; GFX10-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 8, s5
; GFX10-DL-NEXT: v_and_b32_e32 v8, s1, v2
; GFX10-DL-NEXT: v_and_b32_e32 v2, s0, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v3, v6, 16, v3
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v7, 16, v8
; GFX10-DL-NEXT: v_lshl_or_b32 v2, v11, 16, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,

View File

@ -111,29 +111,29 @@ define amdgpu_kernel void @udot4_acc32(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s3, s4, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6
; GFX10-DL-NEXT: v_dot4_u32_u8 v2, s0, s1, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@ -278,15 +278,15 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -448,15 +448,15 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -608,24 +608,25 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot2_8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s3, s0, s2
; GFX10-DL-NEXT: s_and_b32 s2, s1, s2
; GFX10-DL-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_movk_i32 s1, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s3, s2, s1
; GFX10-DL-NEXT: s_and_b32 s1, s0, s1
; GFX10-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s0, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s1, s3, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s2, v2
; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@ -752,14 +753,14 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s0, v3, v2
; GFX9-DL-NEXT: v_dot4_u32_u8 v2, s3, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -943,16 +944,17 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
;
; GFX10-DL-LABEL: udot4_CommutationAccrossMADs:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
@ -1138,26 +1140,26 @@ define amdgpu_kernel void @udot4_multiuse_mul1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_and_b32 s6, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0
; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s2, v0
; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s7, v0
; GFX10-DL-NEXT: s_and_b32 s6, s2, s5
; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0
; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s7, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
@ -1345,25 +1347,25 @@ define amdgpu_kernel void @udot4_multiuse_add1(<4 x i8> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5
; GFX10-DL-NEXT: s_and_b32 s8, s3, s2
; GFX10-DL-NEXT: s_and_b32 s2, s4, s2
; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80008
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: s_and_b32 s8, s2, s5
; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0
; GFX10-DL-NEXT: s_bfe_u32 s6, s3, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s7, s4, 0x80010
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s2, v0
; GFX10-DL-NEXT: s_lshr_b32 s2, s3, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s4, 24
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s5, v0
; GFX10-DL-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s5, v0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s4, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1
; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v1, v0
@ -1458,21 +1460,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s3, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s1, s0, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_sext_i32_i8 s4, s0
; GFX8-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
@ -1489,21 +1491,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s1, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s1, s0, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s0
; GFX9-NODL-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
@ -1520,21 +1522,21 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x80008
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80010
; GFX9-DL-NEXT: s_bfe_u32 s1, s0, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_sext_i32_i8 s4, s0
; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
@ -1542,15 +1544,16 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: notdot4_mixedtypes:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x80008
; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x80008
@ -1742,29 +1745,29 @@ define amdgpu_kernel void @udot4_acc32_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s3, 0xff
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-DL-NEXT: s_movk_i32 s5, 0xff
; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_and_b32 s7, s4, s3
; GFX10-DL-NEXT: s_and_b32 s3, s5, s3
; GFX10-DL-NEXT: v_mov_b32_e32 v2, s6
; GFX10-DL-NEXT: v_and_b32_sdwa v0, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_and_b32_sdwa v1, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: s_bfe_u32 s2, s4, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s3, v2
; GFX10-DL-NEXT: s_bfe_u32 s3, s5, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s5, s5, 24
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_and_b32 s7, s2, s5
; GFX10-DL-NEXT: v_mov_b32_e32 v2, s4
; GFX10-DL-NEXT: s_and_b32 s5, s3, s5
; GFX10-DL-NEXT: v_and_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: v_and_b32_sdwa v1, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s7, s5, v2
; GFX10-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 24
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v1, v2
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s2, s3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
@ -1933,38 +1936,39 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: global_load_ushort v3, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s0
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v7, v3, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v5, 8, s1
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 16
; GFX10-DL-NEXT: v_lshl_or_b32 v4, v4, 16, v7
; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX10-DL-NEXT: v_lshl_or_b32 v5, v5, 16, v6
; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v2, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v2, v2, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v6, v3, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_and_b32_sdwa v3, v3, s2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v5, s1, 16, v6
; GFX10-DL-NEXT: v_lshl_or_b32 v2, s0, 16, v2
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v5
; GFX10-DL-NEXT: v_lshl_or_b32 v3, s0, 16, v3
; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v5
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v4, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v3, v3, v2
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v4, v2
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-DL-NEXT: global_store_short v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@ -2084,34 +2088,34 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-NODL-NEXT: s_lshr_b32 s7, s3, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NODL-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX9-NODL-NEXT: s_lshr_b32 s5, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
; GFX9-NODL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NODL-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-NODL-NEXT: s_lshr_b32 s3, s2, 24
; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v3, s0, v3
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NODL-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NODL-NEXT: v_mul_lo_u16_e32 v5, s2, v5
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NODL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NODL-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GFX9-NODL-NEXT: global_load_ubyte v5, v[0:1], off
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v5
; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NODL-NEXT: v_add_u32_e32 v2, v2, v4
; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NODL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@ -2120,48 +2124,49 @@ define amdgpu_kernel void @udot4_acc8_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v0, s2, v0
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v1, s2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: s_lshr_b32 s6, s3, 16
; GFX9-DL-NEXT: s_lshr_b32 s7, s3, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v0, s5, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v1, s4, v1
; GFX9-DL-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s2
; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 16
; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 24
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s0, v3
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 16
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s1
; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s2, v5
; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v4
; GFX9-DL-NEXT: global_load_ubyte v5, v[0:1], off
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2
; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3
; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v3, 8, s0
; GFX10-DL-NEXT: v_lshrrev_b16_e64 v4, 8, s1

View File

@ -156,29 +156,29 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v2, v3
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc32:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s4, s5, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9
; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off
; GFX10-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
@ -321,48 +321,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40000
; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40004
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX8-NEXT: s_lshr_b32 s1, s0, 12
; GFX8-NEXT: s_lshr_b32 s7, s2, 12
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000
; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004
; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: s_lshr_b32 s2, s0, 12
; GFX8-NEXT: s_lshr_b32 s4, s1, 12
; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4
; GFX8-NEXT: s_bfe_i32 s1, s2, 0x40010
; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40010
; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40018
; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: s_bfe_i32 s1, s0, 0x40014
; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40018
; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v10, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_mov_b32_e32 v7, s8
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014
; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018
; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v9, s14
; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX8-NEXT: s_ashr_i32 s1, s1, 28
; GFX8-NEXT: v_mov_b32_e32 v10, s16
; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s8, v5, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v9, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s5, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@ -372,48 +372,48 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40000
; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX9-NEXT: s_lshr_b32 s1, s0, 12
; GFX9-NEXT: s_lshr_b32 s7, s2, 12
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000
; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004
; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: s_lshr_b32 s2, s0, 12
; GFX9-NEXT: s_lshr_b32 s4, s1, 12
; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s1
; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4
; GFX9-NEXT: s_bfe_i32 s1, s2, 0x40010
; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v8, s1
; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40010
; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40018
; GFX9-NEXT: v_mov_b32_e32 v9, s5
; GFX9-NEXT: s_bfe_i32 s1, s0, 0x40014
; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40018
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: v_mov_b32_e32 v10, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: v_mov_b32_e32 v7, s8
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014
; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018
; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v9, s14
; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX9-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-NEXT: v_mov_b32_e32 v10, s16
; GFX9-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_i32_i24 v2, s6, v3, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s8, v5, v2
; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s4, v8, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s1, v9, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s5, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@ -423,87 +423,88 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12
; GFX9-DL-NEXT: s_lshr_b32 s7, s2, 12
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12
; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s4
; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s1
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4
; GFX9-DL-NEXT: s_bfe_i32 s1, s2, 0x40010
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s1
; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40010
; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40018
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s5
; GFX9-DL-NEXT: s_bfe_i32 s1, s0, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40018
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s7
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018
; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14
; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16
; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v3, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v5, v2
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v8, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v9, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2
; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot8_acc16:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12
; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9
; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
@ -814,39 +815,40 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_acc8:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_movk_i32 s2, 0xff
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 12
; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s4
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s5
; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
; GFX10-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12
; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000
; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3
; GFX10-DL-NEXT: s_movk_i32 s4, 0xff
; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4
; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9
; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3
; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4
; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s10, s11
; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3
; GFX10-DL-NEXT: v_and_b32_e32 v4, s2, v4
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s7, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2
; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014
; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s8, s9, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2
; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010
; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2
@ -1643,54 +1645,54 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s5, s7, 28
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s9, s7, 24
; GFX8-NEXT: s_lshl_b32 s11, s7, 20
; GFX8-NEXT: s_lshl_b32 s5, s1, 28
; GFX8-NEXT: s_ashr_i64 s[14:15], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s1, 20
; GFX8-NEXT: s_lshl_b32 s13, s1, 24
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s10
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 16
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: s_lshl_b32 s9, s1, 16
; GFX8-NEXT: s_lshl_b32 s11, s7, 12
; GFX8-NEXT: s_ashr_i64 s[16:17], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mul_i32_i24_e32 v4, s4, v4
; GFX8-NEXT: s_lshl_b32 s5, s1, 12
; GFX8-NEXT: s_lshl_b32 s9, s7, 8
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s16
; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60
; GFX8-NEXT: s_lshl_b32 s13, s1, 8
; GFX8-NEXT: s_ashr_i64 s[18:19], s[4:5], 60
; GFX8-NEXT: s_lshl_b32 s5, s7, 4
; GFX8-NEXT: v_mov_b32_e32 v7, s10
; GFX8-NEXT: s_lshl_b32 s9, s1, 4
; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60
; GFX8-NEXT: s_ashr_i64 s[22:23], s[12:13], 60
; GFX8-NEXT: v_mov_b32_e32 v8, s20
; GFX8-NEXT: s_lshl_b32 s29, s7, 28
; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60
; GFX8-NEXT: s_lshl_b32 s21, s7, 8
; GFX8-NEXT: s_lshl_b32 s23, s7, 12
; GFX8-NEXT: s_lshl_b32 s17, s1, 28
; GFX8-NEXT: s_lshl_b32 s25, s7, 16
; GFX8-NEXT: s_lshl_b32 s27, s7, 24
; GFX8-NEXT: s_lshl_b32 s19, s7, 4
; GFX8-NEXT: s_lshl_b32 s7, s7, 20
; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60
; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60
; GFX8-NEXT: s_lshl_b32 s9, s1, 8
; GFX8-NEXT: s_lshl_b32 s11, s1, 12
; GFX8-NEXT: s_lshl_b32 s13, s1, 16
; GFX8-NEXT: s_lshl_b32 s15, s1, 24
; GFX8-NEXT: s_lshl_b32 s5, s1, 4
; GFX8-NEXT: s_lshl_b32 s1, s1, 20
; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60
; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60
; GFX8-NEXT: s_ashr_i64 s[26:27], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v9, s24
; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60
; GFX8-NEXT: v_mov_b32_e32 v4, s28
; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60
; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s14, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_i32_i24 v2, s8, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s18, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s22, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s26, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v5, s26
; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60
; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3
; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60
; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60
; GFX8-NEXT: v_mov_b32_e32 v6, s24
; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60
; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60
; GFX8-NEXT: v_mov_b32_e32 v7, s22
; GFX8-NEXT: s_ashr_i64 s[32:33], s[18:19], 60
; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60
; GFX8-NEXT: v_mov_b32_e32 v8, s20
; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60
; GFX8-NEXT: v_mov_b32_e32 v9, s32
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s14, v5, v2
; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s8, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s30, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s18
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -1702,17 +1704,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s2, 15
; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40004
; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s12, s2, 28
; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40014
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s2
; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s5, s2, 28
; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010
; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c
; GFX9-NEXT: s_and_b32 s12, s2, 15
; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s12
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5
; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-NEXT: s_lshr_b32 s13, s6, 28
; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010
@ -1722,14 +1728,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: s_and_b32 s18, s6, 15
; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004
; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17
; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15
; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
@ -1769,17 +1771,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_and_b32 s4, s2, 15
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c
; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28
; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40014
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s2
; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28
; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c
; GFX9-DL-NEXT: s_and_b32 s12, s2, 15
; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s12
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5
; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28
; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010
@ -1789,14 +1795,10 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_and_b32 s18, s6, 15
; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s4 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s4 op_sel_hi:[0,1]
; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1]
; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1]
@ -1830,13 +1832,14 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
@ -2273,14 +2276,15 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
;
; GFX10-DL-LABEL: idot8_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX10-DL-NEXT: ; implicit-def: $vcc_hi
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -21,10 +21,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@ -33,10 +33,10 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
@ -62,8 +62,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@ -73,8 +73,8 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
@ -123,14 +123,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32:
@ -145,14 +145,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out,
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -191,13 +191,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
@ -208,13 +208,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -251,13 +251,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
@ -268,13 +268,13 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -318,14 +318,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
@ -340,14 +340,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -394,14 +394,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
@ -416,14 +416,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, v5, -v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -470,14 +470,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
@ -492,14 +492,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -v5, -v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -547,14 +547,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
@ -569,14 +569,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half>
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v2, -|v5|, -v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64

View File

@ -313,16 +313,16 @@ define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -441,16 +441,16 @@ define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspa
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; encoding: [0x10,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -569,16 +569,16 @@ define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; encoding: [0x18,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -693,16 +693,16 @@ define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrsp
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; encoding: [0x20,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -821,16 +821,16 @@ define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrsp
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; encoding: [0x28,0x1f,0x02,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -949,16 +949,16 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrsp
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x30,0x1f,0x03,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1081,16 +1081,16 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 ad
; GFX10-NEXT: v_mov_b32_e32 v8, v3 ; encoding: [0x03,0x03,0x10,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x38,0x1f,0x01,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1205,16 +1205,16 @@ define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspa
; GFX10-NEXT: v_mov_b32_e32 v5, v0 ; encoding: [0x00,0x03,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, s9 ; encoding: [0x09,0x02,0x0e,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; encoding: [0x00,0x1f,0x06,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v6, s8 ; encoding: [0x08,0x02,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[6:7], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x06,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1333,16 +1333,16 @@ define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspa
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v7, v2 ; encoding: [0x02,0x03,0x0e,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, v1 ; encoding: [0x01,0x03,0x0c,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s9 ; encoding: [0x09,0x02,0x16,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v10, s8 ; encoding: [0x08,0x02,0x14,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; encoding: [0x00,0x03,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, v0 ; encoding: [0x00,0x03,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; encoding: [0x00,0x03,0x08,0x7e]
; GFX10-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; encoding: [0x08,0x1f,0x05,0xf0,0x05,0x00,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v5, s8 ; encoding: [0x08,0x02,0x0a,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v6, s9 ; encoding: [0x09,0x02,0x0c,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x04,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[5:6], v4, off ; encoding: [0x00,0x80,0x70,0xdc,0x05,0x04,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -3247,11 +3247,11 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
; VERDE-LABEL: image_load_mmo:
; VERDE: ; %bb.0:
; VERDE-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 unorm
; VERDE-NEXT: v_mov_b32_e32 v2, 0
; VERDE-NEXT: v_mov_b32_e32 v3, 0
; VERDE-NEXT: s_mov_b32 m0, -1
; VERDE-NEXT: v_add_i32_e32 v3, vcc, 16, v0
; VERDE-NEXT: ds_write_b32 v0, v2
; VERDE-NEXT: ds_write_b32 v3, v2
; VERDE-NEXT: ds_write_b32 v0, v3
; VERDE-NEXT: v_add_i32_e32 v0, vcc, 16, v0
; VERDE-NEXT: ds_write_b32 v0, v3
; VERDE-NEXT: s_waitcnt vmcnt(0)
; VERDE-NEXT: v_mov_b32_e32 v0, v1
; VERDE-NEXT: s_waitcnt lgkmcnt(0)
@ -3291,9 +3291,9 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
; GFX10-LABEL: image_load_mmo:
; GFX10: ; %bb.0:
; GFX10-NEXT: image_load v1, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm ; encoding: [0x08,0x11,0x00,0xf0,0x01,0x01,0x00,0x00]
; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: ds_write2_b32 v0, v3, v3 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x03,0x03,0x00]
; GFX10-NEXT: ds_write2_b32 v0, v2, v2 offset1:4 ; encoding: [0x00,0x04,0x38,0xd8,0x00,0x02,0x02,0x00]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: v_mov_b32_e32 v0, v1 ; encoding: [0x01,0x03,0x00,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]

View File

@ -1783,13 +1783,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x
; GFX10-NEXT: v_mov_b32_e32 v10, v0 ; encoding: [0x00,0x03,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v9, v1 ; encoding: [0x01,0x03,0x12,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v11, s13 ; encoding: [0x0d,0x02,0x16,0x7e]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; encoding: [0x00,0x03,0x02,0x7e]
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v10, v9, v2, v3, v4, v5, v6, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; encoding: [0x2c,0x04,0xe9,0xf0,0x0a,0x00,0x40,0x00,0x09,0x02,0x03,0x04,0x05,0x06,0x07,0x08]
; GFX10-NEXT: v_mov_b32_e32 v10, s12 ; encoding: [0x0c,0x02,0x14,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v2, s12 ; encoding: [0x0c,0x02,0x04,0x7e]
; GFX10-NEXT: v_mov_b32_e32 v3, s13 ; encoding: [0x0d,0x02,0x06,0x7e]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: global_store_dword v[10:11], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x0a,0x01,0x7d,0x00]
; GFX10-NEXT: global_store_dword v[2:3], v1, off ; encoding: [0x00,0x80,0x70,0xdc,0x02,0x01,0x7d,0x00]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -9,11 +9,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT0: ; %bb.0: ; %entry
; VARIANT0-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT0-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_mov_b32 s7, 0xf000
; VARIANT0-NEXT: s_mov_b32 s6, 0
; VARIANT0-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT0-NEXT: v_mov_b32_e32 v2, 0
; VARIANT0-NEXT: v_not_b32_e32 v3, v0
; VARIANT0-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT0-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT0-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@ -30,11 +30,11 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT1: ; %bb.0: ; %entry
; VARIANT1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; VARIANT1-NEXT: s_load_dword s2, s[0:1], 0xb
; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_mov_b32 s7, 0xf000
; VARIANT1-NEXT: s_mov_b32 s6, 0
; VARIANT1-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT1-NEXT: v_mov_b32_e32 v2, 0
; VARIANT1-NEXT: v_not_b32_e32 v3, v0
; VARIANT1-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT1-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; VARIANT1-NEXT: s_barrier
@ -51,45 +51,45 @@ define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
; VARIANT2: ; %bb.0: ; %entry
; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT2-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT2-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT2-NEXT: v_mov_b32_e32 v4, s3
; VARIANT2-NEXT: v_xad_u32 v1, v0, -1, s0
; VARIANT2-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT2-NEXT: v_mov_b32_e32 v2, s3
; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0
; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
; VARIANT2-NEXT: v_mov_b32_e32 v0, s3
; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
; VARIANT2-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
; VARIANT2-NEXT: v_mov_b32_e32 v5, s3
; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: s_barrier
; VARIANT2-NEXT: global_load_dword v0, v[0:1], off
; VARIANT2-NEXT: global_load_dword v0, v[3:4], off
; VARIANT2-NEXT: s_waitcnt vmcnt(0)
; VARIANT2-NEXT: global_store_dword v[3:4], v0, off
; VARIANT2-NEXT: global_store_dword v[1:2], v0, off
; VARIANT2-NEXT: s_endpgm
;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c
; VARIANT3-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VARIANT3-NEXT: s_waitcnt lgkmcnt(0)
; VARIANT3-NEXT: v_mov_b32_e32 v4, s3
; VARIANT3-NEXT: v_xad_u32 v1, v0, -1, s0
; VARIANT3-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; VARIANT3-NEXT: v_mov_b32_e32 v2, s3
; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0
; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1
; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4]
; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
; VARIANT3-NEXT: v_mov_b32_e32 v0, s3
; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
; VARIANT3-NEXT: v_lshlrev_b64 v[1:2], 2, v[1:2]
; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
; VARIANT3-NEXT: v_mov_b32_e32 v5, s3
; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v1
; VARIANT3-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc
; VARIANT3-NEXT: s_barrier
; VARIANT3-NEXT: global_load_dword v0, v[0:1], off
; VARIANT3-NEXT: global_load_dword v0, v[3:4], off
; VARIANT3-NEXT: s_waitcnt vmcnt(0)
; VARIANT3-NEXT: global_store_dword v[3:4], v0, off
; VARIANT3-NEXT: global_store_dword v[1:2], v0, off
; VARIANT3-NEXT: s_endpgm
entry:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -1537,8 +1537,8 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out)
define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
; SI-LABEL: simplify_bfe_u32_multi_use_arg:
; SI: ; %bb.0:
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s6, s2

View File

@ -85,20 +85,20 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s9, 0xfc01
; SI-NEXT: s_mov_b32 s5, 0xfffff
; SI-NEXT: s_mov_b32 s4, -1
; SI-NEXT: s_movk_i32 s11, 0xfc01
; SI-NEXT: s_mov_b32 s9, 0xfffff
; SI-NEXT: s_mov_b32 s8, -1
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_brev_b32 s8, -2
; SI-NEXT: s_brev_b32 s10, -2
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_mov_b32 s7, 0x80000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
; SI-NEXT: v_add_i32_e32 v10, vcc, s9, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[4:5], v10
; SI-NEXT: v_add_i32_e32 v10, vcc, s11, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[8:9], v10
; SI-NEXT: v_cmp_eq_u32_e32 vcc, -1, v10
; SI-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
; SI-NEXT: v_bfi_b32 v11, s8, v8, v3
; SI-NEXT: v_bfi_b32 v11, s10, v8, v3
; SI-NEXT: v_and_b32_e32 v9, v3, v5
; SI-NEXT: v_and_b32_e32 v8, v2, v4
; SI-NEXT: v_lshr_b64 v[6:7], s[6:7], v10
@ -122,26 +122,26 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
;
; CI-LABEL: v_round_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_mov_b32_e32 v5, 0x3ff00000
; CI-NEXT: v_mov_b32_e32 v4, 0
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_brev_b32 s6, -2
; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[6:7], v[2:3]
; CI-NEXT: v_add_f64 v[8:9], v[2:3], -v[6:7]
; CI-NEXT: v_bfi_b32 v2, s2, v5, v3
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[8:9]|, 0.5
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: v_cndmask_b32_e32 v5, 0, v2, vcc
; CI-NEXT: v_add_f64 v[2:3], v[6:7], v[4:5]
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT: v_bfi_b32 v2, s6, v8, v3
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep = getelementptr double, double addrspace(1)* %in, i32 %tid

View File

@ -519,11 +519,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX900-NEXT: v_mov_b32_e32 v2, 0
; GFX900-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_write_b16 v3, v0
; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@ -532,11 +532,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in,
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: v_mov_b32_e32 v2, 0
; GFX906-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: ds_write_b16 v3, v0
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: ds_write_b16 v2, v0
; GFX906-NEXT: v_bfi_b32 v0, v3, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
@ -580,10 +580,10 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX906-NEXT: v_mov_b32_e32 v3, 0
; GFX906-NEXT: ds_write_b16 v3, v2
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX906-NEXT: v_mov_b32_e32 v4, 0
; GFX906-NEXT: ds_write_b16 v4, v3
; GFX906-NEXT: s_waitcnt lgkmcnt(1)
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
@ -596,11 +596,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: ds_write_b16 v3, v1
; GFX803-NEXT: v_mov_b32_e32 v2, 0
; GFX803-NEXT: ds_write_b16 v2, v1
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
; GFX803-NEXT: v_or_b32_e32 v0, v0, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@ -618,12 +618,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: ds_write_b16 v3, v5
; GFX900-NEXT: v_bfi_b32 v0, v4, v0, v1
; GFX900-NEXT: ds_write_b16 v3, v4
; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX900-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX900-NEXT: global_store_dword v[0:1], v0, off
; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
@ -632,12 +632,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: ds_read_u16 v0, v0
; GFX906-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX906-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX906-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: ds_write_b16 v2, v0
; GFX906-NEXT: ds_write_b16 v3, v5
; GFX906-NEXT: v_bfi_b32 v0, v4, v0, v1
; GFX906-NEXT: ds_write_b16 v3, v4
; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1
; GFX906-NEXT: global_store_dword v[0:1], v0, off
; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX906-NEXT: s_setpc_b64 s[30:31]
@ -648,11 +648,11 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_lshlrev_b32_e32 v4, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@ -1035,10 +1035,10 @@ define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@ -1117,10 +1117,10 @@ define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: flat_load_ubyte v0, v[0:1]
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@ -1524,11 +1524,11 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@ -1606,11 +1606,11 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in,
; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094
; GFX803-NEXT: s_mov_b32 s4, 0x5040c00
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]

View File

@ -112,22 +112,22 @@ bb:
define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 addrspace(1)* nocapture readonly %arg1) {
; GCN-LABEL: muli24_shl64:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1
; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0
; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64
; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -12,10 +12,10 @@ define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@ -74,51 +74,51 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v2, v4, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
@ -129,7 +129,7 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -153,13 +153,13 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v2, s0, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_s_v2i16:
@ -171,39 +171,39 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v2, s0, v3
; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_lshr_b32 s9, s8, 16
; CI-NEXT: s_mov_b32 s10, 0xffff
; CI-NEXT: s_and_b32 s8, s8, s10
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s10, v2
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -228,13 +228,13 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_s_v_v2i16:
@ -246,39 +246,39 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, s0
; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_lshr_b32 s9, s8, 16
; CI-NEXT: s_mov_b32 s10, 0xffff
; CI-NEXT: s_and_b32 s8, s8, s10
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s10, v2
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshr_b32_e32 v3, s9, v3
; CI-NEXT: v_lshr_b32_e32 v2, s8, v2
; CI-NEXT: v_lshr_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -302,46 +302,46 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v2, v3, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 8
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v4, 8
; VI-NEXT: v_lshrrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8
; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
@ -349,7 +349,7 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; CI-NEXT: v_lshr_b32_e32 v2, 8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -370,13 +370,13 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v2, 8, v3 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
@ -387,32 +387,32 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_imm_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -428,60 +428,60 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1
; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
@ -500,7 +500,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -557,22 +557,22 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i
;
; CI-LABEL: lshr_v_imm_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00ff
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64

View File

@ -16,14 +16,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_ushort v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_max_i16_e32 v0, v0, v1
; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_i16:
@ -38,14 +38,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrs
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_ushort v5, v[0:1], off
; GFX9-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i16_e32 v2, v5, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: v_max_i16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@ -73,16 +73,16 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v3, v5, v2
; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_max_i16_e32 v2, v0, v1
; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v2i16:
@ -97,14 +97,14 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v2, v5, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
@ -124,35 +124,35 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v7, s5
; VI-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
; VI-NEXT: flat_load_ushort v8, v[4:5]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v8, v[6:7]
; VI-NEXT: flat_load_dword v9, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v6
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_max_i16_e32 v0, v8, v0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v9, v1
; VI-NEXT: v_max_i16_sdwa v1, v9, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: flat_store_dword v[6:7], v1
; VI-NEXT: flat_store_short v[6:7], v0
; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
@ -160,8 +160,8 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
@ -169,19 +169,18 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v7, v[0:1], off
; GFX9-NEXT: global_load_short_d16 v5, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v3, v6, v5
; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v2, v7, v2
; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-NEXT: v_pk_max_i16 v1, v7, v6
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
@ -272,14 +271,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_ushort v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_max_i16_e32 v0, v0, v1
; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sgt_i16:
@ -294,14 +293,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrs
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_ushort v5, v[0:1], off
; GFX9-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_i16_e32 v2, v5, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: v_max_i16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@ -329,14 +328,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_ushort v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_u16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_max_u16_e32 v0, v0, v1
; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_uge_i16:
@ -351,14 +350,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrs
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_ushort v5, v[0:1], off
; GFX9-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u16_e32 v2, v5, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: v_max_u16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@ -386,14 +385,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_ushort v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_u16_e32 v2, v5, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_max_u16_e32 v0, v0, v1
; VI-NEXT: flat_store_short v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_i16:
@ -408,14 +407,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrs
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_ushort v5, v[0:1], off
; GFX9-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_u16_e32 v2, v5, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: v_max_u16_e32 v0, v0, v1
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
@ -442,16 +441,16 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_u16_e32 v3, v5, v2
; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_max_u16_e32 v2, v0, v1
; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_v2i16:
@ -466,14 +465,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v5, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_u16 v2, v5, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_max_u16 v0, v0, v1
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid

View File

@ -25,34 +25,34 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
; CHECK: undef %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
; CHECK: %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[DEF6:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF7:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $exec
; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $exec
; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $exec
; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $exec
; CHECK: [[DEF8:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: %4.sub1:vreg_64 = V_ADD_U32_e32 [[COPY]], [[COPY]], implicit $exec
; CHECK: GLOBAL_STORE_DWORDX2 %19, %4, 32, 0, 0, 0, implicit $exec
; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF1]], 0, 0, 0, 0, implicit $exec
; CHECK: [[DEF2]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: %11.sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF2]], 0, 0, 0, 0, implicit $exec
; CHECK: [[DEF1]].sub0:vreg_64 = GLOBAL_LOAD_DWORD [[DEF3]], 0, 0, 0, 0, implicit $exec
; CHECK: dead %20:vgpr_32 = GLOBAL_LOAD_DWORD %11, 0, 0, 0, 0, implicit $exec
; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF4]], 0, 0, 0, 0, implicit $exec
; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF2]], implicit $exec
; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF5]], 0, 0, 0, 0, implicit $exec
; CHECK: S_NOP 0, implicit [[DEF7]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF6]], implicit [[V_MOV_B32_e32_]]
; CHECK: GLOBAL_STORE_DWORD [[DEF5]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
; CHECK: dead %21:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF6]], 0, 0, 0, 0, implicit $exec
; CHECK: [[V_LSHLREV_B64_:%[0-9]+]]:vreg_64 = V_LSHLREV_B64 2, [[DEF1]], implicit $exec
; CHECK: dead %22:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF7]], 0, 0, 0, 0, implicit $exec
; CHECK: S_NOP 0, implicit [[DEF5]], implicit [[V_LSHLREV_B64_]].sub0, implicit [[DEF4]], implicit [[V_MOV_B32_e32_]]
; CHECK: GLOBAL_STORE_DWORD [[DEF7]], [[V_MOV_B32_e32_1]], 0, 0, 0, 0, implicit $exec
; CHECK: bb.1:
; CHECK: successors: %bb.2(0x80000000)
; CHECK: S_SETREG_IMM32_B32 0, 1

View File

@ -39,6 +39,8 @@ body: |
; CHECK: successors: %bb.1(0x80000000)
; CHECK: INLINEASM &"", 1, 851978, def dead %11
; CHECK: GLOBAL_STORE_DWORD undef %12:vreg_64, [[BUFFER_LOAD_DWORD_OFFEN]], 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 undef %14:vgpr_32, 0, 0, implicit $exec :: (load 8, addrspace 3)
; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %15, 851978, def %16
; CHECK: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
@ -46,24 +48,22 @@ body: |
; CHECK: [[DS_READ_B32_gfx9_2:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 undef %20:vgpr_32, 0, 0, implicit $exec
; CHECK: INLINEASM &"def $0 $1", 1, 851978, def %21, 851978, def %22
; CHECK: [[DS_READ_B32_gfx9_3:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec
; CHECK: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK: INLINEASM &"", 1, 851978, def dead [[V_MOV_B32_e32_2]], 851978, def dead [[V_MOV_B32_e32_3]], 851977, [[DS_READ_B64_gfx9_]].sub0, 2147483657, [[V_MOV_B32_e32_2]](tied-def 3), 2147549193, [[V_MOV_B32_e32_3]](tied-def 5), 851977, %15, 851977, %16, 851977, [[DS_READ_B32_gfx9_1]], 851977, [[DS_READ_B32_gfx9_]], 851977, [[DS_READ_B32_gfx9_3]], 851977, [[DS_READ_B32_gfx9_2]]
; CHECK: %5.sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: DS_WRITE_B32_gfx9 undef %28:vgpr_32, %21, 0, 0, implicit $exec :: (store 4, addrspace 3)
; CHECK: DS_WRITE_B32_gfx9 undef %29:vgpr_32, %22, 0, 0, implicit $exec :: (store 4, addrspace 3)
; CHECK: DS_WRITE_B64_gfx9 undef %30:vgpr_32, %5, 0, 0, implicit $exec :: (store 8, addrspace 3)
; CHECK: undef %31.sub1:vreg_64 = FLAT_LOAD_DWORD undef %32:vreg_64, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
; CHECK: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 1, [[DEF2]], implicit $exec
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; CHECK: [[DEF]].sub1:vreg_64 = COPY [[V_MOV_B32_e32_]]
; CHECK: [[V_MUL_LO_U32_:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_ADD_U32_e32_]], [[S_MOV_B32_]], implicit $exec
; CHECK: [[V_CMP_GT_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_GT_U32_e64 64, [[V_ADD_U32_e32_]], implicit $exec
; CHECK: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, [[V_ADD_U32_e32_]], [[V_CMP_GT_U32_e64_]], implicit $exec
; CHECK: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_]], [[DEF1]], implicit $exec
; CHECK: [[V_MUL_LO_U32_1:%[0-9]+]]:vgpr_32 = V_MUL_LO_U32 [[V_CNDMASK_B32_e64_]], [[S_MOV_B32_]], implicit $exec
; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_SUB_U32_e32_]], [[DEF]].sub0, implicit $exec
; CHECK: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[V_MUL_LO_U32_1]], [[V_MUL_LO_U32_]], implicit $exec
; CHECK: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; CHECK: [[DEF]].sub0:vreg_64 = V_ADD_U32_e32 [[V_SUB_U32_e32_1]], [[V_ADD_U32_e32_1]], implicit $exec
; CHECK: undef %38.sub0:vreg_64, %39:sreg_64_xexec = V_ADD_I32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, [[DEF]].sub0, 0, implicit $exec
; CHECK: undef %40.sub1:vreg_64, dead %41:sreg_64_xexec = V_ADDC_U32_e64 [[COPY1]], [[DEF]].sub1, %39, 0, implicit $exec

View File

@ -19,10 +19,10 @@ body: |
; GCN-LABEL: name: handleMove_bundle
; GCN: liveins: $sgpr4_sgpr5
; GCN: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5
; GCN: $vcc_hi = IMPLICIT_DEF
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (dereferenceable invariant load 4, align 16, addrspace 4)
; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GCN: $vcc_hi = IMPLICIT_DEF
; GCN: DS_WRITE_B32_gfx9 [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (store 4, addrspace 3)
; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN: $m0 = S_MOV_B32 0

View File

@ -1006,33 +1006,33 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
;
; GCN-IR-LABEL: s_test_sdiv24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_sext_i32_i16 s7, s2
; GCN-IR-NEXT: s_ashr_i32 s2, s7, 31
; GCN-IR-NEXT: s_sext_i32_i16 s5, s3
; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24
; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
; GCN-IR-NEXT: s_mov_b32 s5, s4
; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 24
; GCN-IR-NEXT: s_xor_b64 s[8:9], s[4:5], s[8:9]
; GCN-IR-NEXT: s_sub_u32 s10, s8, s4
; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[2:3], 24
; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_mov_b32 s3, s2
; GCN-IR-NEXT: s_subb_u32 s11, s9, s4
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[2:3], s[6:7]
; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s6
; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
; GCN-IR-NEXT: s_ashr_i32 s6, s5, 31
; GCN-IR-NEXT: s_ashr_i64 s[12:13], s[4:5], 24
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[2:3], s[8:9]
; GCN-IR-NEXT: s_sub_u32 s10, s4, s2
; GCN-IR-NEXT: s_mov_b32 s7, s6
; GCN-IR-NEXT: s_subb_u32 s11, s5, s2
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[6:7], s[12:13]
; GCN-IR-NEXT: s_sub_u32 s8, s4, s6
; GCN-IR-NEXT: s_subb_u32 s9, s5, s6
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8
; GCN-IR-NEXT: s_add_i32 s0, s0, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s1, s7
; GCN-IR-NEXT: s_flbit_i32_b32 s1, s9
; GCN-IR-NEXT: v_mov_b32_e32 v1, s0
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s10
; GCN-IR-NEXT: v_mov_b32_e32 v0, s1
; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s7, 0
; GCN-IR-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GCN-IR-NEXT: s_add_i32 s0, s0, 32
; GCN-IR-NEXT: s_flbit_i32_b32 s1, s11
; GCN-IR-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
@ -1042,7 +1042,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v2, v3
; GCN-IR-NEXT: v_subb_u32_e64 v1, s[0:1], 0, 0, vcc
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[6:7], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 0
; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[10:11], 0
; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1]
; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[14:15]
@ -1074,10 +1074,10 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: BB9_4: ; %udiv-preheader
; GCN-IR-NEXT: v_not_b32_e32 v2, v2
; GCN-IR-NEXT: v_lshr_b64 v[6:7], s[10:11], v4
; GCN-IR-NEXT: s_add_u32 s10, s6, -1
; GCN-IR-NEXT: s_add_u32 s10, s8, -1
; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, v2, v3
; GCN-IR-NEXT: v_mov_b32_e32 v8, 0
; GCN-IR-NEXT: s_addc_u32 s11, s7, -1
; GCN-IR-NEXT: s_addc_u32 s11, s9, -1
; GCN-IR-NEXT: v_addc_u32_e64 v5, s[0:1], -1, 0, vcc
; GCN-IR-NEXT: v_mov_b32_e32 v9, 0
; GCN-IR-NEXT: v_mov_b32_e32 v3, 0
@ -1092,9 +1092,9 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, s10, v6
; GCN-IR-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc
; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v2
; GCN-IR-NEXT: v_and_b32_e32 v10, s6, v8
; GCN-IR-NEXT: v_and_b32_e32 v10, s8, v8
; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v8
; GCN-IR-NEXT: v_and_b32_e32 v11, s7, v8
; GCN-IR-NEXT: v_and_b32_e32 v11, s9, v8
; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4
; GCN-IR-NEXT: v_or_b32_e32 v1, v9, v1
; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
@ -1112,16 +1112,16 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR-NEXT: v_or_b32_e32 v0, v2, v0
; GCN-IR-NEXT: v_or_b32_e32 v1, v3, v1
; GCN-IR-NEXT: BB9_7: ; %udiv-end
; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GCN-IR-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3]
; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0
; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1
; GCN-IR-NEXT: v_mov_b32_e32 v2, s1
; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0
; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-IR-NEXT: s_mov_b32 s11, 0xf000
; GCN-IR-NEXT: s_mov_b32 s10, -1
; GCN-IR-NEXT: buffer_store_short v1, off, s[8:11], 0 offset:4
; GCN-IR-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-IR-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%1 = ashr i48 %x, 24
%2 = ashr i48 %y, 24

View File

@ -281,28 +281,28 @@ define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrs
define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-LABEL: shl_i16_computed_amount:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s15, s7
; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
; GCN-NEXT: s_mov_b32 s15, s3
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, 3, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
@ -402,35 +402,35 @@ define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s10, s6
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s2
; GCN-NEXT: s_mov_b32 s9, s3
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s15, s7
; GCN-NEXT: s_mov_b64 s[12:13], s[2:3]
; GCN-NEXT: s_mov_b32 s15, s3
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s0, 0xffff
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i16:
@ -481,17 +481,17 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; GCN-LABEL: shl_v4i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; GCN-NEXT: s_mov_b32 s8, 0xffff
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
@ -510,7 +510,7 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
; GCN-NEXT: v_and_b32_e32 v2, s8, v2
; GCN-NEXT: v_or_b32_e32 v3, v3, v5
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
;
; EG-LABEL: shl_v4i16:
@ -869,20 +869,20 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %
define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: v_shl_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[8:9], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_mov_b64 s[4:5], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_endpgm
;
; EG-LABEL: v_shl_32_i64:

View File

@ -73,51 +73,51 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v2, v4, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:4
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
@ -128,7 +128,7 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -152,13 +152,13 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v2, s0, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
@ -170,39 +170,39 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v2, s0, v3
; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s9, 0xffff
; CI-NEXT: s_lshr_b32 s10, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s9
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s10, v3
; CI-NEXT: v_and_b32_e32 v2, s9, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -227,13 +227,13 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
@ -245,17 +245,17 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, s0
; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
@ -270,12 +270,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_lshr_b32 s9, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
; CI-NEXT: v_lshl_b32_e32 v2, s9, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
@ -301,46 +301,46 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v2, v3, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0]
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 8
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v4, 8
; VI-NEXT: v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@ -349,7 +349,7 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -370,13 +370,13 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v3 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
@ -387,33 +387,33 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -429,60 +429,60 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
@ -501,7 +501,7 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@ -538,22 +538,22 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_mov_b32 s4, 0xff000000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xff000000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_and_b32_e32 v0, s0, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_and_b32_e32 v4, s4, v4
; VI-NEXT: v_and_b32_e32 v4, s0, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@ -561,16 +561,16 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
;
; CI-LABEL: shl_v_imm_v4i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[4:5], s[2:3]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00
; CI-NEXT: s_mov_b64 s[2:3], s[6:7]
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
@ -580,7 +580,7 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64

File diff suppressed because it is too large Load Diff

View File

@ -294,9 +294,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@ -309,9 +309,9 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v0
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0

View File

@ -1185,25 +1185,25 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
;
; GCN-IR-LABEL: s_test_srem24_48:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xe
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xc
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
; GCN-IR-NEXT: s_load_dword s4, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_sext_i32_i16 s5, s3
; GCN-IR-NEXT: s_ashr_i32 s10, s5, 31
; GCN-IR-NEXT: s_sext_i32_i16 s3, s6
; GCN-IR-NEXT: s_sext_i32_i16 s3, s3
; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 24
; GCN-IR-NEXT: s_sext_i32_i16 s5, s5
; GCN-IR-NEXT: s_ashr_i32 s2, s3, 31
; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 24
; GCN-IR-NEXT: s_ashr_i32 s4, s5, 31
; GCN-IR-NEXT: s_mov_b32 s3, s2
; GCN-IR-NEXT: s_ashr_i64 s[4:5], s[4:5], 24
; GCN-IR-NEXT: s_mov_b32 s11, s10
; GCN-IR-NEXT: s_mov_b32 s5, s4
; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], s[10:11]
; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s6, s6, s2
; GCN-IR-NEXT: s_subb_u32 s7, s7, s2
; GCN-IR-NEXT: s_sub_u32 s8, s4, s10
; GCN-IR-NEXT: s_subb_u32 s9, s5, s10
; GCN-IR-NEXT: s_sub_u32 s8, s8, s4
; GCN-IR-NEXT: s_subb_u32 s9, s9, s4
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-IR-NEXT: s_flbit_i32_b32 s0, s8
; GCN-IR-NEXT: s_add_i32 s0, s0, 32

View File

@ -8,10 +8,10 @@ define void @local_store_i56(i56 addrspace(3)* %ptr, i56 %arg) #0 {
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CIVI-NEXT: ds_write_b16 v0, v2 offset:4
; CIVI-NEXT: ds_write_b32 v0, v1
; CIVI-NEXT: ds_write_b8 v0, v3 offset:6
; CIVI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; CIVI-NEXT: ds_write_b8 v0, v1 offset:6
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;
@ -53,24 +53,24 @@ define amdgpu_kernel void @local_store_i55(i55 addrspace(3)* %ptr, i55 %arg) #0
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
; FIJI-NEXT: s_load_dword s0, s[4:5], 0x0
; FIJI-NEXT: s_load_dword s1, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s2, s[4:5], 0xc
; FIJI-NEXT: s_load_dword s2, s[4:5], 0x8
; FIJI-NEXT: s_load_dword s1, s[4:5], 0xc
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s0
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_and_b32 s3, s2, 0xffff
; FIJI-NEXT: s_and_b32 s3, s1, 0xffff
; FIJI-NEXT: s_add_u32 s0, s4, 14
; FIJI-NEXT: v_mov_b32_e32 v3, s1
; FIJI-NEXT: s_addc_u32 s1, s5, 0
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FIJI-NEXT: ds_write_b16 v2, v3 offset:4
; FIJI-NEXT: v_mov_b32_e32 v3, s2
; FIJI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b16 v2, v1 offset:4
; FIJI-NEXT: ds_write_b8 v2, v0 offset:6
; FIJI-NEXT: ds_write_b32 v2, v3
; FIJI-NEXT: s_endpgm
@ -109,9 +109,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s2
; HAWAII-NEXT: ds_write_b16 v0, v1 offset:4
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: ds_write_b16 v0, v2 offset:4
; HAWAII-NEXT: ds_write_b32 v0, v1
; HAWAII-NEXT: s_endpgm
;
@ -123,9 +123,9 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: ds_write_b16 v0, v1 offset:4
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: ds_write_b16 v0, v2 offset:4
; FIJI-NEXT: ds_write_b32 v0, v1
; FIJI-NEXT: s_endpgm
;
@ -136,10 +136,10 @@ define amdgpu_kernel void @local_store_i48(i48 addrspace(3)* %ptr, i48 %arg) #0
; GFX9-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: ds_write_b16 v0, v2 offset:4
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: ds_write_b16 v0, v1 offset:4
; GFX9-NEXT: ds_write_b32 v0, v2
; GFX9-NEXT: s_endpgm
store i48 %arg, i48 addrspace(3)* %ptr, align 8
ret void
@ -154,11 +154,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
; HAWAII-NEXT: s_mov_b32 m0, -1
; HAWAII-NEXT: s_waitcnt lgkmcnt(0)
; HAWAII-NEXT: v_mov_b32_e32 v2, s2
; HAWAII-NEXT: s_and_b32 s3, s3, 1
; HAWAII-NEXT: v_mov_b32_e32 v0, s3
; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: s_and_b32 s0, s3, 1
; HAWAII-NEXT: v_mov_b32_e32 v3, s0
; HAWAII-NEXT: ds_write_b8 v2, v3 offset:8
; HAWAII-NEXT: ds_write_b64 v2, v[0:1]
; HAWAII-NEXT: s_endpgm
;
@ -170,11 +170,11 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
; FIJI-NEXT: v_mov_b32_e32 v2, s2
; FIJI-NEXT: s_and_b32 s3, s3, 1
; FIJI-NEXT: v_mov_b32_e32 v0, s3
; FIJI-NEXT: ds_write_b8 v2, v0 offset:8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s1
; FIJI-NEXT: s_and_b32 s0, s3, 1
; FIJI-NEXT: v_mov_b32_e32 v3, s0
; FIJI-NEXT: ds_write_b8 v2, v3 offset:8
; FIJI-NEXT: ds_write_b64 v2, v[0:1]
; FIJI-NEXT: s_endpgm
;
@ -186,9 +186,9 @@ define amdgpu_kernel void @local_store_i65(i65 addrspace(3)* %ptr, i65 %arg) #0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_and_b32 s3, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_and_b32 s0, s3, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: ds_write_b8 v2, v3 offset:8
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
@ -222,9 +222,9 @@ define void @local_store_i17(i17 addrspace(3)* %ptr, i17 %arg) #0 {
; CIVI: ; %bb.0:
; CIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: s_mov_b32 m0, -1
; CIVI-NEXT: v_bfe_u32 v2, v1, 16, 1
; CIVI-NEXT: ds_write_b16 v0, v1
; CIVI-NEXT: ds_write_b8 v0, v2 offset:2
; CIVI-NEXT: v_bfe_u32 v1, v1, 16, 1
; CIVI-NEXT: ds_write_b8 v0, v1 offset:2
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_setpc_b64 s[30:31]
;

View File

@ -7,7 +7,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-LABEL: v_test_sub_v2i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@ -15,8 +15,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
@ -30,7 +30,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@ -38,8 +38,8 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@ -166,42 +166,42 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s8, 0x1c8007b
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -216,42 +216,42 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s8, 0xfc21fcb3
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0x3df
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -265,41 +265,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1 op_sel_hi:[1,0]
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 1
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -313,40 +313,40 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -361,41 +361,41 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_mov_b32 s8, 1.0
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s4, 1.0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s8
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0xffffc080
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -411,7 +411,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@ -419,8 +419,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
@ -436,7 +436,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@ -444,8 +444,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
@ -473,7 +473,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@ -481,14 +481,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: global_load_dword v1, v[4:5], off
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
@ -500,26 +500,26 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v4, v[4:5]
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v2, v4
; VI-NEXT: v_sub_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v4, v2
; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -539,7 +539,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@ -547,8 +547,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
@ -564,7 +564,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@ -572,8 +572,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@ -603,7 +603,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
@ -611,8 +611,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
@ -631,7 +631,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
@ -639,8 +639,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]

View File

@ -704,41 +704,41 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: s_load_dword s3, s[0:1], 0xe
; GCN-NEXT: s_mov_b32 s5, 0xff000000
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, s4
; GCN-NEXT: v_cvt_f32_ubyte3_e32 v2, s4
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s2, s2, s5
; GCN-NEXT: s_and_b32 s3, s3, s4
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 24
; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 24
; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0
; GCN-NEXT: s_load_dword s6, s[0:1], 0xb
; GCN-NEXT: s_load_dword s7, s[0:1], 0xc
; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24
; GCN-NEXT: v_mov_b32_e32 v9, 0
; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v2
; GCN-NEXT: v_mac_f32_e32 v1, 0x4f800000, v2
; GCN-NEXT: v_rcp_f32_e32 v1, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s7, s7, s4
; GCN-NEXT: s_and_b32 s6, s6, s5
; GCN-NEXT: s_sub_u32 s8, 0, s2
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
; GCN-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
; GCN-NEXT: v_trunc_f32_e32 v2, v2
; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: s_subb_u32 s9, 0, s3
; GCN-NEXT: v_mov_b32_e32 v8, 0
; GCN-NEXT: v_mul_lo_u32 v3, s8, v2
; GCN-NEXT: v_mul_hi_u32 v4, s8, v0
; GCN-NEXT: v_mul_lo_u32 v5, s9, v0
; GCN-NEXT: v_mul_hi_u32 v4, s8, v1
; GCN-NEXT: v_mul_lo_u32 v5, s9, v1
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_mul_lo_u32 v4, s8, v0
; GCN-NEXT: v_mul_lo_u32 v4, s8, v1
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT: v_mul_lo_u32 v6, v0, v3
; GCN-NEXT: v_mul_hi_u32 v5, v0, v3
; GCN-NEXT: v_mul_hi_u32 v7, v0, v4
; GCN-NEXT: v_mul_lo_u32 v6, v1, v3
; GCN-NEXT: v_mul_hi_u32 v5, v1, v3
; GCN-NEXT: v_mul_hi_u32 v7, v1, v4
; GCN-NEXT: v_mul_hi_u32 v10, v2, v3
; GCN-NEXT: v_mul_lo_u32 v3, v2, v3
; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6
@ -749,18 +749,18 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v8, vcc
; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3
; GCN-NEXT: v_add_i32_e64 v1, s[2:3], v1, v3
; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v5, vcc
; GCN-NEXT: v_addc_u32_e64 v3, vcc, v2, v4, s[2:3]
; GCN-NEXT: v_mul_lo_u32 v5, s8, v3
; GCN-NEXT: v_mul_hi_u32 v6, s8, v0
; GCN-NEXT: v_mul_lo_u32 v7, s9, v0
; GCN-NEXT: v_mul_hi_u32 v6, s8, v1
; GCN-NEXT: v_mul_lo_u32 v7, s9, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GCN-NEXT: v_mul_lo_u32 v6, s8, v0
; GCN-NEXT: v_mul_lo_u32 v6, s8, v1
; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT: v_mul_lo_u32 v11, v0, v5
; GCN-NEXT: v_mul_hi_u32 v13, v0, v5
; GCN-NEXT: v_mul_hi_u32 v12, v0, v6
; GCN-NEXT: v_mul_lo_u32 v11, v1, v5
; GCN-NEXT: v_mul_hi_u32 v13, v1, v5
; GCN-NEXT: v_mul_hi_u32 v12, v1, v6
; GCN-NEXT: v_mul_hi_u32 v10, v3, v6
; GCN-NEXT: v_mul_lo_u32 v6, v3, v6
; GCN-NEXT: v_mul_hi_u32 v7, v3, v5
@ -774,50 +774,50 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT: v_addc_u32_e64 v2, vcc, v2, v5, s[2:3]
; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT: v_mov_b32_e32 v3, s6
; GCN-NEXT: v_alignbit_b32 v3, s7, v3, 24
; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_hi_u32 v5, v3, v0
; GCN-NEXT: v_mul_hi_u32 v5, v3, v1
; GCN-NEXT: v_mul_lo_u32 v4, v3, v2
; GCN-NEXT: v_mul_hi_u32 v6, v3, v2
; GCN-NEXT: v_mul_hi_u32 v0, 0, v0
; GCN-NEXT: v_mul_hi_u32 v1, 0, v1
; GCN-NEXT: v_mul_hi_u32 v2, 0, v2
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc
; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4
; GCN-NEXT: v_addc_u32_e32 v0, vcc, v5, v0, vcc
; GCN-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0
; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GCN-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v4, v1, v2
; GCN-NEXT: v_mul_hi_u32 v5, v1, v0
; GCN-NEXT: v_mul_lo_u32 v6, v1, v0
; GCN-NEXT: v_mul_lo_u32 v4, v0, v2
; GCN-NEXT: v_mul_hi_u32 v5, v0, v1
; GCN-NEXT: v_mul_lo_u32 v6, v0, v1
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6
; GCN-NEXT: v_subb_u32_e32 v4, vcc, 0, v4, vcc
; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v1
; GCN-NEXT: v_sub_i32_e32 v5, vcc, v3, v0
; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v4, vcc
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1
; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v5, v0
; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc
; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v0
; GCN-NEXT: v_add_i32_e32 v6, vcc, 2, v1
; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v2, vcc
; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v0
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v1
; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1
; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0
; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GCN-NEXT: v_cndmask_b32_e64 v1, -1, v1, s[0:1]
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v6, vcc
; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v5, s[0:1]
; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc
; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1]
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[0:1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
@ -827,15 +827,15 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dword s2, s[0:1], 0xb
; GCN-IR-NEXT: s_load_dword s3, s[0:1], 0xc
; GCN-IR-NEXT: s_load_dword s7, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s6, s[0:1], 0xd
; GCN-IR-NEXT: s_load_dword s5, s[0:1], 0xe
; GCN-IR-NEXT: s_mov_b32 s4, 0xffff
; GCN-IR-NEXT: s_mov_b32 s6, 0xff000000
; GCN-IR-NEXT: s_mov_b32 s7, 0xff000000
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_and_b32 s3, s3, s4
; GCN-IR-NEXT: s_and_b32 s2, s2, s6
; GCN-IR-NEXT: s_and_b32 s2, s2, s7
; GCN-IR-NEXT: s_and_b32 s5, s5, s4
; GCN-IR-NEXT: s_and_b32 s4, s7, s6
; GCN-IR-NEXT: s_and_b32 s4, s6, s7
; GCN-IR-NEXT: s_lshr_b64 s[6:7], s[2:3], 24
; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[4:5], 24
; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9

View File

@ -108,12 +108,12 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@ -128,11 +128,11 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
@ -415,12 +415,11 @@ define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v4
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
@ -809,14 +808,14 @@ define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: v_mov_b32_e32 v5, s3
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: s_endpgm
%ld8 = load <8 x i32>, <8 x i32> addrspace(4)* %in, align 16

View File

@ -272,17 +272,17 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(i16 addrspace(4)
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v0, 0x3e7, v0
; VI-NEXT: v_or_b32_e32 v2, 4, v0
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v2
; VI-NEXT: v_or_b32_e32 v2, 4, v2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -155,12 +155,12 @@ define amdgpu_ps float @xor3_uniform_vgpr(float inreg %a, float inreg %b, float
;
; GFX10-LABEL: xor3_uniform_vgpr:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_add_f32_e64 v0, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v1, s3, 2.0
; GFX10-NEXT: v_add_f32_e64 v2, s2, 1.0
; GFX10-NEXT: v_add_f32_e64 v0, 0x40400000, s4
; GFX10-NEXT: v_add_f32_e64 v2, 0x40400000, s4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
%a1 = fadd float %a, 1.0
%b2 = fadd float %b, 2.0